def main(args, outs):
    """Merge this chunk's FASTQs by barcode into per-read output files.

    Output names carry ``h5_constants.LZ4_SUFFIX`` and are opened through
    ``cr_io.open_maybe_gzip``. Read 2 is only produced when the chemistry is
    paired-end.

    Args:
        args: Stage arguments; reads ``chemistry_def`` and ``fastqs``.
        outs: Stage outputs; sets ``read1s`` and ``read2s`` (``None`` for
            single-end chemistries) and writes to ``chunk_barcodes``.
    """
    outs.coerce_strings()

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    outs.read1s = martian.make_path('reads_1.fastq' + h5_constants.LZ4_SUFFIX)
    if paired_end:
        outs.read2s = martian.make_path('reads_2.fastq' + h5_constants.LZ4_SUFFIX)
    else:
        outs.read2s = None

    # Register each handle as soon as it is opened so that a failure part-way
    # through (a later open, or the merge itself) still closes what was opened.
    opened = []
    try:
        r1_fq_out = cr_io.open_maybe_gzip(outs.read1s, 'w')
        opened.append(r1_fq_out)

        r2_fq_out = None
        if paired_end:
            r2_fq_out = cr_io.open_maybe_gzip(outs.read2s, 'w')
            opened.append(r2_fq_out)

        barcodes_out = cr_io.open_maybe_gzip(outs.chunk_barcodes, 'w')
        opened.append(barcodes_out)

        merge_by_barcode(args.fastqs, r1_fq_out, r2_fq_out, barcodes_out,
                         paired_end)
    finally:
        # Same close order as before: read 1, read 2, barcodes.
        for handle in opened:
            handle.close()
def main(args, outs):
    """Split one range of the input BAM into per-barcode-prefix BAMs.

    Each read in (args.chunk_start, args.chunk_end) is written to the BAM for
    the first ``args.nbases`` bases of its barcode, or to a separate BAM when
    it has no barcode. Every output BAM is then passed to ``sort_bam``.

    Args:
        args: Reads ``input``, ``nbases``, ``chunk_start``, ``chunk_end`` and
            ``chunk_index``.
        outs: Sets ``non_bc_bams`` (single-element list) and ``buckets``
            (prefix -> BAM path).
    """
    prefixes = get_seqs(args.nbases)

    source_bam = tk_bam.create_bam_infile(args.input)
    # Only the first chunk keeps header comments in its template.
    header_template = BamTemplateShim(
        source_bam, keep_comments=(args.chunk_index == 0))

    prefix_bams = {}
    for prefix in prefixes:
        prefix_path = martian.make_path("bc_{}.bam".format(prefix))
        prefix_bams[prefix], _ = tk_bam.create_bam_outfile(
            prefix_path, None, None, template=header_template)

    unbarcoded_path = martian.make_path("bc_{}.bam".format(None))
    unbarcoded_out, _ = tk_bam.create_bam_outfile(
        unbarcoded_path, None, None, template=header_template)

    chunk_range = (args.chunk_start, args.chunk_end)
    for read in tk_bam.read_bam_chunk(source_bam, chunk_range):
        barcode = crdna_io.get_read_barcode(read)
        if barcode is None:
            sink = unbarcoded_out
        else:
            sink = prefix_bams[barcode[:args.nbases]]
        sink.write(read)

    source_bam.close()

    unbarcoded_out.close()
    sort_bam(unbarcoded_path)
    outs.non_bc_bams = [unbarcoded_path]

    outs.buckets = {}
    for prefix in prefixes:
        bucket_bam = prefix_bams[prefix]
        bucket_path = bucket_bam.filename
        bucket_bam.close()
        sort_bam(bucket_path)
        outs.buckets[prefix] = bucket_path
def main(args, outs):
    """Run GATK HaplotypeCaller on a subset BAM and rename the sample column.

    The raw VCF is rewritten so that the last column of the ``#CHROM`` header
    line is ``args.node_id``; the result is compressed with ``bgzip`` and
    indexed with ``tabix``.

    Args:
        args: Reads ``reference_path``, ``gatk_path``, ``targets_file``,
            ``subset_bam`` and ``node_id``.
        outs: Sets ``subset_variants`` to the path of the ``.vcf.gz`` file.

    Raises:
        subprocess.CalledProcessError: If java, bgzip or tabix exits non-zero.
    """
    raw_vcf = martian.make_path('tmp.vcf')
    ref = os.path.join(args.reference_path, 'fasta', 'genome.fa')
    cmd = [
        'java', '-jar', args.gatk_path, 'HaplotypeCaller',
        '-R', ref,
        '-L', args.targets_file,
        '-I', args.subset_bam,
        '-O', raw_vcf,
        '--native-pair-hmm-threads', '1',
    ]
    subprocess.check_call(cmd)

    # Fix the sample name: it becomes the node ID in the original tree.
    renamed_vcf = martian.make_path('{}.vcf'.format(args.node_id))
    with open(raw_vcf) as inf, open(renamed_vcf, 'w') as outf:
        for line in inf:
            if line.startswith('#CHROM'):
                # The VCF header line is tab-delimited; splitting on tabs
                # (not arbitrary whitespace) keeps names with spaces intact.
                columns = line.rstrip('\r\n').split('\t')
                columns[-1] = args.node_id
                line = '\t'.join(columns) + '\n'
            outf.write(line)

    subprocess.check_call(['bgzip', renamed_vcf])
    subprocess.check_call(['tabix', '-p', 'vcf', renamed_vcf + '.gz'])
    outs.subset_variants = renamed_vcf + '.gz'
def main(args, outs):
    """Concatenate a cluster's BAMs and sort the result with sambamba.

    Args:
        args: Reads ``cluster_id``, ``cluster_bams`` and ``__threads``.
        outs: Sets ``merged_bam`` to the sorted BAM path.

    Raises:
        subprocess.CalledProcessError: If ``sambamba sort`` exits non-zero.
    """
    args.coerce_strings()

    # Intermediate, unsorted concatenation; deleted once sorting succeeds.
    unsorted_path = martian.make_path(str(args.cluster_id) + '.unsorted.bam')
    tk_bam.concatenate(unsorted_path, args.cluster_bams)

    outs.merged_bam = martian.make_path('{}.bam'.format(args.cluster_id))

    sort_cmd = ['sambamba', 'sort']
    sort_cmd.extend(['-t', str(args.__threads)])
    sort_cmd.extend(['-o', outs.merged_bam])
    sort_cmd.append(unsorted_path)
    subprocess.check_call(sort_cmd)

    os.remove(unsorted_path)
def get_dummy_chunk():
    """Create two empty FASTQ files and return a one-chunk definition for them.

    Returns:
        dict: ``{'chunks': [chunk]}`` where the chunk references the two empty
        files and has ``barcodes_chunk`` set to ``None``.
    """
    placeholder_paths = [
        martian.make_path(name)
        for name in ('chunk0_1.fastq', 'chunk0_2.fastq')
    ]
    # Opening for write and closing immediately leaves a zero-length file.
    for path in placeholder_paths:
        open(path, 'w').close()

    read1_path, read2_path = placeholder_paths
    placeholder_chunk = {
        'read1_chunk': read1_path,
        'read2_chunk': read2_path,
        'barcodes_chunk': None,
    }
    return {'chunks': [placeholder_chunk]}
def main(args, outs):
    """Merge this chunk's FASTQs by barcode into plain-text read files.

    Args:
        args: Reads ``fastqs``.
        outs: Sets ``read1s`` and ``read2s``; writes to ``chunk_barcodes``.
    """
    outs.coerce_strings()

    # Note: This naming scheme is required by FILTER_VDJ_READS / vdj_asm
    outs.read1s = martian.make_path('reads_1.fastq')
    outs.read2s = martian.make_path('reads_2.fastq')

    with open(outs.read1s, 'w') as read1_sink:
        with open(outs.read2s, 'w') as read2_sink:
            with open(outs.chunk_barcodes, 'w') as barcode_sink:
                merge_by_barcode(args.fastqs, read1_sink, read2_sink,
                                 barcode_sink)
def main(args, outs):
    """Run the assembler on one BAM chunk and move its outputs into place.

    ``run_assembly`` is pointed at the stage directory; its output files are
    expected under the input BAM's basename (extension removed) followed by a
    fixed suffix.

    Args:
        args: Reads ``chunked_bam``; also passed whole to ``run_assembly``.
        outs: Supplies the destination path for each produced file.
    """
    run_assembly(args.chunked_bam, martian.make_path(''), args)

    bam_stem, _ = os.path.splitext(os.path.basename(args.chunked_bam))
    out_pref = martian.make_path(bam_stem)

    # (suffix written by the assembler, destination declared by the stage)
    produced = (
        ('.fasta', outs.contig_fasta),
        ('.fastq', outs.contig_fastq),
        ('_summary.tsv', outs.summary_tsv),
        ('_umi_summary.tsv', outs.umi_summary_tsv),
        ('_sorted.bam', outs.contig_bam),
        ('_sorted.bam.bai', outs.contig_bam_bai),
        ('_metrics_summary.json', outs.metrics_summary_json),
    )
    for suffix, destination in produced:
        cr_io.move(out_pref + suffix, destination)
def prepare_transcriptome_indexes(reference_path, vdj_reference_path):
    """Build the k-mer indexes consumed by ``detect_chemistry``.

    Steps:
      1. Make sure an indexed genome FASTA is available: reuse the reference's
         own FASTA when a ``.fai`` sits next to it, otherwise symlink it into
         the stage directory and run ``samtools faidx`` on the link.
      2. Run ``detect_chemistry get-transcripts`` to write a transcriptome
         FASTA from the genome FASTA and the genes GTF.
      3. Run ``detect_chemistry index-transcripts`` on that FASTA.
      4. If a VDJ reference is given, index its FASTA the same way.

    Args:
        reference_path: Path to the transcriptome reference package.
        vdj_reference_path: Path to the VDJ reference, or ``None`` to skip the
            VDJ index.

    Returns:
        tuple: ``(kmer_idx_path, vdj_idx_path)``; ``vdj_idx_path`` is ``None``
        when ``vdj_reference_path`` is ``None``.
    """
    ## Index the reference fasta
    fa_path = os.path.join(reference_path, cr_constants.REFERENCE_FASTA_PATH)

    if os.path.exists(fa_path + '.fai'):
        # Look for existing .fai file (won't exist for our standard ref packages)
        martian.update_progress('Found genome FASTA index....')
        genome_fa_path = fa_path
    else:
        # Note: this will fail if user's fs doesn't support symlinks
        genome_fa_path = martian.make_path('ref.fa')
        martian.update_progress('Symlinking genome FASTA...')
        os.symlink(fa_path, genome_fa_path)

        martian.update_progress('Indexing genome...')
        run(['samtools', 'faidx', genome_fa_path])

    ## Generate a transcriptome reference from a genome ref
    martian.update_progress('Building transcriptome...')
    gtf_path = os.path.join(reference_path,
                            cr_constants.REFERENCE_GENES_GTF_PATH)
    out_fa_path = martian.make_path('transcriptome.fa')

    # Only index the 1st encountered transcript per gene
    run([
        'detect_chemistry', 'get-transcripts', genome_fa_path, gtf_path,
        out_fa_path
    ])

    ## Build kmer index
    martian.update_progress('Building kmer index...')
    kmer_idx_path = martian.make_path('kmers.idx')
    run(['detect_chemistry', 'index-transcripts', out_fa_path, kmer_idx_path])

    # Build VDJ kmer index (optional)
    vdj_idx_path = None
    if vdj_reference_path is not None:
        vdj_fa_path = vdj_ref.get_vdj_reference_fasta(vdj_reference_path)
        vdj_idx_path = martian.make_path('vdj_kmers.idx')
        run([
            'detect_chemistry', 'index-transcripts', vdj_fa_path, vdj_idx_path
        ])

    return (kmer_idx_path, vdj_idx_path)
def split(args):
    """Define chunks from the per-chunk read files.

    With a barcode whitelist, each (read1, read2, barcodes JSON) triple
    becomes one chunk carrying the loaded barcode list. Without one, all
    reads are copied into a single pair of FASTQs forming one chunk.

    Args:
        args: Reads ``read1s``, ``read2s``, ``chunk_barcodes`` and
            ``chemistry_def``.

    Returns:
        dict: ``{'chunks': [...]}``; falls back to ``get_dummy_chunk()`` when
        no chunk was produced.
    """
    assert args.read1s is not None and args.read2s is not None

    chunks = []

    if cr_chem.get_barcode_whitelist(args.chemistry_def) is None:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk, otherwise
        # metric computation will break.
        merged_r1_path = martian.make_path('chunk0_1.fastq')
        merged_r2_path = martian.make_path('chunk0_2.fastq')
        with open(merged_r1_path, 'w') as r1_sink, \
                open(merged_r2_path, 'w') as r2_sink:
            for r1_path, r2_path in zip(args.read1s, args.read2s):
                with open(r1_path) as r1_src, open(r2_path) as r2_src:
                    r1_records = tk_fasta.read_generator_fastq(
                        r1_src, paired_end=False)
                    r2_records = tk_fasta.read_generator_fastq(
                        r2_src, paired_end=False)
                    # Read 1 drives the loop; read 2 is pulled in lockstep.
                    for r1_record in r1_records:
                        r2_record = next(r2_records)
                        tk_fasta.write_read_fastq(r1_sink, *r1_record)
                        tk_fasta.write_read_fastq(r2_sink, *r2_record)
        chunks.append({
            'read1_chunk': merged_r1_path,
            'read2_chunk': merged_r2_path,
            'barcodes_chunk': [""],
        })
    else:
        # Data are barcoded
        triples = zip(args.read1s, args.read2s, args.chunk_barcodes)
        for read1_fq, read2_fq, barcodes_json in triples:
            with open(barcodes_json) as handle:
                loaded_barcodes = json.load(handle)
            chunks.append({
                'read1_chunk': read1_fq,
                'read2_chunk': read2_fq,
                'barcodes_chunk': loaded_barcodes,
                '__mem_gb': 3.0,
            })

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    if not chunks:
        return get_dummy_chunk()

    return {'chunks': chunks}
def main(args, outs):
    """Create files, some of which are returned in a structure.

    ``outs.bar`` receives ``args.foo + 3`` plus the paths of ``file1`` and
    ``file2``. The three files (``file1``, ``file2`` and ``outs.file3``) are
    written with ``args.foo``, ``args.foo + 1`` and ``args.foo + 2``.
    """
    bar_value = args.foo + 3
    outs.bar = {
        'bar': bar_value,
        'file1': martian.make_path('file1'),
        'file2': martian.make_path('file2'),
    }

    contents = (
        (outs.bar['file1'], args.foo),
        (outs.bar['file2'], args.foo + 1),
        (outs.file3, args.foo + 2),
    )
    for path, value in contents:
        with open(path, 'w') as handle:
            handle.write(str(value))
def main(args, outs):
    """Subsample one slice of the molecule info and save the matrices.

    Does nothing when ``args.subsample_info`` has no ``subsample_rate``.
    Otherwise builds subsampled raw matrices from the molecule counter slice,
    filters them to the barcodes in ``args.filtered_barcodes``, records the
    subsampled duplication metrics and writes both matrix sets to HDF5.

    Args:
        args: Reads ``subsample_info``, ``molecule_info``, ``chunk_start``,
            ``chunk_len`` and ``filtered_barcodes``.
        outs: Writes ``chunked_reporter``; sets ``subsampled_matrices`` to a
            dict with ``raw_matrices`` and ``filtered_matrices`` paths.
    """
    np.random.seed(0)

    info = args.subsample_info
    subsample_rate = info.get('subsample_rate')
    if subsample_rate is None:
        return

    mol_counter = MoleculeCounter.open(
        args.molecule_info,
        'r',
        start=int(args.chunk_start),
        length=int(args.chunk_len))

    # Subsample the matrices; the builder fills `subsample_result` in place.
    subsample_result = {}
    raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
        mol_counter,
        subsample_rate=subsample_rate,
        subsample_result=subsample_result)

    # Filter the subsampled matrices
    bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filt_mats = raw_mats.filter_barcodes(bcs_per_genome)

    # Calculations for subsampled duplication rate
    genome_names = [str(g) for g in mol_counter.get_ref_column('genome_ids')]
    reporter = cr_report.Reporter(
        genomes=genome_names,
        subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
        subsample_depths=info['all_target_rpc'])
    reporter.subsampled_duplication_frac_cb(
        raw_mats,
        mol_counter,
        info['subsample_rate'],
        info['subsample_type'],
        info['target_rpc'],
        subsample_result['mapped_reads'],
    )

    mol_counter.close()

    reporter.save(outs.chunked_reporter)

    raw_h5 = martian.make_path('raw_matrices.h5')
    filt_h5 = martian.make_path('filtered_matrices.h5')
    outs.subsampled_matrices = {
        'raw_matrices': raw_h5,
        'filtered_matrices': filt_h5,
    }

    raw_mats.save_h5(raw_h5)
    filt_mats.save_h5(filt_h5)
def split(args):
    """Compute base background in split and use it in each chunk.

    The pickled dict at ``args.globalGCdict`` is split into one pickle file
    per key, and one chunk is emitted per key pointing at its file. A single
    ``skip`` chunk is emitted instead when the reference has more than one
    species, there are no peaks, or the reference has no motifs.

    Args:
        args: Reads ``reference_path``, ``peaks`` and ``globalGCdict``.

    Returns:
        dict: ``{'chunks': [...]}``.
    """
    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0

    if (len(ref_mgr.list_species()) > 1 or npeaks == 0
            or ref_mgr.motifs is None):
        return {'chunks': [{'skip': True}]}

    # Pickle streams are bytes, so open the files in binary mode.
    with open(args.globalGCdict, 'rb') as f:
        GCdict = pickle.load(f)

    GCbins = sorted(GCdict.keys())
    GCdict_paths = {}
    for gc in GCbins:
        # Keys are indexed as gc[0], gc[1] to name the per-bin file.
        GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format(
            gc[0], gc[1]))
        with open(GCdict_paths[gc], 'wb') as dump:
            pickle.dump(GCdict[gc], dump)

    # Resource requests are identical for every chunk; compute them once.
    mem_in_gb = 8
    vmem_in_gb = mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1

    chunk_def = [{
        '__mem_gb': mem_in_gb,
        '__vmem_gb': vmem_in_gb,
        'skip': False,
        'GCdict': GCdict_paths[gc]
    } for gc in GCbins]
    return {'chunks': chunk_def}
def join(args, outs, chunk_defs, chunk_outs):
    """Assemble and run the ``crconverter`` command that writes the .cloupe file.

    Returns early with ``outs.output_for_cloupe = None`` when
    ``do_not_make_cloupe(args)`` is true. On a non-zero exit from
    ``crconverter`` the output is cleared and ``martian.throw`` is called with
    the captured output.

    Args:
        args: Stage arguments (sample, matrix, analysis, peaks, reference ...).
        outs: Stage outputs; ``output_for_cloupe`` is the target file.
        chunk_defs: Unused.
        chunk_outs: Unused.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    reference = ReferenceManager(args.reference_path)

    contig_info_fn = martian.make_path("contig_info.json")
    with open(contig_info_fn, 'w') as outfile:
        json.dump(get_contig_info(args.reference_path), outfile)

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = ["crconverter", args.sample_id, args.pipestance_type]
    call += ["--matrix", args.feature_barcode_matrix]
    call += ["--analysis", args.analysis]
    call += ["--output", outs.output_for_cloupe]
    call += ["--description", '"' + args.sample_desc + '"']
    call += ["--peaks", args.peaks]
    call += ["--fragmentsindex", args.fragments_index]
    call += ["--geneannotations", reference.genes]
    call += ["--contiginfo", contig_info_fn]

    # Flags that are only passed when their value is available.
    optional_flags = (
        ("--metrics", args.metrics_json),
        ("--aggregation", args.aggregation_csv),
        ("--gemgroups", gem_group_index_json),
    )
    for flag, value in optional_flags:
        if value is not None:
            call.extend([flag, value])

    transcript_gene_types = get_annotation_gene_types(args)
    if transcript_gene_types is not None:
        call.extend(["--geneannotationtypes", ",".join(transcript_gene_types)])

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [piece.encode('utf-8') for piece in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))

    try:
        results = tk_subproc.check_output(unicode_call)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
    else:
        martian.log_info("crconverter output: %s" % results)
def main(args, outs):
    """Run ``chunk_reads`` on one read chunk and emit one entry per result.

    ``args.read_chunk`` is dumped to ``chunk_args.json`` for the external
    tool. Each element of the ``read_chunks.json`` it produces becomes a copy
    of ``args.read_chunk`` with ``read_chunks`` set to that element.

    Args:
        args: Reads ``read_chunk`` and ``reads_per_file``.
        outs: Sets ``out_chunks`` to the list of per-result chunk dicts.
    """
    # Write read_chunk for consumption by Rust
    chunk_args_json = "chunk_args.json"
    with open(chunk_args_json, "w") as handle:
        json.dump(args.read_chunk, handle)

    output_path = martian.make_path("")
    prefix = "fastq_chunk"
    chunk_reads_args = [
        'chunk_reads',
        '--reads-per-fastq', str(args.reads_per_file),
        output_path,
        prefix,
        "--martian-args", chunk_args_json,
        '--compress', 'lz4',
    ]
    print("running chunk reads: [%s]" % str(chunk_reads_args))
    tk_subproc.check_call(chunk_reads_args)

    results_json = os.path.join(output_path, "read_chunks.json")
    with open(results_json) as handle:
        chunk_results = json.load(handle)

    # Write out a new chunk entry for each resulting chunk
    out_chunks = []
    for produced in chunk_results:
        print(args.read_chunk)
        entry = args.read_chunk.copy()
        print(entry)
        entry['read_chunks'] = produced
        out_chunks.append(entry)
    outs.out_chunks = out_chunks
def main_demultiplex_go(args, outs):
    """Write the godemux input JSON for this chunk and run ``godemux``.

    Input FASTQs are grouped by ``(s, lane, group)`` of their parsed names;
    for each group the first file matching each requested read type is
    recorded.

    Args:
        args: Reads ``common_bcs``, ``input_files``, ``read_types``,
            ``split_by_tile``, ``tile_folder``, ``si_read_type``,
            ``chunk_number`` and ``rc_i2_read``.
        outs: Reads ``demultiplexed_fastq_path`` and ``demultiplex_summary``.
    """
    fastq_infos = [IlmnFastqFile(path) for path in args.input_files]
    grouped = groupby(lambda info: (info.s, info.lane, info.group),
                      fastq_infos)

    data = {
        'common_sample_indices': args.common_bcs,
        'file_groups': [],
    }
    for (_, lane, _), members in grouped.items():
        files = {}
        for read_type in args.read_types:
            matching = [info for info in members if info.read == read_type]
            files[read_type] = matching[0].filename
        data['file_groups'].append({
            'lane': lane,
            'files': files,
        })

    input_json_path = martian.make_path('godemux_input.json')
    with open(input_json_path, 'w') as handle:
        json.dump(data, handle)

    output_dir = outs.demultiplexed_fastq_path
    if args.split_by_tile:
        output_dir = os.path.join(output_dir, args.tile_folder)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    subproc_args = [
        'godemux', input_json_path, output_dir, outs.demultiplex_summary,
        '--demult-read', args.si_read_type,
        '--chunk', str(args.chunk_number),
    ]
    if args.rc_i2_read:
        subproc_args.append('--rci2read')

    martian.check_call(subproc_args)
def main(args, outs):
    """Collect ``find_recs`` results for a subset of transcripts and pickle them.

    Args:
        args: Reads ``possorted_bam``, ``cell_barcodes`` (one barcode per
            line) and ``tx_subset``.
        outs: Sets ``pickle`` to the path of the pickled list of
            ``[transcript name, find_recs result]`` pairs.
    """
    in_bam = pysam.Samfile(args.possorted_bam)
    try:
        # Close the barcode file deterministically instead of relying on GC.
        with open(args.cell_barcodes) as bc_file:
            bcs = {line.rstrip() for line in bc_file}

        txs = [GenePredTranscript(x) for x in args.tx_subset]
        results = [[tx.name, find_recs(tx, in_bam, bcs)] for tx in txs]

        outs.pickle = martian.make_path('subset_results.pickle')
        # Pickle streams are bytes, so write in binary mode.
        with open(outs.pickle, 'wb') as outf:
            pickle.dump(results, outf)
    finally:
        # The BAM stays open until the results are on disk, then is released
        # even if something above raised.
        in_bam.close()
def join(args, outs, chunk_defs, chunk_outs):
    """Reduce the gene-expression matrix to principal components and pickle it.

    Features are restricted to those with a non-zero sum in every batch,
    barcodes with zero counts are dropped, the matrix is L2-normalised with
    ``cr_matrix.inplace_csc_column_normalize_l2`` and then reduced with
    ``fbpca_reduce_dimension``. Does nothing when ``args.skip`` is set.

    Args:
        args: Reads ``skip``, ``library_info``, ``matrix_h5`` and ``num_pcs``.
        outs: Sets ``dimred_matrix`` and ``matrix_barcode_feature_info`` to
            the paths of the two pickles written.
        chunk_defs: Unused.
        chunk_outs: Unused.
    """
    if args.skip:
        return

    gg_id_to_batch_id = {
        lib['gem_group']: lib['batch_id'] for lib in args.library_info
    }
    batch_id_to_name = {
        lib['batch_id']: lib['batch_name'] for lib in args.library_info
    }

    full_matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)
    matrix = full_matrix.select_features_by_type(GENE_EXPRESSION_LIBRARY_TYPE)

    # Batch of each barcode, looked up through its gem group.
    batch_ids = np.array([
        gg_id_to_batch_id[cr_util.split_barcode_seq(bc)[1]]
        for bc in matrix.bcs
    ])

    # select intersect of non-zero feature in each batch
    feature_mask = np.ones(matrix.features_dim)
    for b_id in batch_id_to_name:
        bc_in_batch = np.where(batch_ids == b_id)[0]
        batch_view = cr_matrix.CountMatrixView(matrix, bc_indices=bc_in_batch)
        feature_mask = np.logical_and(feature_mask, batch_view.sum(axis=1))
    kept_features = np.flatnonzero(feature_mask)
    matrix = matrix.select_features(kept_features)

    # filter barcodes with zero count
    kept_barcodes = np.flatnonzero(matrix.get_counts_per_bc())
    matrix = matrix.select_barcodes(kept_barcodes)

    # l2 norm
    matrix.m = matrix.m.astype('float64')
    cr_matrix.inplace_csc_column_normalize_l2(matrix.m)

    if args.num_pcs is not None:
        n_pcs = args.num_pcs
    else:
        n_pcs = analysis_constants.CBC_N_COMPONENTS_DEFAULT
    dimred_matrix = fbpca_reduce_dimension(matrix, n_pcs)

    outs.dimred_matrix = martian.make_path('dimred_matrix.pickle')
    with open(outs.dimred_matrix, 'wb') as fp:
        cPickle.dump(dimred_matrix, fp, cPickle.HIGHEST_PROTOCOL)

    bc_feature_info = {
        'barcodes' : matrix.bcs,
        'features' : matrix.feature_ref.feature_defs,
    }
    outs.matrix_barcode_feature_info = martian.make_path(
        'matrix_barcode_feature_info.pickle')
    with open(outs.matrix_barcode_feature_info, 'wb') as fp:
        cPickle.dump(bc_feature_info, fp, cPickle.HIGHEST_PROTOCOL)
def split(args):
    """Define one chunk per genome input, sharing two JSON side files.

    BAM comments and sanitized library info are written once and referenced
    by every chunk. ``skip_translate`` for a chunk is looked up in
    ``args.skip_translate[str(gem_group)][library_type]`` and defaults to
    ``True`` when either key is missing.

    Args:
        args: Reads ``bam_comments``, ``library_info``, ``genome_inputs``,
            ``tags``, ``gem_groups``, ``library_types``, ``library_ids`` and
            ``skip_translate``.

    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': 12}}``.
    """
    # Write BAM comments to json file
    bam_comment_fn = martian.make_path('bam_comments.json')
    with open(bam_comment_fn, 'w') as handle:
        json.dump(args.bam_comments, handle)

    # Write library info to a file
    libraries_fn = martian.make_path('libraries.json')
    with open(libraries_fn, 'w') as handle:
        json.dump(tk_safe_json.json_sanitize(args.library_info),
                  handle,
                  indent=4,
                  sort_keys=True)

    per_chunk_inputs = itertools.izip_longest(
        args.genome_inputs, args.tags, args.gem_groups, args.library_types,
        args.library_ids)

    chunks = []
    for genome_input, tags, gem_group, library_type, library_id in per_chunk_inputs:
        gem_group_str = str(gem_group)

        this_skip_translate = True
        if gem_group_str in args.skip_translate:
            by_library_type = args.skip_translate[gem_group_str]
            if library_type in by_library_type:
                this_skip_translate = by_library_type[library_type]

        chunks.append({
            'chunk_genome_input': genome_input,
            'chunk_tags': tags,
            'gem_group': gem_group,
            'library_type': library_type,
            'library_id': library_id,
            'library_info_json': libraries_fn,
            'bam_comments_json': bam_comment_fn,
            'skip_translate': this_skip_translate,
            '__mem_gb': 4,
        })

    return {'chunks': chunks, 'join': {'__mem_gb': 12}}
def join(args, outs, chunk_defs, chunk_outs):
    """Combine the per-chunk pickled results into a single CSV.

    Each chunk's ``subset_results`` pickle is loaded, the combined list is
    flattened twice, parsed with ``parse_results`` and written as a CSV with
    columns ``tx_id``, ``tx_position`` and ``read_molecule_fraction``.

    Args:
        args: Reads ``transcripts`` and ``kit_type``.
        outs: Sets ``results`` to the CSV path.
        chunk_defs: Unused.
        chunk_outs: Chunk outputs; each supplies ``subset_results``.
    """
    outs.coerce_strings()

    # Load one pickle at a time so each handle is closed promptly; pickle
    # streams are bytes, hence binary mode.
    per_chunk = []
    for chunk in chunk_outs:
        with open(chunk.subset_results, 'rb') as handle:
            per_chunk.append(pickle.load(handle))

    results = flatten_list_of_lists(flatten_list_of_lists(per_chunk))

    tx_dict = get_gene_pred_dict(args.transcripts)
    results = parse_results(results, tx_dict, args.kit_type)

    outs.results = martian.make_path('results.csv')
    df = pd.DataFrame(
        results, columns=['tx_id', 'tx_position', 'read_molecule_fraction'])
    df.to_csv(outs.results)
def split(args):
    """Save the SNPs once and define one chunk per variants file.

    Args:
        args: Reads ``variants`` and ``min_snp_call_qual`` (falls back to
            ``snp_constants.DEFAULT_MIN_SNP_CALL_QUAL`` when ``None``).

    Returns:
        dict: ``{'chunks': [...]}``; every chunk references the same
        ``snps.json``.
    """
    snps_json = martian.make_path('snps.json')

    qual_cutoff = args.min_snp_call_qual
    if qual_cutoff is None:
        qual_cutoff = snp_constants.DEFAULT_MIN_SNP_CALL_QUAL

    save_snps(snps_json, args.variants, qual_cutoff)

    chunks = []
    for chunk_variants in args.variants:
        chunks.append({
            'chunk_variants': chunk_variants,
            'snps': snps_json,
        })
    return {'chunks': chunks}
def main(args, outs):
    """Run GATK ``SplitNCigarReads`` on the input BAM, limited to one locus.

    The locus is written to a one-line BED file that is passed via ``-L``.
    The result goes to ``output.bam`` in the stage directory.

    Args:
        args: Reads ``reference_path``, ``locus`` (a ``(chrom, start, stop)``
            triple) and ``input``.
        outs: Unused.

    Raises:
        subprocess.CalledProcessError: If ``gatk-launch`` exits non-zero.
    """
    genome_fasta_path = cr_utils.get_reference_genome_fasta(
        args.reference_path)

    chrom, start, stop = args.locus
    bed_path = martian.make_path('region.bed')
    bed_line = "\t".join([chrom, str(start), str(stop)]) + "\n"
    with open(bed_path, 'w') as bed_file:
        bed_file.write(bed_line)

    # Correct the STAR mapping from 255 to 60 and take care of split reads
    output_bam = martian.make_path('output.bam')
    gatk_options = (
        ('-R', genome_fasta_path),
        ('-I', args.input),
        ('-L', bed_path),
        ('-O', output_bam),
        ('--skip-mapping-quality-transform', 'false'),
        ('--create-output-bam-index', 'false'),
        ('--TMP_DIR', os.getcwd()),
    )
    gatk_cmd = ['gatk-launch', 'SplitNCigarReads']
    for flag, value in gatk_options:
        gatk_cmd.extend((flag, value))

    subprocess.check_call(gatk_cmd)
def main(args, outs):
    """Run freebayes on the input BAM for one locus, capturing stdout.

    The locus is parsed with ``tk_io.get_locus_info`` and written to a
    one-line BED file passed via ``-t``.

    Args:
        args: Reads ``reference_path``, ``locus`` and ``input``.
        outs: ``output`` is the file that receives freebayes' stdout.

    Raises:
        subprocess.CalledProcessError: If freebayes exits non-zero.
    """
    genome_fasta_path = cr_utils.get_reference_genome_fasta(args.reference_path)

    chrom, start, stop = tk_io.get_locus_info(args.locus)
    bed_path = martian.make_path('region.bed')
    bed_line = "\t".join([chrom, str(start), str(stop)]) + "\n"
    with open(bed_path, 'w') as bed_file:
        bed_file.write(bed_line)

    freebayes_args = ['freebayes']
    freebayes_args += ['-f', genome_fasta_path]
    freebayes_args += ['-b', args.input]
    freebayes_args += ['-0']
    freebayes_args += ['-t', bed_path]

    with open(outs.output, 'w') as vcf_out:
        subprocess.check_call(freebayes_args, stdout=vcf_out)
def main(args, outs):
    """Distribute barcoded read pairs into bucket FASTQs, sorted per bucket.

    Both read files are loaded fully into memory. Pairs whose read 1 has no
    barcode are dropped. Each bucket is sorted with
    ``vdj_utils.fastq_barcode_sort_key`` before being written.

    Args:
        args: Reads ``chunks_per_gem_group`` (keys coerced back to int in
            place), ``read1s_chunk`` and ``read2s_chunk``.
        outs: Sets ``buckets`` to a bucket name -> FASTQ path mapping.
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(gem_group): n_chunks
        for gem_group, n_chunks in args.chunks_per_gem_group.iteritems()
    }

    with open(args.read1s_chunk) as f1:
        read1s = list(tk_fasta.read_generator_fastq(f1))

    with open(args.read2s_chunk) as f2:
        read2s = list(tk_fasta.read_generator_fastq(f2))

    assert len(read1s) == len(read2s)

    # One open output file and one in-memory list per bucket.
    fastqs_out, buckets = {}, {}
    outs.buckets = {}
    for _, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        bucket_path = martian.make_path("%s.fastq" % bucket_name)
        fastqs_out[bucket_name] = open(bucket_path, 'w')
        outs.buckets[bucket_name] = bucket_path
        buckets[bucket_name] = []

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue

        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        target = get_bucket_name(gem_group, barcode_seq,
                                 args.chunks_per_gem_group[gem_group])
        buckets[target].extend((read1, read2))

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        sink = fastqs_out[bucket_name]
        for read in sorted(bucket, key=vdj_utils.fastq_barcode_sort_key):
            tk_fasta.write_read_fastq(sink, *read)
        sink.close()
def main(args, outs):
    """For each slice produce a fasta file sampling reads from that slice.

    We split our section of the genome into a bunch of windows of
    ``args.window_size`` bases. For each window we sample an identical number
    of reads via ``perbin``.

    Args:
        args: Reads ``basic_stats``, ``insert_sizes``, ``target_coverage``,
            ``window_size``, ``reference_path`` and ``loci``.
        outs: Sets ``tmp`` (path of the sampled file) and ``samples_per_bin``.
    """
    # Grab basic stats for the read lengths and quality scores
    with open(args.basic_stats) as stats_fp:
        stats = json.load(stats_fp)

    # Fix the random seed
    np.random.seed(0)

    # Info is a map we use everywhere to track the sampling parameters.
    # r1_len: the length of read1
    # r2_len: the length of read2
    # insert_size_map: a map of insert-size (as a string) to frequency
    # q_score_map a map of quality score (as a string) to frequency
    info = {'r1_len': stats['r1_len'], 'r2_len': stats['r2_len']}
    info['q_score_map'] = {
        '30': stats['bc_q30_bases'],
        '20': stats['bc_q20_bases'] - stats['bc_q30_bases'],
        '0': stats['bc_tot_bases'] - stats['bc_q20_bases']
    }

    with open(args.insert_sizes) as stats_is_fp:
        info['insert_size_map'] = json.load(stats_is_fp)['60']

    # How many samples will we make from each window?
    samples = int(
        round(2.0 * args.target_coverage *
              (float(args.window_size) /
               (stats['r1_len'] + stats['r2_len']))))

    martian.log_info("Using %i samples per %i bin" %
                     (samples, args.window_size))

    output_path = martian.make_path("chnk.fasta")
    ref = reference.open_reference(args.reference_path)

    # The context manager closes the output even if sampling raises.
    with open(output_path, "w") as output:
        # Loop over every window in every loci.
        for (chrom, start, end) in (tk_io.get_locus_info(l)
                                    for l in args.loci):
            cur = start
            while cur < end:
                # Sample |samples| reads from chrom:cur-chrom:cur+window_size
                # and put the results in the output file
                perbin(chrom, cur, ref, output, info, args.window_size,
                       samples)
                cur += args.window_size

    outs.tmp = output_path
    outs.samples_per_bin = samples
def join(args, outs, chunk_defs, chunk_outs):
    """Merge chunk VCFs per cluster, then merge the clusters into one VCF.

    Chunk outputs are grouped by their chunk's ``cluster_id`` and combined
    with ``tk_io.combine_vcfs``; the per-cluster results are merged with
    ``vcf-merge`` and the combined file is sorted and indexed.

    Args:
        args: Unused.
        outs: Reads ``variants``; the sorted VCF is written to that path with
            every ``'.gz'`` removed.
        chunk_defs: Chunk definitions; each supplies ``cluster_id``.
        chunk_outs: Chunk outputs; each supplies ``variant_subset``.

    Raises:
        subprocess.CalledProcessError: If ``vcf-merge`` exits non-zero.
    """
    # mapping of cluster ID -> VCFs
    vcfs_by_cluster = collections.defaultdict(list)
    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        vcfs_by_cluster[chunk_def.cluster_id].append(chunk_out.variant_subset)

    # merge each VCF subset for a cluster
    cluster_vcfs = []
    for cluster_id, subset_vcfs in vcfs_by_cluster.iteritems():
        cluster_vcf = martian.make_path('{}.vcf'.format(cluster_id))
        tk_io.combine_vcfs(cluster_vcf, subset_vcfs)
        # The merge step below is given the '.gz' sibling of each file.
        cluster_vcfs.append(cluster_vcf + '.gz')

    # final merge to make one combined VCF
    combined_tmp = martian.make_path('tmp.vcf')
    with open(combined_tmp, 'w') as outf:
        subprocess.check_call(['vcf-merge'] + cluster_vcfs, stdout=outf)

    # Sort and index the files
    final_vcf = outs.variants.replace('.gz', '')
    tk_tabix.sort_vcf(combined_tmp, final_vcf)
    tk_tabix.index_vcf(final_vcf)

    os.remove(combined_tmp)
def main(args, outs):
    """Bucket a BAM chunk by barcode prefix and gem group, sorted per bucket.

    Bucket names are ``'<prefix>-<gem_group>'`` for every prefix of length
    ``args.nbases`` and every gem group found in the BAM's library info, plus
    ``''`` for reads with no barcode. All reads are loaded into memory, and
    each bucket is sorted with ``cr_utils.barcode_sort_key`` before writing.

    Args:
        args: Reads ``chunk_input``, ``nbases`` and ``read_groups``.
        outs: Sets ``buckets`` to a bucket name -> BAM path mapping.
    """
    bam_in = tk_bam.create_bam_infile(args.chunk_input)

    # Get gem groups
    library_info = rna_library.get_bam_library_info(bam_in)
    gem_groups = sorted(set(lib['gem_group'] for lib in library_info))

    # Define buckets
    prefixes = cr_utils.get_seqs(args.nbases)
    bucket_names = [
        '%s-%d' % (prefix, gg) for gg in gem_groups for prefix in prefixes
    ]
    bucket_names.append('')

    # Read all records
    reads = [read for read in bam_in]

    # Bucket the records
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for name in bucket_names:
        bucket_path = martian.make_path("bc-%s.bam" % name)
        bucket_bam, _ = tk_bam.create_bam_outfile(
            bucket_path, None, None,
            template=bam_in,
            rgs=args.read_groups,
            replace_rg=True)
        bams_out[name] = bucket_bam
        outs.buckets[name] = bucket_path
        buckets[name] = []

    for record in reads:
        barcode = cr_utils.get_read_barcode(record)
        if barcode is not None:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            target = '%s-%d' % (barcode_seq[:args.nbases], gem_group)
        else:
            target = ''
        buckets[target].append(record)

    for name, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bucket_bam = bams_out[name]
        for record in bucket:
            bucket_bam.write(record)
        bucket_bam.close()
def split(args):
    """Choose per-chunk resources for variant calling and define the chunks.

    For the ``precalled`` and ``precalled_plus`` modes the pre-called VCF is
    copied into the stage directory and indexed. For every mode except
    ``precalled`` the variant caller must be ``freebayes`` or ``gatk`` (the
    latter requires an existing jar path).

    Args:
        args: Reads ``vc_precalled``, ``variant_mode``, ``reference_path``,
            ``input`` and ``restrict_locus``.

    Returns:
        dict: ``{'chunks': [...]}``.

    Raises:
        NotSupportedException: If the variant caller is not supported.
    """
    vc_mode, variant_caller, precalled_filename, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)

    precalled_file = None
    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        mem_gb = 8
        threads = 1
        precalled_file = martian.make_path("precalled_vcf.vcf")
        tenkit.log_subprocess.check_call(
            ['cp', precalled_filename, precalled_file])
        tk_tabix.index_vcf(precalled_file)
        precalled_file = precalled_file + ".gz"

    if vc_mode != "precalled":
        if variant_caller == 'freebayes':
            mem_gb = 5
            threads = 1
        elif variant_caller == "gatk":
            mem_gb = 8
            threads = 2
            # make sure the gatk jar file exists
            if gatk_path is None:
                martian.throw(
                    "variant_caller 'gatk' selected, must supply path to gatk jar file -- e.g. \"gatk:/path/to/GenomeAnalysisTK.jar\""
                )
            if not os.path.exists(gatk_path):
                martian.throw(
                    "variant_caller 'gatk' selected, gatk jar file does not exist: %s"
                    % gatk_path)
        else:
            # Report the rejected caller itself; the mode is not what failed.
            raise NotSupportedException(
                'Variant caller not supported: {}'.format(variant_caller))

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    bam_chunk_size_gb = 3.0

    if args.restrict_locus is None:
        loci = tk_chunks.get_sized_bam_chunks(
            args.input,
            bam_chunk_size_gb,
            contig_whitelist=primary_contigs,
            extra_args={
                '__mem_gb': mem_gb,
                '__threads': threads,
                'split_input': precalled_file
            })
    else:
        # NOTE(review): unlike the branch above, this chunk carries no
        # '__mem_gb', '__threads' or 'split_input' -- confirm that is intended.
        loci = [{'locus': args.restrict_locus}]

    return {'chunks': loci}
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk differential test results into per-method outputs.

    For every method in ``args.factorization`` and every clustering key found
    in that method's clustering h5, the chunk results for that key are stacked
    (ordered by the chunk's ``cluster``), saved to a per-key h5 and to the
    method's csv directory. The per-key h5 files are then combined into one h5
    per method.

    When there is no filtered peak matrix, or the reference has more than one
    species, ``outs.enrichment_analysis`` is set to ``None``, the summary to
    ``{}`` and nothing else is done.

    Args:
        args: Reads ``reference_path``, ``filtered_peak_bc_matrix``,
            ``filtered_tf_bc_matrix``, ``factorization`` and
            ``clustering_summary``.
        outs: ``enrichment_analysis`` is the output directory;
            ``enrichment_analysis_summary`` is filled with the per-method
            ``h5`` and ``csv`` paths.
        chunk_defs: Chunk definitions; ``clustering_key`` and ``cluster`` are
            read from each.
        chunk_outs: Chunk outputs; ``tmp_diffexp`` is read from each.
    """
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    # The TF matrix is optional; its features are only loaded when it is given.
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix) if args.filtered_tf_bc_matrix is not None else None
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}

    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        # Per-clustering h5 files produced below, merged into _h5 at the end.
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv

        # (id, name) of every feature: peaks first, then TFs when present.
        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            # Chunks are matched on clustering_key only, then ordered by cluster.
            # NOTE(review): the variable name mentions "method" but no method
            # filter is applied here -- confirm keys are unique across methods.
            chunk_outs_def_method_clustering = sorted([[chunk_out, chunk_def] for chunk_out, chunk_def in zip(chunk_outs, chunk_defs) if chunk_def.clustering_key == key], key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # load 1 vs rest tests in sorted order of chunks and combine into one output per clustering
            # Only the first three columns of each chunk's CSV are kept.
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3] for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5, [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP, analysis_constants.ANALYSIS_H5_MAP_DE[method]])
def main(args, outs):
    """Copy the sample definition, converting a v2 molecule info file to v3.

    When ``args.mol_h5_version`` is 2 the molecule info h5 is converted into a
    new file named ``<stem>_v3_<ISO timestamp><ext>`` in the stage directory
    and the copied sample definition is pointed at it.

    Args:
        args: Reads ``sample_def`` and ``mol_h5_version``.
        outs: Sets ``updated_sample_def``.
    """
    outs.updated_sample_def = args.sample_def.copy()

    if args.mol_h5_version != 2:
        return

    v2_mole_info_h5 = args.sample_def[cr_constants.AGG_H5_FIELD]

    # name_parts is [stem, extension, timestamp]; the template below places
    # the timestamp between the stem and the extension.
    name_parts = list(os.path.splitext(os.path.basename(v2_mole_info_h5)))
    name_parts = name_parts + [datetime.datetime.now().isoformat()]
    v3_filename = '{x[0]}_v3_{x[2]}{x[1]}'.format(x=name_parts)

    out_v3_mole_info_h5 = martian.make_path(v3_filename)
    cr_mol_counter.MoleculeCounter.convert_v2_to_v3(v2_mole_info_h5,
                                                    out_v3_mole_info_h5)

    outs.updated_sample_def[cr_constants.AGG_H5_FIELD] = out_v3_mole_info_h5
def main(args, outs):
    """Find nearest neighbours of one batch's barcodes in every other batch.

    The rows of the dimension-reduced matrix belonging to ``args.batch_id``
    are compared with the rows of each other batch via ``find_knn``; the
    resulting index pairs are converted to global row indices and serialized.
    Does nothing when ``args.skip`` is set.

    Args:
        args: Reads ``skip``, ``dimred_matrix``, ``ordered_dimred_matrix``
            (preferred when not ``None``), ``cbc_knn``, ``batch_to_bc_indices``
            (per-batch ``[start, end]`` row ranges) and ``batch_id``.
        outs: Sets ``batch_nearest_neighbor`` to the serialized file path.
    """
    if args.skip:
        return

    dimred_matrix_file = args.dimred_matrix if args.ordered_dimred_matrix is None else args.ordered_dimred_matrix
    with open(dimred_matrix_file) as fp:
        dimred_matrix = cPickle.load(fp)

    cbc_knn = option(args.cbc_knn, analysis_constants.CBC_KNN)

    # Row range of this chunk's batch within dimred_matrix.
    batch_to_bc_indices = args.batch_to_bc_indices
    batch_start_idx = batch_to_bc_indices[args.batch_id][0]
    batch_end_idx = batch_to_bc_indices[args.batch_id][1]
    cur_matrix = dimred_matrix[batch_start_idx:batch_end_idx, :]

    # nearest neighbor pair: stores the nearest neighbors from match_i to match_j
    # key = (batch_i, batch_j), values = set((idx_i, idx_j), ...), the index here is the global index
    batch_nearest_neighbor = defaultdict(set)

    # Accumulated across all other batches; stay None if there are none.
    from_idx, to_idx = None, None

    # Batch balanced KNN
    for batch in xrange(len(args.batch_to_bc_indices)):
        if batch == args.batch_id:
            continue

        ref_matrix = dimred_matrix[
            batch_to_bc_indices[batch][0]:batch_to_bc_indices[batch][1], ]
        # presumably a flat array of cbc_knn neighbour indices per row of
        # cur_matrix, local to ref_matrix -- TODO confirm against find_knn
        nn_idx_right = find_knn(cur_matrix, ref_matrix, cbc_knn)

        # convert index (in cur_matrix and ref_matrix) to global index (in dimred_matrix)
        nn_idx_left = np.repeat(
            np.arange(cur_matrix.shape[0]) + batch_start_idx, cbc_knn)
        nn_idx_right += batch_to_bc_indices[batch][0]

        from_idx = nn_idx_left if from_idx is None else np.concatenate(
            [from_idx, nn_idx_left])
        to_idx = nn_idx_right if to_idx is None else np.concatenate(
            [to_idx, nn_idx_right])

    # NOTE(review): indentation was reconstructed from a collapsed source line.
    # As placed here, this loop runs once after the batch loop, so every pair
    # is stored under the key of the last `batch` value. Confirm against the
    # original file whether it belongs inside the loop above instead.
    for i, j in izip(from_idx, to_idx):
        batch_nearest_neighbor[(args.batch_id, batch)].add((i, j))

    outs.batch_nearest_neighbor = martian.make_path(
        'batch_nearest_neighbor.binary')
    with open(outs.batch_nearest_neighbor, 'wb') as fp:
        serialize_batch_nearest_neighbor(fp, batch_nearest_neighbor)

    return