def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    chunk_out = chunk_outs[0]
    cr_io.copy(chunk_out.pca_h5, outs.pca_h5)
    cr_io.copytree(chunk_out.pca_csv, outs.pca_csv)
def main(args, outs):
    list_of_files = [
        args.protospacer_calls_summary,
        args.protospacer_calls_per_cell,
        args.cells_per_protospacer,
        args.protospacer_umi_thresholds_csv,
        args.protospacer_umi_thresholds_json,
        args.perturbation_efficiencies_by_feature,
        args.perturbations_efficiencies_by_target,
    ]

    cr_io.makedirs(outs.crispr_analysis, allow_existing=True)

    for (file_path, file_name) in itertools.izip(list_of_files,
                                                 protospacer_calling.CRISPR_ANALYSIS_FILE_NAMES):
        if file_path is None:
            continue
        cr_io.copy(file_path, os.path.join(outs.crispr_analysis, file_name))

    if os.path.isdir(args.perturbation_effects_by_feature):
        perturbation_effects_by_feature_dir = os.path.join(outs.crispr_analysis,
                                                           'perturbation_effects_by_feature')
        cr_io.makedirs(perturbation_effects_by_feature_dir, allow_existing=True)
        cr_io.copytree(args.perturbation_effects_by_feature,
                       perturbation_effects_by_feature_dir,
                       allow_existing=True)

    if os.path.isdir(args.perturbation_effects_by_target):
        perturbation_effects_by_target_dir = os.path.join(outs.crispr_analysis,
                                                          'perturbation_effects_by_target')
        cr_io.makedirs(perturbation_effects_by_target_dir, allow_existing=True)
        cr_io.copytree(args.perturbation_effects_by_target,
                       perturbation_effects_by_target_dir,
                       allow_existing=True)
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip or not args.is_multi_genome:
        return

    chunk_out = chunk_outs[0]
    cr_io.copy(chunk_out.multi_genome_summary, outs.multi_genome_summary)
    cr_io.copytree(chunk_out.multi_genome_csv, outs.multi_genome_csv)
    cr_io.copytree(chunk_out.multi_genome_json, outs.multi_genome_json)
def main(args, outs):
    if args.skip:
        return

    for h5, csv in zip(args.pca_h5_list, args.pca_csv_list):
        if h5 is not None and csv is not None:
            cr_io.copy(h5, outs.pca_h5)
            cr_io.copytree(csv, outs.pca_csv)
def main(args, outs):
    if args.selector:
        if args.peaks1 is None:
            raise IOError("Input peaks file 1 is not present")
        cr_io.copy(args.peaks1, outs.peaks)
    else:
        if args.peaks2 is None:
            raise IOError("Input peaks file 2 is not present")
        cr_io.copy(args.peaks2, outs.peaks)
def main(args, outs):
    parsed = parse_parameters(args.params_csv)
    for param in ANALYSIS_PARAMS:
        if param in parsed:
            setattr(outs, param, parsed[param])
        else:
            setattr(outs, param, None)

    if args.params_csv is not None:
        cr_io.copy(args.params_csv, outs.params_csv)
def write_genome_fasta(self, out_fasta_fn):
    if len(self.genomes) > 1:
        with open(out_fasta_fn, 'w') as f:
            for genome_prefix, in_fasta_fn in itertools.izip(self.genome_prefixes, self.in_fasta_fns):
                with open(in_fasta_fn, 'r') as g:
                    for line in g:
                        line = line.strip()
                        if line.startswith('>'):
                            line = '>' + genome_prefix + '_' + line[1:]
                        f.write(line + '\n')
    else:
        cr_io.copy(self.in_fasta_fns[0], out_fasta_fn)
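# Illustration of the multi-genome branch above, assuming two genomes with the
# hypothetical prefixes 'GRCh38' and 'mm10':
#   '>chr1' read from the GRCh38 input FASTA is written as '>GRCh38_chr1'
#   '>chr1' read from the mm10 input FASTA is written as '>mm10_chr1'
# Sequence lines are copied through unchanged; with a single genome the input
# FASTA is copied verbatim.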
def get_gem_group_index_json(args, outs):
    if args.gem_group_index_json:
        cr_io.copy(args.gem_group_index_json, outs.gem_group_index_json)
    else:
        generated_index = cr_matrix.get_gem_group_index(args.feature_barcode_matrix)
        if generated_index is not None:
            with open(outs.gem_group_index_json, 'w') as outfile:
                tk_json.dump_numpy({"gem_group_index": generated_index}, outfile)
        else:
            outs.gem_group_index_json = None
    return outs.gem_group_index_json
def join(args, outs, chunk_defs, chunk_outs):
    chunk_out = chunk_outs[0]
    cr_io.copy(chunk_out.web_summary, outs.web_summary)
    cr_io.copy(chunk_out.alerts, outs.alerts)
    cr_io.copy(chunk_out.metrics_summary_json, outs.metrics_summary_json)
    cr_io.copy(chunk_out.metrics_summary_csv, outs.metrics_summary_csv)
def join(args, outs, chunk_defs, chunk_outs):
    # Copy files from single chunk to join
    for out_name in ['summary',
                     'clonotype_assignments',
                     'contig_annotations',
                     'contig_annotations_csv',
                     'filtered_contig_annotations_csv',
                     'contig_annotations_pickle',
                     ]:
        src = getattr(chunk_outs[0], out_name)
        dest = getattr(outs, out_name)

        if os.path.isfile(src):
            cr_io.copy(src, dest)
        else:
            setattr(outs, out_name, None)
def main(args, outs):
    if args.selector:
        if args.cell_barcodes1 is None:
            raise IOError("Input barcodes file 1 is not present")
        cr_io.copy(args.cell_barcodes1, outs.cell_barcodes)
        cr_io.copy(args.metrics1, outs.metrics)
    else:
        if args.cell_barcodes2 is None:
            raise IOError("Input barcodes file 2 is not present")
        cr_io.copy(args.cell_barcodes2, outs.cell_barcodes)
        cr_io.copy(args.metrics2, outs.metrics)
def join(args, outs, chunk_defs, chunk_outs):
    cr_io.copy(args.extract_reads_summary, outs.summary)
    cr_io.copy(args.barcode_counts, outs.barcode_counts)
    cr_io.copy(args.feature_counts, outs.feature_counts)

    outs.gem_groups = args.gem_groups
    outs.library_types = args.library_types
    outs.library_ids = args.library_ids
    outs.read_groups = args.read_groups
    outs.align = args.align
    outs.bam_comments = args.bam_comments

    outs.read1s = [co.read1s for co in chunk_outs]
    outs.read2s = [co.read2s for co in chunk_outs]
    outs.tags = [co.tags for co in chunk_outs]
def main(args, outs):
    if args.read1 is not None:
        # Ensure same extension
        out_path, _ = cr_utils.splitexts(outs.read1s)
        _, in_ext = cr_utils.splitexts(args.read1)
        outs.read1s = out_path + in_ext
        cr_io.copy(args.read1, outs.read1s)

    if args.read2 is not None:
        out_path, _ = cr_utils.splitexts(outs.read2s)
        _, in_ext = cr_utils.splitexts(args.read2)
        outs.read2s = out_path + in_ext
        cr_io.copy(args.read2, outs.read2s)

    if args.chunk_tags is not None:
        out_path, _ = cr_utils.splitexts(outs.tags)
        _, in_ext = cr_utils.splitexts(args.chunk_tags)
        outs.tags = out_path + in_ext
        cr_io.copy(args.chunk_tags, outs.tags)
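The stage above rewrites each declared out path so it carries the input file's extension before copying (so, for example, a '.fastq.gz' input is not written under a bare '.fastq' name). A minimal standalone sketch of that pattern, assuming a splitexts-like helper that treats a trailing '.gz' as part of the extension; the names below are hypothetical and cr_utils.splitexts itself may differ in detail:

import os
import shutil

def splitexts_sketch(path):
    # 'reads.fastq.gz' -> ('reads', '.fastq.gz'); 'reads.fastq' -> ('reads', '.fastq')
    base, ext = os.path.splitext(path)
    if ext == '.gz':
        base, inner = os.path.splitext(base)
        ext = inner + ext
    return base, ext

def copy_with_matching_extension(src, declared_out):
    # Keep the declared out's directory/stem but adopt the source's extension.
    out_base, _ = splitexts_sketch(declared_out)
    _, in_ext = splitexts_sketch(src)
    out_path = out_base + in_ext
    shutil.copy(src, out_path)
    return out_path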
def join(args, outs, chunk_defs, chunk_outs):
    summary_files = [
        args.extract_reads_summary,
        args.correct_barcodes_summary,
        args.trim_reads_summary,
    ]
    summary_files = [sum_file for sum_file in summary_files if sum_file is not None]
    cr_report.merge_jsons(summary_files, outs.summary)

    cr_io.copy(args.raw_barcode_counts, outs.raw_barcode_counts)
    cr_io.copy(args.corrected_barcode_counts, outs.corrected_barcode_counts)
    cr_io.copy(args.barcode_summary, outs.barcode_summary)

    outs.gem_groups = args.gem_groups
    outs.read_groups = args.read_groups
    outs.align = args.align
    outs.bam_comments = args.bam_comments

    outs.read1s = [co.read1s for co in chunk_outs]
    outs.read2s = [co.read2s for co in chunk_outs]
    outs.corrected_bcs = [co.corrected_bcs for co in chunk_outs]
def join(args, outs, chunk_defs, chunk_outs):
    outs.chain_type = chunk_outs[0].chain_type
    cr_io.copy(chunk_outs[0].summary, outs.summary)
def join(args, outs, chunk_defs, chunk_outs):
    for infile, outfile in zip([args.fragments, args.fragments_index, args.aggr_csv],
                               [outs.fragments, outs.fragments_index, outs.aggr_csv]):
        if infile is None:
            outfile = infile
        else:
            cr_io.copy(infile, outfile)
def main(args, outs):
    cr_io.copy(args.trim_reads_summary, outs.summary)
def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path,
                                       genome_fasta_path, reference_path,
                                       reference_name, ref_version, mkref_version):
    """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files.

    Input files are concatenated. No attempt to merge/reconcile information
    across them is made. Providing the files in a different order might change
    the output in cases where there are multiple entries with the same
    transcript id and the same feature type (e.g. V-region).
    """
    transcripts = collections.defaultdict(list)

    if transcripts_to_remove_path:
        with open(transcripts_to_remove_path) as f:
            rm_transcripts = set([line.strip() for line in f.readlines()])
    else:
        rm_transcripts = set()

    # Note: We cannot symlink here because some filesystems in the wild
    # do not support symlinks.
    print 'Copying genome reference sequence...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta')
    cr_io.copy(genome_fasta_path, tmp_genome_fa_path)
    print '...done.\n'

    print 'Indexing genome reference sequence...'
    tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path])
    print '...done.\n'

    print 'Loading genome reference sequence...'
    genome_fasta = pysam.FastaFile(tmp_genome_fa_path)
    print '...done.\n'

    print 'Computing hash of genome FASTA file...'
    fasta_hash = cr_io.compute_hash_of_file(tmp_genome_fa_path)
    print '...done.\n'

    for gtf in gtf_paths:
        print 'Reading GTF {}'.format(gtf)

        for line_no, entry in enumerate(get_gtf_iter(open(gtf))):
            if entry.feature not in [ENSEMBL_FIVE_PRIME_UTR_FEATURE, ENSEMBL_CDS_FEATURE]:
                continue
            entry = parse_attributes(entry)
            transcript_id = entry.attributes.get('transcript_id')
            transcript_biotype = entry.attributes.get('transcript_biotype')
            gene_biotype = entry.attributes.get('gene_biotype')
            gene_name = entry.attributes.get('gene_name')

            # Skip irrelevant biotypes
            if transcript_biotype not in ENSEMBL_VDJ_BIOTYPES and gene_biotype not in ENSEMBL_VDJ_BIOTYPES:
                continue

            # Skip blacklisted transcripts
            if transcript_id in rm_transcripts:
                continue

            # Warn and skip if transcript_id missing
            if transcript_id is None:
                print 'Warning: Entry on row %d has no transcript_id' % line_no
                continue

            # Warn and skip if gene_name missing
            if gene_name is None:
                print 'Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % (transcript_id, line_no, transcript_biotype)
                continue

            # Infer region type from biotype
            if transcript_biotype in ENSEMBL_VDJ_BIOTYPES:
                vdj_feature = infer_ensembl_vdj_feature_type(entry.feature, transcript_biotype)
            else:
                vdj_feature = infer_ensembl_vdj_feature_type(entry.feature, gene_biotype)

            # Warn and skip if region type could not be inferred
            if vdj_feature is None:
                print 'Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % (transcript_id, transcript_biotype)
                continue

            # Features that share a transcript_id and feature type are presumably exons,
            # so keep them together.
            transcripts[(transcript_id, vdj_feature)].append(entry)

        print '...done.\n'

    print 'Computing hash of genes GTF files...'
    digest = hashlib.sha1()
    # Concatenate all the hashes into a string and then hash that string
    digest.update(reduce(lambda x, y: x + y, [cr_io.compute_hash_of_file(gtf) for gtf in gtf_paths]))
    gtf_hash = digest.hexdigest()
    print '...done.\n'

    print 'Fetching sequences...'
    out_fasta = open(get_vdj_reference_fasta(reference_path), 'w')

    feature_id = 1
    seen_features = set()

    for (transcript_id, region_type), regions in transcripts.iteritems():
        if not all(r.chrom == regions[0].chrom for r in regions):
            chroms = sorted(list(set([r.chrom for r in regions])))
            print 'Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (transcript_id, str(chroms))
            continue

        if not all(r.strand == regions[0].strand for r in regions):
            print 'Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id
            continue

        chrom = regions[0].chrom
        strand = regions[0].strand
        ens_gene_name = standardize_ensembl_gene_name(regions[0].attributes['gene_name'])
        transcript_id = regions[0].attributes['transcript_id']

        if chrom not in genome_fasta:
            print 'Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (transcript_id, chrom)
            continue

        # Build sequence
        regions.sort(key=lambda r: r.start)
        seq = ''
        for region in regions:
            # GTF coordinates are 1-based
            start, end = int(region.start) - 1, int(region.end)
            seq += genome_fasta.fetch(chrom, start, end)

        # Revcomp if transcript on reverse strand
        if strand == '-':
            seq = tk_seq.get_rev_comp(seq)

        # Strip Ns from termini
        if 'N' in seq:
            print 'Warning: Feature %s contains Ns. Stripping from the ends.' % str((ens_gene_name, transcript_id, region_type))
            seq = seq.strip('N')

        if len(seq) == 0:
            print 'Warning: Feature %s is all Ns. Skipping.' % str((ens_gene_name, transcript_id, region_type))
            continue

        # Infer various attributes from the Ensembl gene name
        record_id = transcript_id
        gene_name = ens_gene_name
        display_name = make_display_name(gene_name=gene_name, allele_name=None)
        chain = infer_ensembl_vdj_chain(gene_name)
        chain_type = infer_ensembl_vdj_chain_type(gene_name)
        # Ensembl doesn't encode alleles
        allele_name = '00'

        # Disallow spaces in these fields
        if ' ' in region_type:
            raise ValueError('Spaces not allowed in region type: "%s"' % region_type)
        if ' ' in gene_name:
            raise ValueError('Spaces not allowed in gene name: "%s"' % gene_name)
        if ' ' in record_id:
            raise ValueError('Spaces not allowed in record ID: "%s"' % record_id)

        # Warn on features we couldn't classify properly
        if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
            print ('Warning: Could not infer chain type for: %s. ' +
                   'Expected the first two characters of the gene name to be in %s. Feature skipped.') % \
                (str((gene_name, record_id, region_type)),
                 str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
            continue

        if region_type in vdj_constants.VDJ_C_FEATURE_TYPES and chain in vdj_constants.CHAINS_WITH_ISOTYPES:
            isotype = infer_ensembl_isotype(ens_gene_name)
        else:
            isotype = None

        feature = VdjAnnotationFeature(feature_id=feature_id,
                                       record_id=record_id,
                                       display_name=display_name,
                                       gene_name=gene_name,
                                       region_type=region_type,
                                       chain_type=chain_type,
                                       chain=chain,
                                       isotype=isotype,
                                       allele_name=allele_name,
                                       sequence=seq,
                                       )

        # Don't add duplicate entries
        feat_key = get_duplicate_feature_key(feature)
        if feat_key in seen_features:
            print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (display_name, region_type, record_id)
            continue
        seen_features.add(feat_key)

        feature_id += 1

        out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
    print '...done.\n'

    print 'Deleting copy of genome fasta...'
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: ','.join([os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    with open(os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE), 'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata), json_file, sort_keys=True, indent=4)
    print '...done.\n'
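A hypothetical invocation of the builder above; the file names and version strings are illustrative only and assume the listed ENSEMBL GTF and genome FASTA exist locally:

# Hypothetical usage sketch, not taken from any actual pipeline run.
build_reference_fasta_from_ensembl(
    gtf_paths=['Homo_sapiens.GRCh38.94.chr.gtf'],
    transcripts_to_remove_path=None,
    genome_fasta_path='Homo_sapiens.GRCh38.dna.primary_assembly.fa',
    reference_path='vdj_GRCh38_ensembl',
    reference_name='GRCh38-vdj-ensembl',
    ref_version='94',
    mkref_version='3.0.2')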
def join(args, outs, chunk_defs, chunk_outs):
    chunk_out = chunk_outs[0]
    cr_io.copy(chunk_out.summary, outs.summary)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge sorted, downsampled fragments from each chunk, emit pre- and
    post-normalization sensitivity metrics per library, and merge input peaks
    if provided."""
    with open(args.library_info, 'r') as f:
        library_info = pickle.load(f)

    ctg_mgr = ReferenceManager(args.reference_path)

    # Merge cell_barcodes
    cell_barcodes = {}
    for group in library_info:
        cell_barcodes_group = get_cell_barcodes(library_info[group]['cells'], args.reference_path, with_species=True)
        group_suffix = "-{}".format(group)
        for species in cell_barcodes_group.keys():
            if species not in cell_barcodes.keys():
                cell_barcodes[species] = set()
            cell_barcodes[species].update({bc.split("-")[0] + group_suffix for bc in cell_barcodes_group[species]})
    with open(outs.cell_barcodes, 'w') as f:
        for species in cell_barcodes:
            f.write(species + "," + ",".join(cell_barcodes[species]) + "\n")

    # Merge peaks if provided
    input_peaks = [library_info[group]['peaks'] for group in library_info if 'peaks' in library_info[group]]
    if len(input_peaks) == 1:
        cr_io.copy(input_peaks[0], outs.peaks)
        outs.skip_peakcalling = True
    if len(input_peaks) == 0:
        outs.peaks = 0
        outs.skip_peakcalling = False
    if len(input_peaks) > 1:
        outs.skip_peakcalling = True
        # cat
        with open(outs.peaks, 'w') as outf:
            for ip in input_peaks:
                with open(ip, 'r') as inf:
                    for line in inf:
                        outf.write(line)
        # sort
        peaks = BedTool(outs.peaks)
        peaks = peaks.sort(faidx=ctg_mgr.fasta_index)
        # merge
        peaks = peaks.merge(d=PEAK_MERGE_DISTANCE)
        peaks.saveas(outs.peaks)

    # Override library name when aggregating a single library
    if len(library_info) == 1:
        library_info[1]['library_info'] = ""

    # Merge the metrics
    normalization_metrics = {}
    for cdef, cout in zip(chunk_defs, chunk_outs):
        with open(cout.normalization_metrics, 'r') as f:
            chunk_metrics = json.load(f)
        for key in chunk_metrics:
            normalization_metrics["{}_Library_{}".format(key, library_info[cdef.n]['library_id'])] = chunk_metrics[key]
            # Aggregate some metrics across all libraries
            if key in ['total_pre_normalization', 'total_post_normalization']:
                if key not in normalization_metrics:
                    normalization_metrics[key] = 0
                normalization_metrics[key] += chunk_metrics[key]
    with open(outs.normalization_metrics, 'w') as f:
        json.dump(normalization_metrics, f, indent=4)

    # Merge the fragments
    base_file, extension = os.path.splitext(outs.fragments)
    if extension != '.gz':
        raise ValueError('Expecting compressed file output')
    input_tsvs = [str(chunk.fragments) for chunk in chunk_outs]
    merge_keyed_bed(input_tsvs, base_file, threads=martian.get_threads_allocation())

    # Index the fragments
    if os.path.getsize(base_file) == 0:
        outs.fragments = None
        outs.fragments_index = None
    else:
        # N.B. tabix_index will automatically compress the input file, adding the .gz suffix
        pysam.tabix_index(base_file, preset='bed', index=outs.fragments_index)
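A condensed sketch of the multi-library peak-merging step above (concatenate the per-library BED files, sort against the reference .fai, then merge intervals within the merge distance). File names and the distance value are hypothetical; PEAK_MERGE_DISTANCE is whatever the pipeline defines:

from pybedtools import BedTool

# Hypothetical inputs: two per-library peak BED files and a genome .fai index.
with open('combined_peaks.bed', 'w') as out:
    for bed in ['peaks_library1.bed', 'peaks_library2.bed']:
        with open(bed) as f:
            out.write(f.read())

merged = BedTool('combined_peaks.bed').sort(faidx='genome.fa.fai').merge(d=500)
merged.saveas('merged_peaks.bed')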
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)
        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_io.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta, shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_io.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.umi_summary_tsv, umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        # risks hitting the filehandle limit.
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_io.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_io.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_io.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_io.merge_jsons_single_level([out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary), f, indent=4, sort_keys=True)
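The BAM handling above merges at most MERGE_BAMS_N files per call to stay under the open-file-handle limit, re-queuing each partial merge until a single file remains. A generic, standalone sketch of that batched-merge pattern, with a hypothetical merge_fn standing in for tk_bam.merge:

import os
import shutil
import tempfile

def merge_in_batches(inputs, output, merge_fn, batch_size=64):
    # merge_fn(dst, srcs) must merge the files in srcs into dst (e.g. a
    # samtools-merge wrapper). Hypothetical helper, not the tk_bam API.
    # Assumes at least one input file.
    pending = list(inputs)
    while len(pending) > 1:
        batch, pending = pending[:batch_size], pending[batch_size:]
        fd, tmp = tempfile.mkstemp(suffix='.merged')
        os.close(fd)
        merge_fn(tmp, batch)
        # Partial merges are intermediates: drop them once they have been consumed.
        for path in batch:
            if path not in inputs:
                os.remove(path)
        # Re-queue the partial merge so it joins the next round.
        pending.append(tmp)
    if pending[0] in inputs:
        shutil.copy(pending[0], output)   # only one input file: keep it intact
    else:
        shutil.move(pending[0], output)   # final partial merge becomes the output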
def join(args, outs, chunk_defs, chunk_outs):
    cr_io.copy(chunk_outs[0].summary, outs.summary)
    if chunk_outs[0].report is not None:
        cr_io.copy(chunk_outs[0].report, outs.report)
    outs.chemistry_type = chunk_outs[0].chemistry_type