def get_vdj_feature_iter(reference_path):
    """ Yield vdj features from a vdj reference fasta file """
    if reference_path is None:
        return

    for header, sequence in cr_utils.get_fasta_iter(open(get_vdj_reference_fasta(reference_path))):
        yield parse_fasta_entry(header, sequence)
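# Example usage (a minimal sketch; the reference path below is hypothetical,
# and the printed fields are those of VdjAnnotationFeature as used below):
#
#   for feat in get_vdj_feature_iter('/refs/my_vdj_ref'):
#       print feat.display_name, feat.region_type, feat.chain_type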
def check(input, trimmer_out, cutadapt_out):
    correct_untrimmed = 0     # Not trimmed by both
    correct_trimmed = 0       # Consistently trimmed by both
    rust_trimmed_only = 0     # Trimmed only by rust
    rust_untrimmed_only = 0   # Not trimmed only by rust
    inconsistent_trimmed = 0  # Trimmed inconsistently

    with open(cutadapt_out) as fcut, open(trimmer_out) as frust, open(input) as finput:
        cutiter = cr_utils.get_fasta_iter(fcut)
        rustiter = cr_utils.get_fasta_iter(frust)
        inputiter = cr_utils.get_fasta_iter(finput)
        for (c, r, u) in zip(cutiter, rustiter, inputiter):
            if c == u and r == u:
                correct_untrimmed += 1
            elif c == u:
                rust_trimmed_only += 1
            elif r == u:
                rust_untrimmed_only += 1
            elif c == r or abs(len(r[1]) - len(c[1])) <= 2:
                # Allow up to a slop of two bases
                correct_trimmed += 1
            else:
                inconsistent_trimmed += 1

    # Sensitivity = (Rust & Cutadapt) / Cutadapt
    sensitivity = float(correct_trimmed + inconsistent_trimmed) / float(
        correct_trimmed + inconsistent_trimmed + rust_untrimmed_only)

    # PPV = (Rust & Cutadapt) / Rust
    ppv = float(correct_trimmed + inconsistent_trimmed) / float(
        correct_trimmed + inconsistent_trimmed + rust_trimmed_only)

    # Concordance = (Rust & Cutadapt & Rust==Cutadapt) / (Rust & Cutadapt)
    concordance = float(correct_trimmed) / float(
        correct_trimmed + inconsistent_trimmed)

    return sensitivity, ppv, concordance
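# Worked example of the metrics above (made-up counts for illustration, not
# benchmark results): with correct_trimmed=90, inconsistent_trimmed=5,
# rust_untrimmed_only=5 and rust_trimmed_only=10:
#
#   sensitivity = (90 + 5) / (90 + 5 + 5)  = 0.950
#   ppv         = (90 + 5) / (90 + 5 + 10) = 0.905
#   concordance =  90      / (90 + 5)      = 0.947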
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = collections.defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    for annotation in iter(json.load(open(args.annotations))):
        contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t',
                                     dtype={'component': int,
                                            'num_reads': int,
                                            'num_pairs': int,
                                            'num_umis': int,
                                            'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None, sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)):
        contig_name = contig_hdr.split(' ')[0]
        if filter_summary is not None and not vdj_utils.is_contig_filtered(filter_summary, contig_name):
            continue

        barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
        barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary, annotations, reference)

    reporter.report_summary_json(outs.summary)
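# Note on the per-barcode grouping above (a sketch; the contig name is
# hypothetical): the assembler is assumed to embed the cell barcode at the
# start of each contig name, which is what
# vdj_utils.get_barcode_from_contig_name parses out, e.g.
#
#   vdj_utils.get_barcode_from_contig_name('AAACCTGAGAAACCAT-1_contig_1')
#   # -> 'AAACCTGAGAAACCAT-1'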
def build_reference_fasta_from_fasta(fasta_path, reference_path, reference_name,
                                     ref_version, mkref_version):
    """Create cellranger-compatible vdj reference files from a
    V(D)J segment FASTA file.
    """
    seen_features = set()
    seen_ids = set()
    features = []

    print 'Checking FASTA entries...'
    with open(fasta_path) as f:
        for header, sequence in cr_utils.get_fasta_iter(f):
            feat = parse_fasta_entry(header, sequence)

            # Enforce unique feature IDs
            if feat.feature_id in seen_ids:
                raise ValueError('Duplicate feature ID found in input FASTA: %d.' % feat.feature_id)
            # Sanity check values
            if ' ' in feat.region_type:
                raise ValueError('Spaces not allowed in region type: "%s"' % feat.region_type)
            if ' ' in feat.gene_name:
                raise ValueError('Spaces not allowed in gene name: "%s"' % feat.gene_name)
            if ' ' in feat.record_id:
                raise ValueError('Spaces not allowed in record ID: "%s"' % feat.record_id)

            key = get_duplicate_feature_key(feat)
            if key in seen_features:
                print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (feat.display_name,
                                                                              feat.region_type,
                                                                              feat.record_id)
                continue

            # Strip Ns from termini
            seq = feat.sequence
            if 'N' in seq:
                print 'Warning: Feature %s contains Ns. Stripping from the ends.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                seq = seq.strip('N')

            if len(seq) == 0:
                print 'Warning: Feature %s is all Ns. Skipping.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                continue

            # Warn on features we couldn't classify properly
            if feat.chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
                print 'Warning: Unknown chain type for: %s. Expected name to be in %s. Skipping.' % \
                    (str((feat.display_name, feat.record_id, feat.region_type)),
                     str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
                continue

            seen_ids.add(feat.feature_id)
            seen_features.add(key)

            # Update the sequence since we may have modified it
            feat_dict = feat._asdict()
            feat_dict.update({'sequence': seq})
            new_feat = VdjAnnotationFeature(**feat_dict)
            features.append(new_feat)
    print '...done.\n'

    print 'Writing sequences...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    with open(get_vdj_reference_fasta(reference_path), 'w') as out_fasta:
        for feat in features:
            out_fasta.write(convert_vdj_feature_to_fasta_entry(feat) + '\n')
    print '...done.\n'

    print 'Computing hash of input FASTA file...'
    fasta_hash = cr_utils.compute_hash_of_file(fasta_path)
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: None,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: None,
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    with open(os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE), 'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata), json_file, sort_keys=True, indent=4)
    print '...done.\n'
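# Example invocation (a minimal sketch; every path, name and version string
# below is hypothetical):
#
#   build_reference_fasta_from_fasta('vdj_segments.fasta',
#                                    '/refs/my_vdj_ref',
#                                    'my_vdj_ref',
#                                    ref_version='1.0',
#                                    mkref_version='0.1')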
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    contigs.sort(key=lambda c: (c.barcode, c.get_single_chain(),
                                not c.productive, -c.umi_count, -c.read_count, -len(c)))

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(contigs,
                                                key=lambda c: (c.barcode, c.get_single_chain())):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive,UMI,read,length)
            #    contig's CDR3 with a single UMI or low UMIs relative to the first contig, or
            extraneous_cdr3 = first_cdr3 is not None \
                and contig.cdr3_seq != first_cdr3 \
                and (contig.umi_count == 1 or
                     (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac', chain).add(
                    1, filter=contig.high_confidence)
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                          cr_constants.MULTI_REFS_PREFIX).add(
                                              1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
         open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            # Keep contigs that are high confidence & in cells
            if hdr not in low_confidence_contigs and hdr in cell_contigs:
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
         open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if name not in low_confidence_contigs and name in cell_contigs:
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
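# Illustration of the low-confidence rules above (made-up numbers): within one
# (barcode, chain) group sorted as above, suppose the top contig has CDR3 'A'
# with 20 UMIs. A second contig with CDR3 'B' and 1 UMI is extraneous (single
# UMI); one with CDR3 'B' and 2 UMIs is extraneous only if
# 2.0 / 20 < EXTRA_CONTIG_MIN_UMI_RATIO; and any later contig repeating 'A' or
# 'B' is flagged via repeat_cdr3.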
def main(args, outs):
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return

    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Setup primer reference sequences
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(
            args.primers, vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)

    if args.filter_summary:
        try:
            filter_summary = vdj_utils.load_contig_summary_table(open(args.filter_summary))
        except EmptyDataError:
            filter_summary = None
    else:
        filter_summary = None

    if args.contigs_fastq is not None:
        fq_iter = tk_fasta.read_generator_fastq(open(args.contigs_fastq), paired_end=False)

    for header, contig_sequence in cr_utils.get_fasta_iter(open(args.contigs)):
        if args.contigs_fastq is None:
            contig_quals = None
        else:
            header_fq, contig_sequence_fq, contig_quals = fq_iter.next()
            assert contig_sequence_fq == contig_sequence
            assert header_fq == header

        barcode = vdj_utils.get_barcode_from_contig_name(header)
        contig_name = header.split(' ')[0]

        # Only annotate barcodes assigned to this chunk and contigs with enough read support
        if barcode in barcodes_in_chunk:
            if filter_summary is not None:
                filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
            else:
                filtered = True

            contig = vdj_annot.AnnotatedContig(contig_name,
                                               contig_sequence,
                                               quals=contig_quals,
                                               barcode=barcode,
                                               is_cell=barcode in cell_barcodes_set,
                                               filtered=filtered,
                                               read_count=read_counts.get(contig_name),
                                               umi_count=umi_counts.get(contig_name),
                                               )
            contig.annotations = contig.annotate_features(feature_types,
                                                          feature_aligners,
                                                          feature_filters)
            if args.primers:
                contig.primer_annotations = contig.annotate_features_by_group(
                    primer_aligner, alignment_filter=primer_filter)
            contig.annotate_cdr3()
            chunk_contigs.append(contig)

    cPickle.dump(chunk_contigs, open(outs.chunked_annotations, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
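# Reading a chunk back (a minimal sketch; the file name is hypothetical):
#
#   with open('chunked_annotations.pickle', 'rb') as f:
#       contigs = cPickle.load(f)
#   for contig in contigs:
#       print contig.contig_name, contig.read_count, contig.umi_count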
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    for annotation in iter(json.load(open(args.annotations))):
        contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t',
                                     dtype={'component': int,
                                            'num_reads': int,
                                            'num_pairs': int,
                                            'num_umis': int,
                                            'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None, sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)):
        contig_name = contig_hdr.split(' ')[0]
        if filter_summary is not None and not vdj_utils.is_contig_filtered(filter_summary, contig_name):
            continue

        barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
        barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary, annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        total_read_pairs = cr_utils.get_metric_from_json(args.reads_summary, 'total_read_pairs')

        reporter._get_metric_attr('vdj_assemblable_read_pairs_per_filtered_bc').set_value(
            assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.
    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.itervalues():
        contig = vdj_annotations.AnnotatedContig.from_dict(anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    print chain_count

    if len(chain_count) > 0:
        n_contigs = sum(chain_count.itervalues())
        sig_chains = [ct for ct, count in chain_count.iteritems()
                      if tk_stats.robust_divide(count, n_contigs) >= MIN_CHAIN_TYPE_CONTIG_FRAC]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)
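# Worked example of the autodetection above (made-up counts; assumes, purely
# for illustration, MIN_CHAIN_TYPE_CONTIG_FRAC = 0.05): with
# chain_count = {'TR': 98, 'IG': 2}, only 'TR' reaches the 5% threshold, so
# outs.chain_type is set to 'TR'. With chain_count = {'TR': 60, 'IG': 40},
# both chains are significant and outs.chain_type stays at
# vdj_constants.ALL_CHAIN_TYPES.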