def test_chic_cigar_dedup(self):
    """The CHIC test region must collapse into exactly one molecule."""
    with pysam.AlignmentFile('./data/chic_test_region.bam') as alignments:
        # Exhaust the iterator and count how many molecules it yields.
        molecule_count = sum(
            1 for _ in MoleculeIterator(alignments, CHICMolecule, CHICFragment)
        )
    self.assertEqual(molecule_count, 1)
def _register_heterozygous_germline_snvs(allele_resolver,
                                         germline_variants_path,
                                         germline_variants_sample,
                                         contig, start, end):
    """Register heterozygous germline variants in ``allele_resolver``.

    Every variant fetched from ``germline_variants_path`` in
    ``contig:start-end`` which is heterozygous is stored in
    ``allele_resolver.locationToAllele`` with its first allele labelled
    ``'U'`` and its second allele labelled ``'V'``.

    Args:
        allele_resolver: resolver whose ``locationToAllele`` mapping is filled.
        germline_variants_path (str): path of the germline variant file.
        germline_variants_sample (str): sample which must be heterozygous;
            when None every sample in the record must be heterozygous.
        contig (str): contig to fetch from.
        start (int): start coordinate handed to ``fetch``.
        end (int): end coordinate handed to ``fetch``.
    """
    with pysam.VariantFile(germline_variants_path) as germline:
        for ar_variant in germline.fetch(contig, start, end):
            if germline_variants_sample is None:
                # Skip the record when any of the samples is not heterozygous.
                # (The previous code compared the alleles tuple to the
                # integer 2, which is always unequal, so every record was
                # skipped.)
                if any(len(set(ar_variant.samples[sample].alleles)) != 2
                       for sample in ar_variant.samples):
                    continue
            elif len(set(
                    ar_variant.samples[germline_variants_sample].alleles)) != 2:
                continue

            allele_resolver.locationToAllele[ar_variant.chrom][
                ar_variant.pos - 1] = {
                ar_variant.alleles[0]: {'U'},
                ar_variant.alleles[1]: {'V'}
            }


def recall_variants(args):
    """Call presence/absence of the supplied variants per sample.

    For every variant the molecules within a window around the variant are
    iterated. A molecule carrying the first alternative allele marks the
    variant as present (1) for its sample. A molecule carrying the reference
    allele marks the variant as absent (0) for its sample, but only when the
    molecule is phased to a germline variant which is also (consistently)
    phased to the alternative allele.

    Args:
        args (tuple): a 10-tuple of
            variants: iterable of variant records to recall,
            alignment_file_path (str): BAM file to recall the variants in,
            target_path: not used by this function,
            mode (str): 'NLA' selects NlaIIIMolecule/NlaIIIFragment, any
                other value selects Molecule/Fragment,
            germline_variants_path (str): germline variant file used for
                phasing, or None,
            germline_variants_sample (str): sample to use from the germline
                variant file, or None to require all samples heterozygous,
            germline_bam_path (str): germline BAM file; variants with reads
                supporting them in this file are skipped, or None,
            window_radius: currently overridden, see the note in the body,
            MAX_REF_MOLECULES: currently overridden, see the note in the body,
            max_buffer_size: forwarded to the MoleculeIterator.

    Returns:
        tuple: ``(variant_calls, locations_done)`` where ``variant_calls``
        maps sample -> {(contig, pos, ref, alt): 1 or 0} and
        ``locations_done`` is the set of variant keys which were processed
        completely.
    """
    (variants, alignment_file_path, target_path, mode,
     germline_variants_path, germline_variants_sample, germline_bam_path,
     window_radius, MAX_REF_MOLECULES, max_buffer_size) = args

    # NOTE(review): the two values unpacked from args are overridden here, so
    # whatever the caller supplies is ignored. Kept as-is to not change
    # results; confirm whether the caller-supplied values should be honoured.
    window_radius = 600
    # Maximum amount of reference molecules to process.
    # This is capped for regions to which many reads map (mapping artefact)
    MAX_REF_MOLECULES = 1_000

    variant_calls = dict()  # sample -> {variant_key: 1 (alt) or 0 (ref)}
    locations_done = set()

    # Set up molecule iterator (1/2): select the molecule/fragment classes
    if mode == 'NLA':
        mc = NlaIIIMolecule
        fc = NlaIIIFragment
    else:
        mc = Molecule
        fc = Fragment

    alignments = pysam.AlignmentFile(alignment_file_path, threads=4)
    germline_alignments = None
    try:
        if germline_bam_path is not None:
            germline_alignments = pysam.AlignmentFile(germline_bam_path,
                                                      threads=4)

        for variant in variants:
            # Check if the variant is present in the germline bam file
            # (if supplied)
            if germline_alignments is not None and has_variant_reads(
                    germline_alignments,
                    variant.chrom,
                    variant.pos - 1,
                    variant.alts[0],
                    min_reads=1,
                    stepper='nofilter'):
                print(f'FOUND IN GERMLINE {variant}')
                continue

            reference_start = max(0, variant.pos - window_radius)
            reference_end = variant.pos + window_radius
            contig = variant.contig
            variant_key = (contig, variant.pos, variant.ref, variant.alts[0])

            # Set up allele resolver
            unphased_allele_resolver = \
                singlecellmultiomics.alleleTools.AlleleResolver(
                    use_cache=False, phased=False, verbose=True)
            if germline_variants_path is not None:
                _register_heterozygous_germline_snvs(
                    unphased_allele_resolver,
                    germline_variants_path,
                    germline_variants_sample,
                    variant.chrom, reference_start, reference_end)

            # (chrom, pos, base) of germline variants seen together with the
            # alternative allele -> amount of molecules
            alt_phased = Counter()

            # Set up molecule iterator (2/2)
            try:
                molecule_iter = MoleculeIterator(
                    alignments, mc, fc,
                    contig=contig,
                    start=reference_start,
                    end=reference_end,
                    molecule_class_args={
                        'allele_resolver': unphased_allele_resolver,
                        'max_associated_fragments': 20,
                    },
                    max_buffer_size=max_buffer_size)

                reference_called_molecules = []  # (molecule, phased variants)

                for molecule in molecule_iter:
                    base_call = get_molecule_base_calls(molecule, variant)
                    if base_call is None:
                        continue
                    base, _quality = base_call

                    if base == variant.alts[0]:
                        call = 'A'
                        variant_calls.setdefault(
                            molecule.sample, {})[variant_key] = 1
                    elif base == variant.ref:
                        call = 'R'
                    else:
                        continue

                    # Obtain all germline variants which are phased :
                    phased = get_phased_variants(molecule,
                                                 unphased_allele_resolver)

                    if call == 'A':
                        for phased_variant in phased:
                            chrom, pos, phased_base = phased_variant
                            alt_phased[(chrom, pos, phased_base)] += 1
                    elif (len(phased) > 0
                          and len(reference_called_molecules)
                          < MAX_REF_MOLECULES):
                        # If we can phase the alternative allele to a germline
                        # variant the reference calls can indicate absence
                        reference_called_molecules.append((molecule, phased))

            except MemoryError:
                # The variant is not added to locations_done in this case
                print(f"Buffer exceeded for {variant.contig} {variant.pos}")
                continue

            if alt_phased and reference_called_molecules:
                # Clean the alt_phased variants for variants which are
                # not >90% the same
                alt_phased_filtered = filter_alt_calls(alt_phased, 0.9)
                for molecule, phased_gsnvs in reference_called_molecules:
                    if any(p in alt_phased_filtered for p in phased_gsnvs):
                        variant_calls.setdefault(
                            molecule.sample, {})[variant_key] = 0

            locations_done.add(variant_key)
    finally:
        # Release the file handles, also when an error is raised
        alignments.close()
        if germline_alignments is not None:
            germline_alignments.close()

    return variant_calls, locations_done
def Misc_contig_molecule_generator(molecule_iterator_args):
    """Yield the molecules of every contig which is not a main chromosome.

    The 'contig' entry of ``molecule_iterator_args`` is overwritten in place
    for each selected reference of ``input_bam`` before a MoleculeIterator is
    created from the arguments.
    """
    alternative_contigs = (
        name for name in input_bam.references if not is_main_chromosome(name)
    )
    for name in alternative_contigs:
        molecule_iterator_args['contig'] = name
        yield from MoleculeIterator(**molecule_iterator_args)
def run_multiome_tagging(args):
    """
    Run multiome tagging, adds molecule information to the reads of a bam file

    The attributes of ``args`` which are read:

    Arguments:
        bamin (str) : bam file to process

        o (str) : path to output bam file, has to end in .bam

        method (str) : Method name, what kind of molecules need to be extracted. Select from:
            nla (Data with digested by Nla III enzyme)
            nla_no_overhang (requires a reference)
            qflag (Only add basic tags like sample and UMI, no molecule assignment)
            chic (Data digested using mnase fusion)
            nla_transcriptome (Data with transcriptome and genome digested by Nla III )
            vasa (VASA transcriptomic data)
            cs (CELseq data, 1 and 2)
            cs_feature_counts (Single end, deduplicate using a bam file tagged using featurecounts, deduplicates a umi per gene)
            fl_feature_counts (deduplicate using a bam file tagged using featurecounts, deduplicates based on fragment position)
            episeq
            nla_taps (Data with digested by Nla III enzyme and methylation converted by TAPS)
            chic_taps (Data with digested by mnase enzyme and methylation converted by TAPS)
            chic_nla (requires a reference)
            scartrace (lineage tracing protocol)

        qflagger (str) : Query flagging algorithm to use, this algorithm extracts UMI and sample information from your reads. When no query flagging algorithm is specified, the `singlecellmultiomics.universalBamTagger.universalBamTagger.QueryNameFlagger` is used

        custom_flags (str) : Arguments passed to the query name flagger, comma separated "MI,RX,bi,SM"

        ref (str) : Path to reference fasta file, autodetected from bam header when not supplied

        umi_hamming_distance (int) : Max hamming distance on UMI's

        head (int) : Amount of molecules to process

        contig (str) : only process this contig

        region_start (int) : Zero based start coordinate of single region to process

        region_end (int) : Zero based end coordinate of single region to process, None: all contigs when contig is not set, complete contig when contig is set.

        alleles (str) : path to allele VCF

        allele_samples (str) : Comma separated samples to extract from the VCF file. For example B6,SPRET

        unphased_alleles (str) : Path to VCF containing unphased germline SNPs

        mapfile (str) : Path to \\*.safe.bgzf file, used to decide if molecules are uniquely mappable, generate one using createMapabilityIndex.py

        annotmethod (int) : Annotation resolving method. 0: molecule consensus aligned blocks. 1: per read per aligned base

        cluster (bool) : Run contigs in separate cluster jobs

        resolve_unproperly_paired_reads (bool) : When enabled bamtagmultiome will look through the complete bam file in a hunt for the mate, the two mates will always end up in 1 molecule if both present in the bam file. This also works when the is_proper_pair bit is not set. Use this option when you want to find the breakpoints of genomic re-arrangements.

        no_rejects (bool) : Do not write rejected reads

        mem (int) : Amount of gigabytes to request for cluster jobs

        time (int) : amount of wall clock hours to request for cluster jobs

        exons (str) : Path to exon annotation GTF file

        introns (str) : Path to intron annotation GTF file

        consensus (bool) : Calculate molecule consensus read, this feature is _VERY_ experimental

        consensus_model (str) : Path to consensus calling model, when none specified, this is learned based on the supplied bam file, ignoring sites supplied by -consensus_mask_variants

        consensus_mask_variants (str) : Path VCF file masked for training on consensus caller

        consensus_n_train (int) : Amount of bases used for training the consensus model

        no_source_reads (bool) : Do not write original reads, only consensus

        scartrace_r1_primers (str) : comma separated list of R1 primers used in scartrace protocol

    Raises:
        ValueError : when the output path does not end in .bam, the method or
            query flagger is unknown, the exon GTF is missing for a
            transcriptome method, or region_start is given without region_end
        AssertionError : when the selected method requires a reference which
            is not available
    """

    # Pseudo contig name which selects all contigs which are not a main
    # chromosome
    MISC_ALT_CONTIGS_SCMO = 'MISC_ALT_CONTIGS_SCMO'
    every_fragment_as_molecule = args.every_fragment_as_molecule
    skip_contig = set(args.skip_contig.split(',')) if args.skip_contig is not None else set()

    if not args.o.endswith('.bam'):
        raise ValueError(
            "Supply an output which ends in .bam, for example -o output.bam")

    write_status(args.o, 'unfinished')

    # Verify whether the input file is indexed and sorted...
    if not args.ignore_bam_issues:
        verify_and_fix_bam(args.bamin)

    # Remove output left behind by a previous run
    for remove_existing_path in [args.o, f'{args.o}.bai']:
        if os.path.exists(remove_existing_path):
            print(f"Removing existing file {remove_existing_path}")
            os.remove(remove_existing_path)

    input_bam = pysam.AlignmentFile(args.bamin, "rb", ignore_truncation=args.ignore_bam_issues, threads=4)

    # autodetect reference:
    reference = None
    if args.ref is None:
        args.ref = get_reference_from_pysam_alignmentFile(input_bam)

    if args.ref is not None:
        try:
            reference = CachedFasta(
                pysam.FastaFile(args.ref))
            print(f'Loaded reference from {args.ref}')
        except Exception:
            # Deliberately best effort: methods which require the reference
            # verify its availability further down
            print("Error when loading the reference file, continuing without a reference")
            reference = None

    ##### Define fragment and molecule class arguments and instances: ####

    queryNameFlagger = None
    if args.qflagger is not None:
        if args.qflagger == 'custom_flags':
            queryNameFlagger = CustomAssingmentQueryNameFlagger(
                args.custom_flags.split(','))
        else:
            raise ValueError("Select from 'custom_flags, ..' ")

    molecule_class_args = {
        'umi_hamming_distance': args.umi_hamming_distance,
        'reference': reference
    }

    fragment_class_args = {
        'read_group_format': args.read_group_format
    }
    yield_invalid = True  # if invalid reads should be written
    yield_overflow = True  # if overflow reads should be written

    if args.max_fragment_size is not None:
        fragment_class_args['max_fragment_size'] = args.max_fragment_size

    if args.no_rejects:
        yield_invalid = False

    if args.no_overflow:
        yield_overflow = False

    ignore_conversions = None
    if args.method == 'nla_taps' or args.method == 'chic_taps':
        ignore_conversions = {('C', 'T'), ('G', 'A')}

    if args.alleles is not None:
        molecule_class_args['allele_resolver'] = singlecellmultiomics.alleleTools.AlleleResolver(
            args.alleles,
            select_samples=args.allele_samples.split(',') if args.allele_samples is not None else None,
            lazyLoad=True,
            use_cache=args.use_allele_cache,
            verbose=args.set_allele_resolver_verbose,
            ignore_conversions=ignore_conversions)

    if args.mapfile is not None:
        molecule_class_args['mapability_reader'] = MapabilityReader(
            args.mapfile)

    ### Transcriptome configuration ###
    if args.method in ('nla_transcriptome', 'cs', 'vasa'):
        print(
            colorama.Style.BRIGHT +
            'Running in transcriptome annotation mode' +
            colorama.Style.RESET_ALL)
        # The exon annotation is mandatory, the intron annotation is optional
        if args.exons is None:
            raise ValueError("Supply an exon GTF file")

        transcriptome_features = singlecellmultiomics.features.FeatureContainer()
        print("Loading exons", end='\r')
        transcriptome_features.loadGTF(
            args.exons,
            select_feature_type=['exon'],
            identifierFields=(
                'exon_id',
                'gene_id'),
            store_all=True,
            contig=args.contig,
            head=None)

        if args.introns is not None:
            print("Loading introns", end='\r')
            transcriptome_features.loadGTF(
                args.introns,
                select_feature_type=['intron'],
                identifierFields=['transcript_id'],
                store_all=True,
                contig=args.contig,
                head=None)
        print("All features loaded")

        # Add more molecule class arguments
        molecule_class_args.update({
            'features': transcriptome_features,
            'auto_set_intron_exon_features': True
        })

    ### Method specific configuration ###
    if args.method == 'qflag':
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.Fragment
        # Write all reads
        yield_invalid = True

    elif args.method == 'chic':
        moleculeClass = singlecellmultiomics.molecule.CHICMolecule
        fragmentClass = singlecellmultiomics.fragment.CHICFragment

    elif args.method == 'nla' or args.method == 'nla_no_overhang':
        moleculeClass = singlecellmultiomics.molecule.NlaIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        if args.method == 'nla_no_overhang':
            assert reference is not None, 'Supply a reference fasta using -ref!'
            fragment_class_args.update({
                'reference': reference,
                'no_overhang': True
            })

    elif args.method == 'chic_nla':
        moleculeClass = singlecellmultiomics.molecule.CHICNLAMolecule
        fragmentClass = singlecellmultiomics.fragment.CHICFragment
        assert reference is not None, 'Supply a reference fasta using -ref!'
        molecule_class_args.update({
            'reference': reference,
        })

    elif args.method == 'cs_feature_counts':
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsSingleEndFragment

    elif args.method == 'fl_feature_counts':
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsFullLengthFragment

    elif args.method == 'episeq':
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsSingleEndFragment

    elif args.method == 'nla_transcriptome':
        moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        molecule_class_args.update({
            'pooling_method': 1,  # all data from the same cell can be dealt with separately
            'stranded': None  # data is not stranded
        })

    elif args.method == 'nla_taps':
        moleculeClass = singlecellmultiomics.molecule.TAPSNlaIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        molecule_class_args.update({
            'reference': reference,
            'taps': singlecellmultiomics.molecule.TAPS(reference=reference)
        })

    elif args.method == 'chic_taps':
        molecule_class_args.update({
            'reference': reference,
            'taps': singlecellmultiomics.molecule.TAPS(reference=reference)
        })
        moleculeClass = singlecellmultiomics.molecule.TAPSCHICMolecule
        fragmentClass = singlecellmultiomics.fragment.CHICFragment

    elif args.method == 'vasa' or args.method == 'cs':
        moleculeClass = singlecellmultiomics.molecule.VASA
        fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript

        molecule_class_args.update({
            'pooling_method': 1,  # all data from the same cell can be dealt with separately
            'stranded': 1  # data is stranded
        })

    elif args.method == 'scartrace':
        moleculeClass = singlecellmultiomics.molecule.ScarTraceMolecule
        fragmentClass = singlecellmultiomics.fragment.ScarTraceFragment

        r1_primers = args.scartrace_r1_primers.split(',')
        fragment_class_args.update({
            'scartrace_r1_primers': r1_primers,
            # 'reference': reference
        })

    else:
        raise ValueError("Supply a valid method")

    # Allow or disallow cycle shift:
    if args.allow_cycle_shift and fragmentClass is singlecellmultiomics.fragment.NLAIIIFragment:
        fragment_class_args['allow_cycle_shift'] = True

    # This disables umi_cigar_processing:
    if args.no_umi_cigar_processing:
        fragment_class_args['no_umi_cigar_processing'] = True

    if args.max_associated_fragments is not None:
        molecule_class_args['max_associated_fragments'] = args.max_associated_fragments

    # This decides what molecules we will traverse
    if args.contig == MISC_ALT_CONTIGS_SCMO:
        contig = None
    else:
        contig = args.contig

    # This decides to only extract a single genomic region:
    if args.region_start is not None:
        if args.region_end is None:
            raise ValueError('When supplying -region_start then also supply -region_end')
        region_start = args.region_start
        region_end = args.region_end
    else:
        region_start = None
        region_end = None

    last_update = datetime.now()
    init_time = datetime.now()
    # Handle of the (optional) statistics file, closed when writing finished
    stats_handle = None
    if args.molecule_iterator_verbosity_interval is not None and (args.molecule_iterator_verbose or (args.stats_file_path is not None)):

        if args.stats_file_path is not None:
            stats_handle = open(args.stats_file_path, 'w')

        def progress_callback_function(iteration, mol_iter, reads):
            """Report the progress at most once per verbosity interval."""
            nonlocal last_update

            now = datetime.now()
            diff = (now - last_update).total_seconds()
            if diff > args.molecule_iterator_verbosity_interval:

                diff_from_init = (now - init_time).total_seconds()
                # Report the location of the last available read
                _contig, _pos = None, None
                for read in reads:
                    if read is not None:
                        _contig, _pos = read.reference_name, read.reference_start

                if args.molecule_iterator_verbose:
                    handled_fragments = mol_iter.deleted_fragments + mol_iter.yielded_fragments
                    # Prevent a division by zero when nothing was handled yet
                    if handled_fragments:
                        deleted_percentage = (mol_iter.deleted_fragments / handled_fragments) * 100
                    else:
                        deleted_percentage = 0.0
                    print(
                        f'{mol_iter.yielded_fragments} fragments written, {mol_iter.deleted_fragments} fragments deleted ({deleted_percentage:.2f} %), current pos: {_contig}, {_pos}, {mol_iter.waiting_fragments} fragments waiting ',
                        end='\r')
                if stats_handle is not None:
                    stats_handle.write(f'{diff_from_init}\t{mol_iter.waiting_fragments}\t{mol_iter.yielded_fragments}\t{mol_iter.deleted_fragments}\t{_contig}\t{_pos}\n')
                    stats_handle.flush()
                last_update = now

    else:
        progress_callback_function = None

    molecule_iterator_args = {
        'alignments': input_bam,
        'queryNameFlagger': queryNameFlagger,
        'moleculeClass': moleculeClass,
        'fragmentClass': fragmentClass,
        'molecule_class_args': molecule_class_args,
        'fragment_class_args': fragment_class_args,
        'yield_invalid': yield_invalid,
        'yield_overflow': yield_overflow,
        'start': region_start,
        'end': region_end,
        'contig': contig,
        'every_fragment_as_molecule': every_fragment_as_molecule,
        'skip_contigs': skip_contig,
        'progress_callback_function': progress_callback_function
    }

    if args.resolve_unproperly_paired_reads:
        molecule_iterator_args['iterator_class'] = MatePairIteratorIncludingNonProper

    if args.contig == MISC_ALT_CONTIGS_SCMO:
        # When MISC_ALT_CONTIGS_SCMO is set as argument, all molecules with
        # reads mapping to a contig for which the is_main_chromosome function
        # returns False are used

        def Misc_contig_molecule_generator(molecule_iterator_args):
            for reference in input_bam.references:
                if not is_main_chromosome(reference):
                    molecule_iterator_args['contig'] = reference
                    yield from MoleculeIterator(**molecule_iterator_args)

        molecule_iterator = Misc_contig_molecule_generator(
            molecule_iterator_args)
    else:
        molecule_iterator = MoleculeIterator(**molecule_iterator_args)

    #####
    consensus_model_path = None

    if args.consensus:
        # Load from path if available:
        if args.consensus_model is not None:
            if os.path.exists(args.consensus_model):
                model_path = args.consensus_model
            else:
                model_path = pkg_resources.resource_filename(
                    'singlecellmultiomics', f'molecule/consensus_model/{args.consensus_model}')

            if model_path.endswith('.h5'):
                try:
                    from tensorflow.keras.models import load_model
                except ImportError:
                    print("Please install tensorflow")
                    raise
                consensus_model = load_model(model_path)

            else:
                # NOTE(review): unpickling executes arbitrary code, only load
                # consensus models from a trusted source
                with open(model_path, 'rb') as f:
                    consensus_model = pickle.load(f)
        else:
            skip_already_covered_bases = not args.consensus_allow_train_location_oversampling
            if args.consensus_mask_variants is None:
                mask_variants = None
            else:
                mask_variants = pysam.VariantFile(args.consensus_mask_variants)
            print("Fitting consensus model, this may take a long time")
            consensus_model = singlecellmultiomics.molecule.train_consensus_model(
                molecule_iterator,
                mask_variants=mask_variants,
                n_train=args.consensus_n_train,
                skip_already_covered_bases=skip_already_covered_bases
            )
            # Write the consensus model to disk
            consensus_model_path = os.path.abspath(
                os.path.dirname(args.o)) + '/consensus_model.pickle.gz'
            print(f'Writing consensus model to {consensus_model_path}')
            with open(consensus_model_path, 'wb') as f:
                pickle.dump(consensus_model, f)

    # We needed to check if every argument is properly placed. If so; the jobs
    # can be sent to the cluster

    if args.cluster:
        if args.contig is None:

            write_status(args.o, 'Submitting jobs. If this file remains, a job failed.')
            # Create jobs for all chromosomes:
            unique_id = str(uuid.uuid4())
            temp_prefix = os.path.abspath(os.path.dirname(
                args.o)) + '/SCMO_' + unique_id
            hold_merge = []

            ## Create folder to store cluster files:
            if args.clusterdir is None:
                cluster_file_folder = os.path.abspath(os.path.dirname(
                    args.o)) + '/cluster'
            else:
                cluster_file_folder = args.clusterdir
            print(f'Writing cluster scripts and standard out and error to {cluster_file_folder}')
            if not os.path.exists(cluster_file_folder):
                try:
                    os.makedirs(cluster_file_folder, exist_ok=True)
                except Exception as e:
                    # Best effort, the job submission reports the problem
                    print(e)

            found_alts = 0
            files_to_merge = []
            for ci, chrom in enumerate([_chrom for _chrom in
                                        (list(input_bam.references) + [MISC_ALT_CONTIGS_SCMO])
                                        if not _chrom in skip_contig]):

                # Alternative contigs do not get their own job, they are
                # handled together by the MISC_ALT_CONTIGS_SCMO job
                if not is_main_chromosome(chrom):
                    found_alts += 1
                    continue
                if chrom == MISC_ALT_CONTIGS_SCMO and found_alts == 0:
                    continue

                temp_bam_path = f'{temp_prefix}_{chrom}.bam'

                if os.path.exists(temp_bam_path):
                    print(f"Removing existing temporary file {temp_bam_path}")
                    os.remove(temp_bam_path)

                # Re-use the supplied command line, but replace the output
                # path and restrict the job to a single contig
                arguments = " ".join(
                    [x for x in sys.argv if not x == args.o and x != '-o']) + f" -contig {chrom} -o {temp_bam_path}"
                files_to_merge.append(temp_bam_path)
                if consensus_model_path is not None:
                    arguments += f' -consensus_model {consensus_model_path}'
                job = f'SCMULTIOMICS_{ci}_{unique_id}'
                write_status(temp_bam_path, 'SUBMITTED')
                job_id = submit_job(f'{arguments};', job_name=job, target_directory=cluster_file_folder,
                                    working_directory=None,
                                    threads_n=1, memory_gb=args.mem, time_h=args.time, scheduler=args.sched,
                                    copy_env=True, email=None, mail_when_finished=False,
                                    hold=None, submit=True)

                print(f'Job for contig {chrom} submitted with job id: {job_id}')
                hold_merge.append(job_id)

            # The merge job waits for the contig jobs, except when running
            # with the local scheduler
            hold = hold_merge
            job = f'SCMULTIOMICS_MERGE_{unique_id}'

            if args.sched == 'local':
                hold = None

            final_status = args.o.replace('.bam', '.status.txt')
            # Create list of output files
            command = f'samtools merge -@ 4 -c {args.o} {" ".join(files_to_merge)} && samtools index {args.o} && rm {temp_prefix}*.ba* && rm {temp_prefix}*.status.txt && echo "All done" > {final_status}'

            final_job_id = submit_job(f'{command};', job_name=job, target_directory=cluster_file_folder,
                                      working_directory=None,
                                      threads_n=4, memory_gb=10, time_h=args.time, scheduler=args.sched,
                                      copy_env=True, email=None, mail_when_finished=False,
                                      hold=hold, submit=True)
            print(f'final job id is:{final_job_id}')
            exit()

    #####
    # Load unphased variants to memory
    unphased_allele_resolver = None
    if args.unphased_alleles is not None:
        unphased_allele_resolver = singlecellmultiomics.alleleTools.AlleleResolver(
            use_cache=args.use_allele_cache,
            phased=False, ignore_conversions=ignore_conversions, verbose=args.set_allele_resolver_verbose)
        try:
            with pysam.VariantFile(args.unphased_alleles) as unphased_variants:
                for variant in unphased_variants.fetch(args.contig):
                    if 'PASS' not in list(variant.filter):
                        continue
                    # Only use bi-allelic single nucleotide variants
                    if not all(
                            len(allele) == 1 for allele in variant.alleles) or len(
                            variant.alleles) != 2:
                        continue
                    # NOTE(review): this requires at least two heterozygous
                    # samples, confirm that this is the intended threshold
                    if sum([len(set(variant.samples[sample].alleles))
                            == 2 for sample in variant.samples]) < 2:
                        # Not heterozygous
                        continue

                    unphased_allele_resolver.locationToAllele[variant.chrom][variant.pos - 1] = {
                        variant.alleles[0]: {'U'},
                        variant.alleles[1]: {'V'}}
        except Exception as e:  # todo catch this more nicely
            print(e)

    out_bam_path = args.o

    # Copy the header
    input_header = input_bam.header.as_dict()

    # Write provenance information to BAM header
    write_program_tag(
        input_header,
        program_name='bamtagmultiome',
        command_line=" ".join(
            sys.argv),
        version=singlecellmultiomics.__version__,
        description=f'SingleCellMultiOmics molecule processing, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')

    print(f'Started writing to {out_bam_path}')

    read_groups = dict()  # Store unique read groups in this dict
    with sorted_bam_file(out_bam_path, header=input_header, read_groups=read_groups) as out:
        try:
            for i, molecule in enumerate(molecule_iterator):

                # Stop when enough molecules are processed, exactly
                # args.head molecules are written
                if args.head is not None and i >= args.head:
                    break

                # set unique molecule identifier
                molecule.set_meta('mi', f'{molecule.get_a_reference_id()}_{i}')

                # Write tag values
                molecule.write_tags()

                if unphased_allele_resolver is not None:  # write unphased allele tag:
                    molecule.write_allele_phasing_information_tag(
                        unphased_allele_resolver, 'ua')

                # Update read groups
                for fragment in molecule:
                    rgid = fragment.get_read_group()
                    if not rgid in read_groups:
                        read_groups[rgid] = fragment.get_read_group(True)[1]

                # Calculate molecule consensus
                if args.consensus:
                    try:
                        consensus_reads = molecule.deduplicate_to_single_CIGAR_spaced(
                            out,
                            f'consensus_{molecule.get_a_reference_id()}_{i}',
                            consensus_model,
                            NUC_RADIUS=args.consensus_k_rad
                        )
                        for consensus_read in consensus_reads:
                            consensus_read.set_tag('RG', molecule[0].get_read_group())
                            consensus_read.set_tag('mi', i)
                            out.write(consensus_read)
                    except Exception:
                        # NOTE(review): the molecule is written here and again
                        # below when source reads are written, confirm that
                        # the double write is intended
                        molecule.set_rejection_reason('CONSENSUS_FAILED', set_qcfail=True)
                        molecule.write_pysam(out)

                # Write the reads to the output file
                if not args.no_source_reads:
                    molecule.write_pysam(out)
        except Exception as e:
            write_status(args.o, 'FAIL, The file is not complete')
            raise e
        finally:
            # No more progress is reported: release the statistics file
            if stats_handle is not None:
                stats_handle.close()

        # Reached the end of the generator
        write_status(args.o, 'Reached end. All ok!')
'-head', type=int, help= 'Amount of random sequences to count, when not specified all random primers are counted' ) argparser.add_argument('-min_mq', type=int, default=50) argparser.add_argument('-o', type=str, default='./randomer_usage.pickle.gz', help='Output pickle/csv path') args = argparser.parse_args() with pysam.AlignmentFile(args.bamfile) as alignments: molecule_source = MoleculeIterator( alignments, molecule_class=NlaIIIMolecule, fragment_class=NlaIIIFragment, ) qf = get_random_primer_histogram(molecule_source, args.min_mq, args.max_size, args.size_bin_size, head=args.head) print('Writing dataframe to disk') if args.o.endswith('csv') or args.o.endswith('csv.gz'): qf.to_csv(args.o) else: qf.to_pickle(args.o) print('All done')
def obtain_conversions(contig: str):
    """ Create conversion dictionary for the supplied contig

    All molecules of the contig are iterated, the differences between the
    molecule consensus and the reference are counted per library and every
    molecule is written to a temporary tagged bam file. The tags written per
    molecule are '4S' (amount of T>C conversions), '4c' (amount of sites
    where such a conversion could be observed) and 'YC' (a colour which
    encodes the conversion rate).

    The analysis can be interrupted (CTRL+C), in which case the results
    obtained so far are returned.

    Args:
        contig (str) : contig to process

    Returns:
        conversions_per_library (defaultdict( conversion_dict_stranded ) ) : Per library conversion dictionary
        n_molecules_per_library (Counter) : observed molecules per library
        contig(str) : the contig passed to the method
        temp_bam_path(str) : path to tagged bam file, tagged with gene annotations and 4su mutation count
    """

    conversions_per_library = defaultdict(conversion_dict_stranded)
    n_molecules_per_library = Counter()

    from singlecellmultiomics.molecule import might_be_variant

    # Create temp directory to write tagged bam file to:
    temp_dir = args.temp_dir
    temp_bam_path = f'{temp_dir}/{contig}.bam'
    # exist_ok: another worker may create the folder at the same moment
    os.makedirs(temp_dir, exist_ok=True)

    # Load gene annotations for the selected contig:
    transcriptome_features = FeatureContainer()
    transcriptome_features.loadGTF(
        path=exons_gtf_path,
        select_feature_type=['exon'],
        identifierFields=(
            'exon_id',
            'gene_id'),
        store_all=True,
        contig=contig,
        head=None)

    transcriptome_features.loadGTF(
        path=introns_gtf_path,
        select_feature_type=['intron'],
        identifierFields=['transcript_id'],
        store_all=True,
        contig=contig,
        head=None)

    # Colour used for molecules without any conversion context: black
    colormap = plt.get_cmap('RdYlBu_r')
    colormap.set_bad((0, 0, 0))

    read_groups = {}
    try:
        with pysam.AlignmentFile(single_cell_bam_path, threads=4) as alignments, \
                pysam.VariantFile(known_vcf_path) as known, \
                sorted_bam_file(temp_bam_path, origin_bam=single_cell_bam_path, read_groups=read_groups,
                                fast_compression=True) as out, \
                pysam.FastaFile(reference_path) as reference_handle:

            # Cache the sequence of the contig: (faster)
            reference = CachedFasta(reference_handle)

            for molecule in MoleculeIterator(alignments, TranscriptMolecule,
                                             SingleEndTranscriptFragment,
                                             fragment_class_args={
                                                 'stranded': True,
                                                 'features': transcriptome_features
                                             },
                                             molecule_class_args={
                                                 'reference': reference,
                                                 'features': transcriptome_features,
                                                 'auto_set_intron_exon_features': True
                                             },
                                             contig=contig):
                # Read out mut spectrum
                consensus = molecule.get_consensus()
                if args.R2_based:
                    molecule.strand = not molecule.strand  # Invert because it is R2 based.

                n_molecules_per_library[molecule.library] += 1

                n_4su_mutations = 0
                n_4su_contexts = 0

                for (chrom, pos), base in consensus.items():
                    # The reference base with one flanking base on each side
                    context = reference.fetch(chrom, pos - 1, pos + 2).upper()
                    if len(context) != 3:
                        continue

                    # Sites at which a conversion could be observed
                    if ((context[1] == 'A' and not molecule.strand) or (
                            context[1] == 'T' and molecule.strand)):
                        n_4su_contexts += 1

                    # Check if the base matches or the reference contains N's
                    if context[1] == base or 'N' in context:
                        continue

                    # Ignore germline variants:
                    if might_be_variant(chrom, pos, known):
                        continue

                    if not molecule.strand:  # reverse template
                        context = reverse_complement(context)
                        base = complement(base)

                    # Count 4SU specific mutations, and write to molecule later
                    if context[1] == 'T' and base == 'C':
                        n_4su_mutations += 1

                    conversions_per_library[molecule.library][(context, base)] += 1

                # Write 4su modification to molecule
                molecule.set_meta('4S', n_4su_mutations)
                molecule.set_meta('4c', n_4su_contexts)

                # Set read color based on conversion rate:
                if n_4su_contexts > 0:
                    # The max color value will be 10% modification rate
                    cfloat = colormap(
                        np.clip(10 * (n_4su_mutations / n_4su_contexts), 0, 1))[:3]
                else:
                    # No rate can be calculated, use the "bad" colour
                    cfloat = colormap._rgba_bad[:3]
                molecule.set_meta(
                    'YC', '%s,%s,%s' % tuple(int(x * 255) for x in cfloat))

                molecule.write_tags()

                # Update the read groups which are written to the header
                for fragment in molecule:
                    rgid = fragment.get_read_group()
                    if rgid not in read_groups:
                        read_groups[rgid] = fragment.get_read_group(True)[1]

                # Write tagged molecule to output file
                molecule.write_pysam(out)

    except KeyboardInterrupt:
        # This allows you to cancel the analysis (CTRL+C) and get the current result
        pass

    return conversions_per_library, n_molecules_per_library, contig, temp_bam_path