# Numeric chromosome name -> GenBank accession, used for ``contigmapping == 'danio'``.
_DANIO_CONTIG_MAPPING = {
    '1': 'CM002885.2', '2': 'CM002886.2', '3': 'CM002887.2',
    '4': 'CM002888.2', '5': 'CM002889.2', '6': 'CM002890.2',
    '7': 'CM002891.2', '8': 'CM002892.2', '9': 'CM002893.2',
    '10': 'CM002894.2', '11': 'CM002895.2', '12': 'CM002896.2',
    '13': 'CM002897.2', '14': 'CM002898.2', '15': 'CM002899.2',
    '16': 'CM002900.2', '17': 'CM002901.2', '18': 'CM002902.2',
    '19': 'CM002903.2', '20': 'CM002904.2', '21': 'CM002905.2',
    '22': 'CM002906.2', '23': 'CM002907.2', '24': 'CM002908.2',
    '25': 'CM002909.2',
}


def _select_molecule_classes(method):
    """Return ``(moleculeClass, fragmentClass, pooling_method, stranded)``.

    Raises:
        ValueError: when *method* is not one of 'nla', 'vasa' or 'cs'.
    """
    if method == 'nla':
        return (
            singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule,
            singlecellmultiomics.fragment.NLAIIIFragment,
            1,     # all data from the same cell can be dealt with separately
            None,  # data is not stranded
        )
    if method in ('vasa', 'cs'):
        return (
            singlecellmultiomics.molecule.VASA,
            singlecellmultiomics.fragment.SingleEndTranscript,
            1,
            1,  # data is stranded, mapping to other strand
        )
    raise ValueError("Supply a valid method")


def _add_feature_counts(counts_per_cell, sample, feature_ids, allele,
                        count_to_add, gene_set):
    """Add *count_to_add* for every id in *feature_ids* to the sample's counter.

    Ids are prefixed with ``<allele>_`` when *allele* is not None, and every
    resulting key is also recorded in *gene_set*.
    """
    for feature_id in feature_ids:
        key = feature_id if allele is None else f'{allele}_{feature_id}'
        # Index inside the loop so a sample without any hit gets no empty
        # Counter entry in the defaultdict.
        counts_per_cell[sample][key] += count_to_add
        gene_set.add(key)


def count_transcripts(cargs):
    """Count gene, exon, intron and junction hits per cell for one contig.

    Args:
        cargs: ``(args, contig)`` tuple. ``args`` is the parsed command line
            namespace; the attributes read here are ``method``, ``alleles``,
            ``loadAllelesToMem``, ``contigmapping``, ``gtfexon``,
            ``gtfintron``, ``hf``, ``producebam``, ``o``, ``alignmentfiles``,
            ``ref``, ``minmq``, ``umi_hamming_distance``, ``annotmethod`` and
            ``head``. ``contig`` restricts both the GTF loading and the
            molecule iteration.

    Returns:
        Tuple ``(gene_set, sample_set, gene_counts_per_cell,
        junction_counts_per_cell, exon_counts_per_cell,
        intron_counts_per_cell, annotated_molecules, read_molecules,
        contig)``. The ``*_counts_per_cell`` values map
        sample -> feature key -> count. Each counted molecule contributes
        ``1 / len(molecule.genes)`` to every key it hits.

    Raises:
        ValueError: when ``args.method`` is not 'nla', 'vasa' or 'cs'.
    """
    args, contig = cargs

    # Validate the method before any expensive loading so a typo fails fast.
    moleculeClass, fragmentClass, pooling_method, stranded = \
        _select_molecule_classes(args.method)

    allele_resolver = None
    if args.alleles is not None:
        allele_resolver = alleleTools.AlleleResolver(
            args.alleles, lazyLoad=(not args.loadAllelesToMem))

    # Load features.
    # The mapping used to be reset to None right after it was built, which
    # made the 'danio' option a no-op; it is now actually handed to the
    # feature container. NOTE(review): confirm FeatureContainer.remapKeys
    # expects a {gtf_name: bam_name} dict.
    features = singlecellmultiomics.features.FeatureContainer()
    if args.contigmapping == 'danio':
        features.remapKeys = dict(_DANIO_CONTIG_MAPPING)
    features.loadGTF(
        args.gtfexon,
        select_feature_type=['exon'],
        identifierFields=('exon_id', 'transcript_id'),
        store_all=True,
        head=args.hf,
        contig=contig)
    features.loadGTF(
        args.gtfintron,
        select_feature_type=['intron'],
        identifierFields=['transcript_id'],
        store_all=True,
        head=args.hf,
        contig=contig)

    # COUNT: every table is cell -> feature key -> (fractional) umi count
    exon_counts_per_cell = collections.defaultdict(collections.Counter)
    intron_counts_per_cell = collections.defaultdict(collections.Counter)
    junction_counts_per_cell = collections.defaultdict(collections.Counter)
    gene_counts_per_cell = collections.defaultdict(collections.Counter)

    gene_set = set()
    sample_set = set()
    annotated_molecules = 0
    read_molecules = 0

    output_bam = None
    bam_path_produced = None
    if args.producebam:
        bam_path_produced = f'{args.o}/output_bam_{contig}.unsorted.bam'
        # The header is copied from the first alignment file.
        with pysam.AlignmentFile(args.alignmentfiles[0]) as alignments:
            output_bam = pysam.AlignmentFile(
                bam_path_produced, "wb", header=alignments.header)

    try:
        ref = None
        if args.ref is not None:
            ref = pysamiterators.iterators.CachedFasta(
                pysam.FastaFile(args.ref))

        for alignmentfile_path in args.alignmentfiles:
            # Stays 0 when the iterator yields nothing for this file.
            molecules_in_file = 0
            with pysam.AlignmentFile(alignmentfile_path) as alignments:
                molecule_iterator = MoleculeIterator(
                    alignments=alignments,
                    check_eject_every=5000,
                    moleculeClass=moleculeClass,
                    molecule_class_args={
                        'features': features,
                        'stranded': stranded,
                        'min_max_mapping_quality': args.minmq,
                        'reference': ref,
                        'allele_resolver': allele_resolver
                    },
                    fragmentClass=fragmentClass,
                    fragment_class_args={
                        'umi_hamming_distance': args.umi_hamming_distance,
                        'R1_primer_length': 4,
                        'R2_primer_length': 6},
                    # when the reads have not been tagged yet, this flag is
                    # very much required
                    perform_qflag=True,
                    pooling_method=pooling_method,
                    contig=contig
                )

                for molecules_in_file, molecule in enumerate(
                        molecule_iterator, start=1):
                    if not molecule.is_valid():
                        # Invalid molecules are still written to the BAM, but
                        # they are never counted.
                        if output_bam is not None:
                            molecule.write_tags()
                            molecule.write_pysam(output_bam)
                        continue

                    molecule.annotate(args.annotmethod)
                    molecule.set_intron_exon_features()

                    if output_bam is not None:
                        molecule.write_tags()
                        molecule.write_pysam(output_bam)

                    # Keys are only allele-prefixed when a resolver is used.
                    allele = None
                    if allele_resolver is not None:
                        allele = molecule.allele
                        if allele is None:
                            allele = 'noAllele'

                    total_count_for_molecule = len(molecule.genes)
                    if total_count_for_molecule == 0:
                        continue  # we didn't find any gene counts

                    # Distribute the count over the amount of gene hits so
                    # the gene counts of one molecule sum to 1.
                    count_to_add = 1 / total_count_for_molecule
                    sample = molecule.sample

                    _add_feature_counts(
                        gene_counts_per_cell, sample, molecule.genes,
                        allele, count_to_add, gene_set)
                    sample_set.add(molecule.get_sample())

                    # Introns / exons / splice junctions use the same weight.
                    _add_feature_counts(
                        intron_counts_per_cell, sample, molecule.introns,
                        allele, count_to_add, gene_set)
                    _add_feature_counts(
                        exon_counts_per_cell, sample, molecule.exons,
                        allele, count_to_add, gene_set)
                    _add_feature_counts(
                        junction_counts_per_cell, sample, molecule.junctions,
                        allele, count_to_add, gene_set)

                    annotated_molecules += 1
                    if args.head and molecules_in_file > args.head:
                        print(
                            f"-head was supplied, {molecules_in_file - 1} molecules discovered, stopping")
                        break

            # Number of molecules pulled from the iterator (the 0-based index
            # used before undercounted every non-empty file by one).
            read_molecules += molecules_in_file
    finally:
        # Close the writer even when iteration fails, so the handle is
        # not leaked.
        if output_bam is not None:
            output_bam.close()

    if output_bam is not None:
        # Built directly rather than via str.replace, so an output directory
        # containing '.unsorted' is not mangled.
        final_bam_path = f'{args.o}/output_bam_{contig}.bam'
        sort_and_index(bam_path_produced, final_bam_path, remove_unsorted=True)

    return (
        gene_set,
        sample_set,
        gene_counts_per_cell,
        junction_counts_per_cell,
        exon_counts_per_cell,
        intron_counts_per_cell,
        annotated_molecules,
        read_molecules,
        contig
    )
try: for i, molecule in enumerate( singlecellmultiomics.molecule.MoleculeIterator( alignments=alignments, moleculeClass=moleculeClass, yield_invalid=(output is not None), fragmentClass=fragmentClass, fragment_class_args={'umi_hamming_distance': 1}, molecule_class_args=molecule_class_args, contig=args.contig)): if args.head and (i - 1) >= args.head: break if not molecule.is_valid(set_rejection_reasons=True): if output is not None: molecule.write_pysam(output) continue # Skip sample if not selected if samples is not None and molecule.sample not in samples: molecule.set_rejection_reason('sample_not_selected') if output is not None: molecule.write_pysam(output) continue for (chromosome, location), call in molecule.methylation_call_dict.items(): if call['context'] == '.': # Only print calls concerning C's continue
statistics['Input']['molecules'] += 1 statistics['Input']['fragments'] += len(molecule) # Set (chromosome) unique identifier molecule.set_meta('mi', f'NLA_{i}') if args.transcriptome: molecule.set_intron_exon_features() if samples is not None and molecule.sample not in samples: molecule.set_rejection_reason('sample_not_selected') if output is not None: molecule.write_pysam(output) continue if args.transcriptome: if not molecule.is_valid(): if molecule.is_multimapped( ) or molecule.get_max_mapping_qual() < args.min_mq: molecule.set_meta('RF', 'rejected_molecule_mq') molecule.write_tags() molecule.write_pysam(output) statistics['Filtering']['low mapping quality'] += 1 statistics['Filtering']['rejected'] += 1 continue rejected_reads.append(molecule[0].reads) continue statistics['Filtering'][f'valid {args.method} molecule'] += 1 if len(molecule.junctions): molecule.set_meta('RF', 'transcript_junction') molecule.set_meta('dt', 'RNA')