def count_transcripts(cargs):
    """Count gene, exon, intron and junction hits per cell for one contig.

    Every valid molecule is annotated and contributes a total weight of one
    to the gene table, split evenly over the genes it hits. The same
    per-gene weight is added for each intron, exon and junction the molecule
    overlaps. When an allele resolver is configured, every feature key is
    prefixed with ``<allele>_`` (``noAllele`` if the molecule has none).

    Args:
        cargs: ``(args, contig)`` tuple (a single argument, so the function
            can be mapped over a list of jobs). ``args`` is the parsed
            command line namespace; the attributes read here are ``method``,
            ``alleles``, ``loadAllelesToMem``, ``contigmapping``,
            ``gtfexon``, ``gtfintron``, ``hf``, ``producebam``, ``o``,
            ``alignmentfiles``, ``ref``, ``minmq``, ``umi_hamming_distance``,
            ``annotmethod`` and ``head``. ``contig`` is the contig to
            process.

    Returns:
        tuple: ``(gene_set, sample_set, gene_counts_per_cell,
        junction_counts_per_cell, exon_counts_per_cell,
        intron_counts_per_cell, annotated_molecules, read_molecules,
        contig)``. The ``*_counts_per_cell`` values are
        ``defaultdict(Counter)`` keyed cell -> feature -> count.
        ``read_molecules`` is the number of molecules obtained from the
        iterator, ``annotated_molecules`` the number that were valid and hit
        at least one gene.

    Raises:
        ValueError: if ``args.method`` is not ``'nla'``, ``'vasa'`` or
            ``'cs'``.
    """
    args, contig = cargs

    # What is used for assignment of molecules? Validated first so that an
    # unsupported method fails before any annotation file is loaded.
    if args.method == 'nla':
        moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment
        pooling_method = 1  # all data from the same cell can be dealt with separately
        stranded = None  # data is not stranded
    elif args.method in ('vasa', 'cs'):
        moleculeClass = singlecellmultiomics.molecule.VASA
        fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript
        pooling_method = 1
        stranded = 1  # data is stranded, mapping to other strand
    else:
        raise ValueError("Supply a valid method")

    allele_resolver = None
    if args.alleles is not None:
        allele_resolver = alleleTools.AlleleResolver(
            args.alleles, lazyLoad=(not args.loadAllelesToMem))

    contig_mapping = None
    if args.contigmapping == 'danio':
        # Chromosomes '1'..'25' map onto the consecutive accessions
        # CM002885.2 .. CM002909.2.
        contig_mapping = {
            str(chrom): f'CM{2884 + chrom:06d}.2' for chrom in range(1, 26)}

    # Load features. The mapping used to be reset to None right here, which
    # silently disabled the contigmapping option.
    features = singlecellmultiomics.features.FeatureContainer()
    if contig_mapping is not None:
        features.remapKeys = contig_mapping
    features.loadGTF(
        args.gtfexon,
        select_feature_type=['exon'],
        identifierFields=(
            'exon_id',
            'transcript_id'),
        store_all=True,
        head=args.hf,
        contig=contig)
    features.loadGTF(
        args.gtfintron,
        select_feature_type=['intron'],
        identifierFields=['transcript_id'],
        store_all=True,
        head=args.hf,
        contig=contig)

    # COUNT: cell->feature->umiCount
    exon_counts_per_cell = collections.defaultdict(collections.Counter)
    intron_counts_per_cell = collections.defaultdict(collections.Counter)
    junction_counts_per_cell = collections.defaultdict(collections.Counter)
    gene_counts_per_cell = collections.defaultdict(collections.Counter)

    gene_set = set()
    sample_set = set()
    annotated_molecules = 0
    read_molecules = 0

    bam_path_produced = None
    output_bam = None
    fasta_handle = None
    try:
        if args.producebam:
            bam_path_produced = f'{args.o}/output_bam_{contig}.unsorted.bam'
            # The header of the first input file is reused for the output.
            with pysam.AlignmentFile(args.alignmentfiles[0]) as alignments:
                output_bam = pysam.AlignmentFile(
                    bam_path_produced, "wb", header=alignments.header)

        ref = None
        if args.ref is not None:
            fasta_handle = pysam.FastaFile(args.ref)
            ref = pysamiterators.iterators.CachedFasta(fasta_handle)

        for alignmentfile_path in args.alignmentfiles:
            # Number of molecules obtained from this file; stays 0 when the
            # iterator yields nothing.
            molecules_seen = 0
            with pysam.AlignmentFile(alignmentfile_path) as alignments:
                molecule_iterator = MoleculeIterator(
                    alignments=alignments,
                    check_eject_every=5000,
                    moleculeClass=moleculeClass,
                    molecule_class_args={
                        'features': features,
                        'stranded': stranded,
                        'min_max_mapping_quality': args.minmq,
                        'reference': ref,
                        'allele_resolver': allele_resolver
                    },
                    fragmentClass=fragmentClass,
                    fragment_class_args={
                        'umi_hamming_distance': args.umi_hamming_distance,
                        'R1_primer_length': 4,
                        'R2_primer_length': 6},
                    # when the reads have not been tagged yet, this flag is
                    # very much required
                    perform_qflag=True,
                    pooling_method=pooling_method,
                    contig=contig
                )

                for molecules_seen, molecule in enumerate(
                        molecule_iterator, start=1):
                    if not molecule.is_valid():
                        # Invalid molecules are still written out, but are
                        # not counted.
                        if output_bam is not None:
                            molecule.write_tags()
                            molecule.write_pysam(output_bam)
                        continue

                    molecule.annotate(args.annotmethod)
                    molecule.set_intron_exon_features()

                    if output_bam is not None:
                        molecule.write_tags()
                        molecule.write_pysam(output_bam)

                    allele = None
                    if allele_resolver is not None:
                        allele = molecule.allele
                        if allele is None:
                            allele = 'noAllele'

                    total_count_for_molecule = len(molecule.genes)
                    if total_count_for_molecule == 0:
                        continue  # we didn't find any gene counts

                    # Distribute the count over the amount of gene hits:
                    count_to_add = 1 / total_count_for_molecule
                    sample = molecule.sample
                    sample_set.add(molecule.get_sample())

                    # Gene, intron, exon and splice junction tallies:
                    for counts_per_cell, feature_ids in (
                            (gene_counts_per_cell, molecule.genes),
                            (intron_counts_per_cell, molecule.introns),
                            (exon_counts_per_cell, molecule.exons),
                            (junction_counts_per_cell, molecule.junctions)):
                        _add_feature_counts(
                            counts_per_cell, sample, feature_ids, allele,
                            count_to_add, gene_set)

                    annotated_molecules += 1
                    # The limit applies per alignment file and is only
                    # checked after a counted molecule.
                    if args.head and molecules_seen >= args.head:
                        print(
                            f"-head was supplied, {molecules_seen} molecules discovered, stopping")
                        break

            read_molecules += molecules_seen
    finally:
        # Release the handles even when iteration fails half-way.
        if output_bam is not None:
            output_bam.close()
        if fasta_handle is not None:
            fasta_handle.close()

    if bam_path_produced is not None:
        # Built directly rather than by replacing '.unsorted', which would
        # also rewrite a matching substring in the output directory.
        final_bam_path = f'{args.o}/output_bam_{contig}.bam'
        sort_and_index(bam_path_produced, final_bam_path, remove_unsorted=True)

    return (
        gene_set,
        sample_set,
        gene_counts_per_cell,
        junction_counts_per_cell,
        exon_counts_per_cell,
        intron_counts_per_cell,
        annotated_molecules,
        read_molecules,
        contig
    )


def _add_feature_counts(counts_per_cell, sample, feature_ids, allele,
                        count_to_add, seen_ids):
    """Add ``count_to_add`` to ``counts_per_cell[sample]`` for each feature.

    Keys are prefixed with ``<allele>_`` when ``allele`` is not None, and
    every key used is recorded in ``seen_ids``. No entry is created for
    ``sample`` when ``feature_ids`` is empty.
    """
    for feature_id in feature_ids:
        key = feature_id if allele is None else f'{allele}_{feature_id}'
        counts_per_cell[sample][key] += count_to_add
        seen_ids.add(key)
# Example #2
0
    try:
        for i, molecule in enumerate(
                singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=alignments,
                    moleculeClass=moleculeClass,
                    yield_invalid=(output is not None),
                    fragmentClass=fragmentClass,
                    fragment_class_args={'umi_hamming_distance': 1},
                    molecule_class_args=molecule_class_args,
                    contig=args.contig)):

            if args.head and (i - 1) >= args.head:
                break

            if not molecule.is_valid(set_rejection_reasons=True):
                if output is not None:
                    molecule.write_pysam(output)
                continue

            # Skip sample if not selected
            if samples is not None and molecule.sample not in samples:
                molecule.set_rejection_reason('sample_not_selected')
                if output is not None:
                    molecule.write_pysam(output)
                continue

            for (chromosome,
                 location), call in molecule.methylation_call_dict.items():
                if call['context'] == '.':  # Only print calls concerning C's
                    continue
            statistics['Input']['molecules'] += 1
            statistics['Input']['fragments'] += len(molecule)

            # Set (chromosome) unique identifier
            molecule.set_meta('mi', f'NLA_{i}')
            if args.transcriptome:
                molecule.set_intron_exon_features()

            if samples is not None and molecule.sample not in samples:
                molecule.set_rejection_reason('sample_not_selected')
                if output is not None:
                    molecule.write_pysam(output)
                continue

            if args.transcriptome:
                if not molecule.is_valid():
                    if molecule.is_multimapped(
                    ) or molecule.get_max_mapping_qual() < args.min_mq:
                        molecule.set_meta('RF', 'rejected_molecule_mq')
                        molecule.write_tags()
                        molecule.write_pysam(output)
                        statistics['Filtering']['low mapping quality'] += 1
                        statistics['Filtering']['rejected'] += 1
                        continue

                    rejected_reads.append(molecule[0].reads)
                    continue
                statistics['Filtering'][f'valid {args.method} molecule'] += 1
                if len(molecule.junctions):
                    molecule.set_meta('RF', 'transcript_junction')
                    molecule.set_meta('dt', 'RNA')