def test_chic_cigar_dedup(self):
        """The CHIC test region must collapse into exactly one molecule."""
        with pysam.AlignmentFile('./data/chic_test_region.bam') as alignments:
            # Count every molecule the iterator yields for the test region
            molecule_count = sum(
                1 for _ in MoleculeIterator(alignments, CHICMolecule,
                                            CHICFragment))

        self.assertEqual(molecule_count, 1)
# Example #2
# 0
def _add_heterozygous_germline_variants(allele_resolver,
                                        germline_variants_path,
                                        germline_variants_sample,
                                        contig, start, end):
    """Register heterozygous germline variants of a window in allele_resolver.

    Arguments:
        allele_resolver : resolver whose ``locationToAllele`` mapping is filled

        germline_variants_path (str) : path to the germline variant file

        germline_variants_sample (str) : sample which has to be heterozygous,
            when None every sample in the record has to be heterozygous

        contig (str) : contig to fetch germline variants from

        start (int) : start coordinate of the window to fetch

        end (int) : end coordinate of the window to fetch
    """
    with pysam.VariantFile(germline_variants_path) as germline:
        for ar_variant in germline.fetch(contig, start, end):
            if germline_variants_sample is None:
                samples_to_check = ar_variant.samples
            else:
                samples_to_check = (germline_variants_sample,)

            # A sample is heterozygous when its genotype holds exactly two
            # distinct alleles. Skip the record if any sample is not.
            if any(len(set(ar_variant.samples[sample].alleles)) != 2
                   for sample in samples_to_check):
                continue

            allele_resolver.locationToAllele[
                ar_variant.chrom][ar_variant.pos - 1] = {
                    ar_variant.alleles[0]: {'U'},
                    ar_variant.alleles[1]: {'V'}
                }


def recall_variants(args):
    """Re-call the supplied variants per sample from molecules in a bam file.

    For every variant the molecules in a window around the variant position
    are iterated. A molecule carrying the first alternative allele yields a
    call of 1 for its sample. A molecule carrying the reference allele yields
    a call of 0 for its sample, but only when it is phased to a germline
    allele which is also (consistently) observed on alt-carrying molecules.
    A 0 call is written unconditionally and can therefore replace a 1 call of
    the same sample.

    Arguments:
        args (tuple) : packed arguments, in this order:

            variants : iterable of variant records; ``chrom``, ``contig``,
                ``pos``, ``ref`` and ``alts`` are read

            alignment_file_path (str) : bam file to extract molecules from

            target_path : not used by this function

            mode (str) : 'NLA' selects NlaIIIMolecule / NlaIIIFragment, any
                other value selects Molecule / Fragment

            germline_variants_path (str) : variant file with germline
                variants used for phasing, None to disable

            germline_variants_sample (str) : sample of the germline variant
                file which has to be heterozygous, None: all samples

            germline_bam_path (str) : bam file of germline reads; variants
                with a read supporting the alt allele in this file are skipped

            window_radius : ignored, see note in the body

            MAX_REF_MOLECULES : ignored, see note in the body

            max_buffer_size : passed on to the MoleculeIterator

    Returns:
        variant_calls (dict) : sample -> {(contig, pos, ref, alt) : 1 or 0}

        locations_done (set) : (contig, pos, ref, alt) of every variant which
            was processed completely (not skipped by the germline bam check
            or by a MemoryError)
    """
    (variants, alignment_file_path, target_path, mode, germline_variants_path,
     germline_variants_sample, germline_bam_path, window_radius,
     MAX_REF_MOLECULES, max_buffer_size) = args

    # NOTE(review): the window_radius and MAX_REF_MOLECULES values supplied by
    # the caller are overwritten here. Kept to preserve existing behaviour;
    # confirm whether the caller-supplied values should be honoured instead.
    window_radius = 600
    MAX_REF_MOLECULES = 1_000  # Maximum amount of reference molecules to process.
    # This is capped for regions to which many reads map (mapping artefact)

    variant_calls = dict()  # sample -> {variant_key: 1 (alt) or 0 (ref)}

    ### Set up molecule iterator (1/2)
    if mode == 'NLA':
        mc = NlaIIIMolecule
        fc = NlaIIIFragment
    else:
        mc = Molecule
        fc = Fragment

    ###
    locations_done = set()
    alignments = pysam.AlignmentFile(alignment_file_path, threads=4)
    germline_alignments = None
    try:
        if germline_bam_path is not None:
            germline_alignments = pysam.AlignmentFile(germline_bam_path,
                                                      threads=4)

        for variant in variants:

            # Check if the variant is present in the germline bam file (if supplied)
            if germline_alignments is not None and has_variant_reads(
                    germline_alignments,
                    variant.chrom,
                    variant.pos - 1,
                    variant.alts[0],
                    min_reads=1,
                    stepper='nofilter'):
                print(f'FOUND IN GERMLINE {variant}')
                continue

            reference_start = max(0, variant.pos - window_radius)
            reference_end = variant.pos + window_radius
            contig = variant.contig

            variant_key = (contig, variant.pos, variant.ref, variant.alts[0])

            ### Set up allele resolver
            unphased_allele_resolver = singlecellmultiomics.alleleTools.AlleleResolver(
                use_cache=False, phased=False, verbose=True)

            if germline_variants_path is not None:
                _add_heterozygous_germline_variants(
                    unphased_allele_resolver,
                    germline_variants_path,
                    germline_variants_sample,
                    variant.chrom,
                    reference_start,
                    reference_end)
            ####

            ref_phased = Counter()
            alt_phased = Counter()

            ### Set up molecule iterator (2/2)
            try:
                molecule_iter = MoleculeIterator(alignments,
                                                 mc,
                                                 fc,
                                                 contig=contig,
                                                 start=reference_start,
                                                 end=reference_end,
                                                 molecule_class_args={
                                                     'allele_resolver':
                                                     unphased_allele_resolver,
                                                     'max_associated_fragments':
                                                     20,
                                                 },
                                                 max_buffer_size=max_buffer_size)

                reference_called_molecules = []  # molecule, phase

                for molecule in molecule_iter:
                    base_call = get_molecule_base_calls(molecule, variant)
                    if base_call is None:
                        continue
                    base, quality = base_call

                    if base == variant.alts[0]:
                        call = 'A'
                        variant_calls.setdefault(molecule.sample,
                                                 {})[variant_key] = 1
                    elif base == variant.ref:
                        call = 'R'
                    else:
                        # Neither the reference nor the alternative allele
                        continue

                    # Obtain all germline variants which are phased :
                    phased = get_phased_variants(molecule,
                                                 unphased_allele_resolver)

                    if call == 'R' and len(phased) > 0:
                        # If we can phase the alternative allele to a germline variant
                        # the reference calls can indicate absence
                        if len(reference_called_molecules) < MAX_REF_MOLECULES:
                            reference_called_molecules.append(
                                (molecule, phased))

                    phase_counter = alt_phased if call == 'A' else ref_phased
                    for chrom, pos, phased_base in phased:
                        phase_counter[(chrom, pos, phased_base)] += 1
            except MemoryError:
                # Alt calls made before the buffer overflowed are kept, but
                # the location is not marked as done.
                print(f"Buffer exceeded for {variant.contig} {variant.pos}")
                continue

            if len(alt_phased) > 0 and len(reference_called_molecules):
                # Clean the alt_phased variants for variants which are not >90% the same
                alt_phased_filtered = filter_alt_calls(alt_phased, 0.9)
                for molecule, phased_gsnvs in reference_called_molecules:
                    # The molecule shares a germline allele with alt-carrying
                    # molecules but shows the reference base: call absence
                    if any(p in alt_phased_filtered for p in phased_gsnvs):
                        variant_calls.setdefault(molecule.sample,
                                                 {})[variant_key] = 0
            locations_done.add(variant_key)
    finally:
        # Release the file handles, also when an exception escapes
        alignments.close()
        if germline_alignments is not None:
            germline_alignments.close()
    return variant_calls, locations_done
 def Misc_contig_molecule_generator(molecule_iterator_args):
     """Yield the molecules of every contig of input_bam for which
     is_main_chromosome returns False.

     The 'contig' entry of molecule_iterator_args is overwritten for each
     of these contigs before a MoleculeIterator is created from the arguments.
     """
     misc_contigs = (
         name for name in input_bam.references
         if not is_main_chromosome(name))
     for name in misc_contigs:
         molecule_iterator_args['contig'] = name
         yield from MoleculeIterator(**molecule_iterator_args)
def run_multiome_tagging(args):
    """
    Run multiome tagging: assign the reads of a bam file to molecules, write
    the molecule information to the read tags and write the reads to a new
    sorted bam file.

    When `cluster` is set and no `contig` is selected, one job per contig is
    submitted together with a merge job, after which the process exits.

    Arguments (attributes of the supplied `args` object):

        bamin (str) : bam file to process

        o(str) : path to output bam file, has to end in .bam

        method(str): Protocol to tag, select from:nla, qflag, chic, nla_transcriptome, vasa, cs, nla_taps ,chic_taps, nla_no_overhang, scartrace

        qflagger(str): Query flagging algorithm to use, this algorithm extracts UMI and sample information from your reads. When no query flagging algorithm is specified, the `singlecellmultiomics.universalBamTagger.universalBamTagger.QueryNameFlagger` is used

        method(str) : Method name, what kind of molecules need to be extracted. Select from:
            nla (Data digested by Nla III enzyme)
            qflag (Only add basic tags like sample and UMI, no molecule assignment)
            chic (Data digested using mnase fusion)
            nla_transcriptome (Data with transcriptome and genome digested by Nla III )
            vasa (VASA transcriptomic data)
            cs (CELseq data, 1 and 2)
            cs_feature_counts (Single end, deduplicate using a bam file tagged using featurecounts, deduplicates a umi per gene)
            fl_feature_counts (deduplicate using a bam file tagged using featurecounts, deduplicates based on fragment position)
            nla_taps (Data digested by Nla III enzyme and methylation converted by TAPS)
            chic_taps (Data digested by mnase enzyme and methylation converted by TAPS)
            chic_nla
            scartrace  (lineage tracing protocol)


        custom_flags(str): Arguments passed to the query name flagger, comma separated "MI,RX,bi,SM"

        ref(str) : Path to reference fasta file, autodetected from bam header when not supplied

        umi_hamming_distance(int) : Max hamming distance on UMI's

        head (int) : Amount of molecules to process

        contig (str) : only process this contig

        region_start(int) : Zero based start coordinate of single region to process

        region_end(int) : Zero based end coordinate of single region to process, None: all contigs when contig is not set, complete contig when contig is set.

        alleles (str) : path to allele VCF

        allele_samples(str): Comma separated samples to extract from the VCF file. For example B6,SPRET

        unphased_alleles(str) : Path to VCF containing unphased germline SNPs

        mapfile (str) : Path to \*.safe.bgzf file, used to decide if molecules are uniquely mappable, generate one using createMapabilityIndex.py

        annotmethod (int) : Annotation resolving method. 0: molecule consensus aligned blocks. 1: per read per aligned base

        cluster (bool) : Run contigs in separate cluster jobs

        resolve_unproperly_paired_reads(bool) : When enabled bamtagmultiome will look through the complete bam file in a hunt for the mate, the two mates will always end up in 1 molecule if both present in the bam file. This also works when the is_proper_pair bit is not set. Use this option when you want to find the breakpoints of genomic re-arrangements.

        no_rejects(bool) : Do not write rejected reads

        mem (int) : Amount of gigabytes to request for cluster jobs

        time(int) : amount of wall clock hours to request for cluster jobs

        exons(str): Path to exon annotation GTF file

        introns(str): Path to intron annotation GTF file

        consensus(bool) : Calculate molecule consensus read, this feature is _VERY_ experimental

        consensus_model(str) : Path to consensus calling model, when none specified, this is learned based on the supplied bam file, ignoring sites supplied by -consensus_mask_variants

        consensus_mask_variants(str): Path VCF file masked for training on consensus caller

        consensus_n_train(int) : Amount of bases used for training the consensus model

        no_source_reads(bool) :  Do not write original reads, only consensus

        scartrace_r1_primers(str) : comma separated list of R1 primers used in scartrace protocol

    Raises:
        ValueError : when the output path does not end in .bam, when an unknown
            qflagger or method is supplied, when no exon GTF is supplied for a
            transcriptome method or when region_start is supplied without region_end

    """

    # Pseudo contig name which selects all contigs that are not a main chromosome
    MISC_ALT_CONTIGS_SCMO = 'MISC_ALT_CONTIGS_SCMO'
    every_fragment_as_molecule = args.every_fragment_as_molecule
    skip_contig = set(args.skip_contig.split(',')) if args.skip_contig is not None else set()


    if not args.o.endswith('.bam'):
        raise ValueError(
            "Supply an output which ends in .bam, for example -o output.bam")

    write_status(args.o,'unfinished')

    # Verify whether the input file is indexed and sorted...
    if not args.ignore_bam_issues:
        verify_and_fix_bam(args.bamin)

    # Remove a previous output bam file and its index
    for remove_existing_path in [args.o, f'{args.o}.bai']:
        if os.path.exists(remove_existing_path):
            print(f"Removing existing file {remove_existing_path}")
            os.remove(remove_existing_path)

    input_bam = pysam.AlignmentFile(args.bamin, "rb", ignore_truncation=args.ignore_bam_issues, threads=4)

    # autodetect reference:
    reference = None
    if args.ref is None:
        args.ref = get_reference_from_pysam_alignmentFile(input_bam)

    if args.ref is not None:
        try:
            reference = CachedFasta(
                pysam.FastaFile(args.ref))
            print(f'Loaded reference from {args.ref}')
        except Exception as e:
            # Best effort: methods which require a reference assert on it below
            print("Error when loading the reference file, continuing without a reference")
            reference = None

    ##### Define fragment and molecule class arguments and instances: ####

    queryNameFlagger = None
    if args.qflagger is not None:
        if args.qflagger == 'custom_flags':
            queryNameFlagger = CustomAssingmentQueryNameFlagger(
                args.custom_flags.split(','))
        else:
            raise ValueError("Select from 'custom_flags, ..' ")

    molecule_class_args = {
        'umi_hamming_distance': args.umi_hamming_distance,
        'reference': reference
    }

    fragment_class_args = {
        'read_group_format' : args.read_group_format

    }
    yield_invalid = True  # if invalid reads should be written
    yield_overflow = True  # if overflow reads should be written

    if args.max_fragment_size is not None:
        fragment_class_args['max_fragment_size'] = args.max_fragment_size

    if args.no_rejects:
        yield_invalid = False

    if args.no_overflow:
        yield_overflow = False


    # For the TAPS methods the C>T and G>A conversions are passed to the
    # allele resolvers as conversions to ignore
    ignore_conversions = None
    if args.method == 'nla_taps' or args.method == 'chic_taps':
        ignore_conversions = set([('C', 'T'), ('G', 'A')])

    if args.alleles is not None:
        molecule_class_args['allele_resolver'] = singlecellmultiomics.alleleTools.AlleleResolver(
            args.alleles,
            select_samples=args.allele_samples.split(',') if args.allele_samples is not None else None,
            lazyLoad=True,
            use_cache=args.use_allele_cache,
            verbose = args.set_allele_resolver_verbose,
            ignore_conversions=ignore_conversions)

    if args.mapfile is not None:
        molecule_class_args['mapability_reader'] = MapabilityReader(
            args.mapfile)

    ### Transcriptome configuration ###
    if args.method in ('nla_transcriptome', 'cs', 'vasa'):
        print(
            colorama.Style.BRIGHT +
            'Running in transcriptome annotation mode' +
            colorama.Style.RESET_ALL)
        if args.exons is None :
            raise ValueError("Supply an exon GTF file")

        # NOTE(review): this check can never trigger, a missing exon file
        # already raised above
        if args.introns is not None and args.exons is None:
            raise ValueError("Please supply both intron and exon GTF files")

        transcriptome_features = singlecellmultiomics.features.FeatureContainer()
        print("Loading exons", end='\r')
        transcriptome_features.loadGTF(
            args.exons,
            select_feature_type=['exon'],
            identifierFields=(
                'exon_id',
                'gene_id'),
            store_all=True,
            contig=args.contig,
            head=None)

        if args.introns is not None:
            print("Loading introns", end='\r')
            transcriptome_features.loadGTF(
                args.introns,
                select_feature_type=['intron'],
                identifierFields=['transcript_id'],
                store_all=True,
                contig=args.contig,
                head=None)
        print("All features loaded")

        # Add more molecule class arguments
        molecule_class_args.update({
            'features': transcriptome_features,
            'auto_set_intron_exon_features': True
        })

    ### Method specific configuration ###
    # Every branch selects the molecule and fragment class for the method and
    # extends the class arguments where required
    if args.method == 'qflag':
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.Fragment
        # Write all reads (this overrides -no_rejects)
        yield_invalid = True

    elif args.method == 'chic':
        moleculeClass = singlecellmultiomics.molecule.CHICMolecule
        fragmentClass = singlecellmultiomics.fragment.CHICFragment

    elif args.method == 'nla' or args.method == 'nla_no_overhang':
        moleculeClass = singlecellmultiomics.molecule.NlaIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        if args.method == 'nla_no_overhang':
            assert reference is not None, 'Supply a reference fasta using -ref!'
            fragment_class_args.update({
                    'reference': reference,
                    'no_overhang': True
                })

    elif args.method == 'chic_nla':
        moleculeClass=singlecellmultiomics.molecule.CHICNLAMolecule
        fragmentClass=singlecellmultiomics.fragment.CHICFragment
        assert reference is not None, 'Supply a reference fasta using -ref!'
        molecule_class_args.update({
                'reference': reference,
        })

    elif args.method == 'cs_feature_counts' :
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsSingleEndFragment

    elif args.method == 'fl_feature_counts':

        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsFullLengthFragment

    elif args.method == 'episeq' :
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsSingleEndFragment

    elif args.method == 'nla_transcriptome':
        moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        molecule_class_args.update({
            'pooling_method': 1,  # all data from the same cell can be dealt with separately
            'stranded': None  # data is not stranded
        })

    elif args.method == 'nla_taps':
        moleculeClass = singlecellmultiomics.molecule.TAPSNlaIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        molecule_class_args.update({
            'reference': reference,
            'taps': singlecellmultiomics.molecule.TAPS(reference=reference)
        })

    elif args.method == 'chic_taps':

        molecule_class_args.update({
            'reference': reference,
            'taps': singlecellmultiomics.molecule.TAPS(reference=reference)
        })
        moleculeClass = singlecellmultiomics.molecule.TAPSCHICMolecule
        fragmentClass = singlecellmultiomics.fragment.CHICFragment

    elif args.method == 'vasa' or args.method == 'cs':
        moleculeClass = singlecellmultiomics.molecule.VASA
        fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript

        molecule_class_args.update({
            'pooling_method': 1,  # all data from the same cell can be dealt with separately
            'stranded': 1  # data is stranded
        })

    elif args.method == 'scartrace':

        moleculeClass = singlecellmultiomics.molecule.ScarTraceMolecule
        fragmentClass = singlecellmultiomics.fragment.ScarTraceFragment

        r1_primers = args.scartrace_r1_primers.split(',')
        fragment_class_args.update({
                'scartrace_r1_primers': r1_primers,
                #'reference': reference
            })


    else:
        raise ValueError("Supply a valid method")

    # Allow or disallow cycle shift:
    if args.allow_cycle_shift and fragmentClass is singlecellmultiomics.fragment.NLAIIIFragment:
        fragment_class_args['allow_cycle_shift'] = True

    # This disables umi_cigar_processing:
    if args.no_umi_cigar_processing:
        fragment_class_args['no_umi_cigar_processing'] = True

    if args.max_associated_fragments is not None:
        molecule_class_args['max_associated_fragments'] = args.max_associated_fragments

    # This decides what molecules we will traverse
    # (for the pseudo contig the actual contigs are set one by one below)
    if args.contig == MISC_ALT_CONTIGS_SCMO:
        contig = None
    else:
        contig = args.contig

    # This decides to only extract a single genomic region:
    if args.region_start is not None:
        if args.region_end is None:
            raise ValueError('When supplying -region_start then also supply -region_end')
        region_start = args.region_start
        region_end = args.region_end
    else:
        region_start = None
        region_end = None



    last_update = datetime.now()
    init_time = datetime.now()
    # A progress callback is only created when an interval is set and the
    # progress is either printed or written to a statistics file
    if args.molecule_iterator_verbosity_interval is not None and (args.molecule_iterator_verbose or (args.stats_file_path is not None )):

        stats_handle = None
        if args.stats_file_path is not None:
            # NOTE(review): this handle is not closed anywhere in this function
            stats_handle = open(args.stats_file_path,'w')

        # Reports progress when more than molecule_iterator_verbosity_interval
        # seconds passed since the previous report
        def progress_callback_function( iteration, mol_iter, reads ):
            nonlocal last_update
            nonlocal init_time
            nonlocal stats_handle

            now = datetime.now()
            diff = (datetime.now()-last_update).total_seconds()
            if diff>args.molecule_iterator_verbosity_interval:

                diff_from_init = (datetime.now()-init_time).total_seconds()
                # The position of the last read which is not None is reported
                _contig, _pos = None, None
                for read in reads:
                    if read is not None:
                        _contig, _pos = read.reference_name, read.reference_start

                if args.molecule_iterator_verbose:
                    # NOTE(review): the percentage divides by deleted + yielded
                    # fragments, which raises ZeroDivisionError when both are 0
                    print( f'{mol_iter.yielded_fragments} fragments written, {mol_iter.deleted_fragments} fragments deleted ({(mol_iter.deleted_fragments/(mol_iter.deleted_fragments + mol_iter.yielded_fragments))*100:.2f} %), current pos: {_contig}, {_pos}, {mol_iter.waiting_fragments} fragments waiting             ' , end='\r')
                if stats_handle is not None:
                    # Tab separated columns: seconds since start, waiting,
                    # yielded and deleted fragments, contig, position
                    stats_handle.write(f'{diff_from_init}\t{mol_iter.waiting_fragments}\t{mol_iter.yielded_fragments}\t{mol_iter.deleted_fragments}\t{_contig}\t{_pos}\n')
                    stats_handle.flush()
                last_update = now

    else:
        progress_callback_function = None



    molecule_iterator_args = {
        'alignments': input_bam,
        'queryNameFlagger': queryNameFlagger,
        'moleculeClass': moleculeClass,
        'fragmentClass': fragmentClass,
        'molecule_class_args': molecule_class_args,
        'fragment_class_args': fragment_class_args,
        'yield_invalid': yield_invalid,
        'yield_overflow': yield_overflow,
        'start':region_start,
        'end':region_end,
        'contig': contig,
        'every_fragment_as_molecule': every_fragment_as_molecule,
        'skip_contigs':skip_contig,
        'progress_callback_function':progress_callback_function
    }

    if args.resolve_unproperly_paired_reads:
        molecule_iterator_args['iterator_class'] = MatePairIteratorIncludingNonProper

    if args.contig == MISC_ALT_CONTIGS_SCMO:
        # When MISC_ALT_CONTIGS_SCMO is set as argument, all molecules with reads
        # mapping to a contig for which the is_main_chromosome function
        # returns False are used

        def Misc_contig_molecule_generator(molecule_iterator_args):
            for reference in input_bam.references:
                if not is_main_chromosome(reference):
                    molecule_iterator_args['contig'] = reference
                    yield from MoleculeIterator(**molecule_iterator_args)

        molecule_iterator = Misc_contig_molecule_generator(
            molecule_iterator_args)
    else:
        molecule_iterator = MoleculeIterator(**molecule_iterator_args)

    #####
    # Only set when a consensus model is trained below; cluster jobs then
    # receive the path of the trained model
    consensus_model_path = None

    if args.consensus:
        # Load from path if available:

        if args.consensus_model is not None:
            # Use the supplied path when it exists, otherwise look the model
            # up in the models shipped with the package
            if os.path.exists(args.consensus_model):
                model_path = args.consensus_model
            else:
                model_path = pkg_resources.resource_filename(
                    'singlecellmultiomics', f'molecule/consensus_model/{args.consensus_model}')

            if model_path.endswith('.h5'):
                try:
                    from tensorflow.keras.models import load_model
                except ImportError:
                    print("Please install tensorflow")
                    raise
                consensus_model = load_model(model_path)

            else:
                # NOTE(review): unpickling executes arbitrary code, only load
                # model files from a trusted source
                with open(model_path, 'rb') as f:
                    consensus_model = pickle.load(f)
        else:
            skip_already_covered_bases = not args.consensus_allow_train_location_oversampling
            if args.consensus_mask_variants is None:
                mask_variants = None
            else:
                mask_variants = pysam.VariantFile(args.consensus_mask_variants)
            print("Fitting consensus model, this may take a long time")
            # NOTE(review): the same molecule_iterator object is iterated again
            # when writing the output below - confirm it is not exhausted by
            # the training
            consensus_model = singlecellmultiomics.molecule.train_consensus_model(
                molecule_iterator,
                mask_variants=mask_variants,
                n_train=args.consensus_n_train,
                skip_already_covered_bases=skip_already_covered_bases
                )
            # Write the consensus model to disk
            # NOTE(review): the file name ends in .gz but the pickle is
            # written uncompressed
            consensus_model_path = os.path.abspath(
                os.path.dirname(args.o)) + '/consensus_model.pickle.gz'
            print(f'Writing consensus model to {consensus_model_path}')
            with open(consensus_model_path, 'wb') as f:
                pickle.dump(consensus_model, f)

    # We needed to check if every argument is properly placed. If so; the jobs
    # can be sent to the cluster

    # With -cluster and a contig selected nothing is submitted here and the
    # contig is processed by this process further below
    if args.cluster:
        if args.contig is None:
            write_status(args.o,'Submitting jobs. If this file remains, a job failed.')
            # Create jobs for all chromosomes:
            unique_id = str(uuid.uuid4())
            temp_prefix = os.path.abspath(os.path.dirname(
                args.o)) + '/SCMO_' + unique_id
            hold_merge = []

            ## Create folder to store cluster files:
            if args.clusterdir is None:
                cluster_file_folder = os.path.abspath(os.path.dirname(
                    args.o)) + '/cluster'
            else:
                cluster_file_folder = args.clusterdir
            print(f'Writing cluster scripts and standard out and error to {cluster_file_folder}')
            if not os.path.exists(cluster_file_folder):
                try:
                    os.makedirs(cluster_file_folder,exist_ok=True)
                except Exception as e:
                    print(e)
                    pass

            found_alts = 0
            files_to_merge = []
            # The pseudo contig is appended after the contigs of the bam file,
            # contigs listed in skip_contig are left out
            for ci,chrom in enumerate([_chrom  for _chrom in
                        (list(input_bam.references) + [MISC_ALT_CONTIGS_SCMO])
                        if not _chrom in skip_contig]):

                # NOTE(review): the pseudo contig only reaches the second check
                # when is_main_chromosome returns True for it - confirm
                if not is_main_chromosome(chrom):
                    found_alts += 1
                    continue
                if chrom == MISC_ALT_CONTIGS_SCMO and found_alts == 0:
                    continue

                temp_bam_path = f'{temp_prefix}_{chrom}.bam'

                if os.path.exists(temp_bam_path):
                    print(f"Removing existing temporary file {temp_bam_path}")
                    os.remove(temp_bam_path)

                # Re-use the current command line, with the output path
                # replaced by the temporary bam file of this contig
                arguments = " ".join(
                    [x for x in sys.argv if not x == args.o and x != '-o']) + f" -contig {chrom} -o {temp_bam_path}"
                files_to_merge.append(temp_bam_path)
                if consensus_model_path is not None:
                    arguments += f' -consensus_model {consensus_model_path}'
                job = f'SCMULTIOMICS_{ci}_{unique_id}'
                write_status(temp_bam_path,'SUBMITTED')
                job_id = submit_job(f'{arguments};', job_name=job, target_directory=cluster_file_folder,  working_directory=None,
                               threads_n=1, memory_gb=args.mem, time_h=args.time, scheduler=args.sched, copy_env=True,
                               email=None, mail_when_finished=False, hold=None,submit=True)


                print(f'Job for contig {chrom} submitted with job id: {job_id}')
                hold_merge.append(job_id)

            # The merge job is held on the ids of all contig jobs
            hold = hold_merge

            job = f'SCMULTIOMICS_MERGE_{unique_id}'

            if args.sched == 'local':
                hold = None

            final_status = args.o.replace('.bam','.status.txt')
            # Create list of output files
            # The merge command also indexes the result and removes the
            # temporary bam and status files
            command = f'samtools merge -@ 4 -c {args.o} {" ".join(files_to_merge)} && samtools index {args.o} && rm {temp_prefix}*.ba* && rm {temp_prefix}*.status.txt && echo "All done" > {final_status}'

            final_job_id = submit_job(f'{command};', job_name=job, target_directory=cluster_file_folder,  working_directory=None,
                           threads_n=4, memory_gb=10, time_h=args.time, scheduler=args.sched, copy_env=True,
                           email=None, mail_when_finished=False, hold=hold,submit=True)
            print(f'final job id is:{final_job_id}')
            # All work is performed by the submitted jobs
            exit()

    #####
    # Load unphased variants to memory
    unphased_allele_resolver = None
    if args.unphased_alleles is not None:
        unphased_allele_resolver = singlecellmultiomics.alleleTools.AlleleResolver(
            use_cache=args.use_allele_cache,
            phased=False, ignore_conversions=ignore_conversions,verbose = args.set_allele_resolver_verbose)
        try:
            for i, variant in enumerate(
                pysam.VariantFile(
                    args.unphased_alleles).fetch(
                    args.contig)):
                if 'PASS' not in list(variant.filter):
                    continue
                # Only keep variants with exactly two alleles of a single base
                if not all(
                        len(allele) == 1 for allele in variant.alleles) or len(
                        variant.alleles) != 2:
                    continue
                # At least two samples have to carry two distinct alleles
                if sum([len(set(variant.samples[sample].alleles))
                        == 2 for sample in variant.samples]) < 2:
                    # Not heterozygous
                    continue

                unphased_allele_resolver.locationToAllele[variant.chrom][variant.pos - 1] = {
                    variant.alleles[0]: {'U'}, variant.alleles[1]: {'V'}}
        except Exception as e:  # todo catch this more nicely
            # Variants registered before the exception are kept
            print(e)
    out_bam_path = args.o

    # Copy the header
    input_header = input_bam.header.as_dict()

    # Write provenance information to BAM header
    write_program_tag(
        input_header,
        program_name='bamtagmultiome',
        command_line=" ".join(
            sys.argv),
        version=singlecellmultiomics.__version__,
        description=f'SingleCellMultiOmics molecule processing, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')

    print(f'Started writing to {out_bam_path}')

    read_groups = dict()  # Store unique read groups in this dict
    with sorted_bam_file(out_bam_path, header=input_header, read_groups=read_groups) as out:

        try:
            for i, molecule in enumerate(molecule_iterator):

                # Stop when enough molecules are processed
                # NOTE(review): this condition first holds for i == head + 1,
                # so head + 1 molecules are written
                if args.head is not None and (i - 1) >= args.head:
                    break

                # set unique molecule identifier
                molecule.set_meta('mi', f'{molecule.get_a_reference_id()}_{i}')

                # Write tag values
                molecule.write_tags()

                if unphased_allele_resolver is not None:  # write unphased allele tag:
                    molecule.write_allele_phasing_information_tag(
                        unphased_allele_resolver, 'ua')

                # Update read groups
                for fragment in molecule:
                    rgid = fragment.get_read_group()
                    if not rgid in read_groups:
                        read_groups[rgid] = fragment.get_read_group(True)[1]

                # Calculate molecule consensus
                if args.consensus:
                    try:
                        consensus_reads = molecule.deduplicate_to_single_CIGAR_spaced(
                            out,
                            f'consensus_{molecule.get_a_reference_id()}_{i}',
                            consensus_model,
                            NUC_RADIUS=args.consensus_k_rad
                            )
                        for consensus_read in consensus_reads:
                            consensus_read.set_tag('RG', molecule[0].get_read_group())
                            consensus_read.set_tag('mi', i)
                            out.write(consensus_read)
                    except Exception as e:
                        # Any failure marks the molecule as rejected, the
                        # exception itself is not reported
                        # NOTE(review): unless -no_source_reads is set the
                        # molecule is written a second time below

                        #traceback.print_exc()
                        #print(e)
                        molecule.set_rejection_reason('CONSENSUS_FAILED',set_qcfail=True)
                        molecule.write_pysam(out)


                # Write the reads to the output file
                if not args.no_source_reads:
                    molecule.write_pysam(out)
        except Exception as e:
            # Record the failure in the status file before re-raising
            write_status(args.o,'FAIL, The file is not complete')
            raise e

        # Reached the end of the generator
        write_status(args.o,'Reached end. All ok!')
        '-head',
        type=int,
        help=
        'Amount of random sequences to count, when not specified all random primers are counted'
    )
    argparser.add_argument('-min_mq', type=int, default=50)
    argparser.add_argument('-o',
                           type=str,
                           default='./randomer_usage.pickle.gz',
                           help='Output pickle/csv path')
    args = argparser.parse_args()

    with pysam.AlignmentFile(args.bamfile) as alignments:
        molecule_source = MoleculeIterator(
            alignments,
            molecule_class=NlaIIIMolecule,
            fragment_class=NlaIIIFragment,
        )

        qf = get_random_primer_histogram(molecule_source,
                                         args.min_mq,
                                         args.max_size,
                                         args.size_bin_size,
                                         head=args.head)

        print('Writing dataframe to disk')
        if args.o.endswith('csv') or args.o.endswith('csv.gz'):
            qf.to_csv(args.o)
        else:
            qf.to_pickle(args.o)
        print('All done')
示例#6
0
    def obtain_conversions(contig: str):
        """Tally strand-aware base conversions for every molecule on one contig.

        Each molecule's consensus is compared against the reference. For every
        mismatching position that is not a known variant, the trinucleotide
        reference context and the observed base are counted per library
        (both reverse-complemented when the molecule is on the reverse
        template strand). The molecule is tagged and written to
        ``{args.temp_dir}/{contig}.bam``. Tags written through ``set_meta``:

            * ``4S``: number of T>C conversions observed (template orientation)
            * ``4c``: number of covered positions that could show such a
              conversion (reference A on a reverse molecule, T on a forward one)
            * ``YC``: ``"R,G,B"`` colour derived from the conversion rate

        Besides ``args`` (``temp_dir``, ``R2_based``), the function reads
        ``exons_gtf_path``, ``introns_gtf_path``, ``single_cell_bam_path``,
        ``known_vcf_path`` and ``reference_path`` from the enclosing scope.

        Args:
            contig (str): contig to process

        Returns:
            conversions_per_library (defaultdict( conversion_dict_stranded ) ) : Per library conversion dictionary
            n_molecules_per_library (Counter) : observed molecules per library
            contig(str) : the contig passed to the method
            temp_bam_path(str) : path to tagged bam file, tagged with gene annotations and 4su mutation count

        Note:
            A ``KeyboardInterrupt`` raised while iterating is swallowed on
            purpose, so the counts collected up to that point are returned.
        """

        conversions_per_library = defaultdict(conversion_dict_stranded)
        n_molecules_per_library = Counter()

        from singlecellmultiomics.molecule import might_be_variant

        # Create temp directory to write the tagged bam file to.
        # exist_ok=True tolerates the directory already being present (or being
        # created concurrently) without hiding real failures such as a
        # permission error, which previously were silently swallowed.
        temp_dir = args.temp_dir
        temp_bam_path = f'{temp_dir}/{contig}.bam'
        os.makedirs(temp_dir, exist_ok=True)

        # Load gene annotations for the selected contig:
        transcriptome_features = FeatureContainer()
        transcriptome_features.loadGTF(path=exons_gtf_path,
                                       select_feature_type=['exon'],
                                       identifierFields=('exon_id', 'gene_id'),
                                       store_all=True,
                                       contig=contig,
                                       head=None)

        transcriptome_features.loadGTF(path=introns_gtf_path,
                                       select_feature_type=['intron'],
                                       identifierFields=['transcript_id'],
                                       store_all=True,
                                       contig=contig,
                                       head=None)

        # Colour used for molecules without any candidate context (no rate
        # can be computed); kept in a local so no private colormap attribute
        # has to be read back later.
        bad_rgb = (0, 0, 0)
        colormap = plt.get_cmap('RdYlBu_r')
        colormap.set_bad(bad_rgb)

        read_groups = {}
        try:
            with pysam.AlignmentFile(single_cell_bam_path, threads=4) as alignments, \
                 pysam.VariantFile(known_vcf_path) as known, \
                 sorted_bam_file(temp_bam_path, origin_bam=single_cell_bam_path, read_groups=read_groups, fast_compression=True) as out, \
                 pysam.FastaFile(reference_path) as reference_handle:

                # Cache the sequence of the contig: (faster)
                reference = CachedFasta(reference_handle)

                molecule_source = MoleculeIterator(
                    alignments,
                    TranscriptMolecule,
                    SingleEndTranscriptFragment,
                    fragment_class_args={
                        'stranded': True,
                        'features': transcriptome_features
                    },
                    molecule_class_args={
                        'reference': reference,
                        'features': transcriptome_features,
                        'auto_set_intron_exon_features': True
                    },
                    contig=contig)

                for molecule in molecule_source:
                    # Read out mut spectrum
                    consensus = molecule.get_consensus()
                    if args.R2_based:
                        # Invert because the strand is R2 based.
                        molecule.strand = not molecule.strand
                    n_molecules_per_library[molecule.library] += 1

                    n_4su_mutations = 0
                    n_4su_contexts = 0

                    for (chrom, pos), base in consensus.items():
                        # There is no base left of position 0, so no
                        # trinucleotide context exists; also avoids handing a
                        # negative start coordinate to fetch().
                        if pos < 1:
                            continue

                        context = reference.fetch(chrom, pos - 1,
                                                  pos + 2).upper()
                        # Truncated context (e.g. at the contig end)
                        if len(context) != 3:
                            continue

                        ref_base = context[1]
                        if ((ref_base == 'A' and not molecule.strand)
                                or (ref_base == 'T' and molecule.strand)):
                            n_4su_contexts += 1

                        # Skip when the base matches the reference or the
                        # reference context contains N's
                        if ref_base == base or 'N' in context:
                            continue

                        # Ignore germline variants:
                        if might_be_variant(chrom, pos, known):
                            continue

                        if not molecule.strand:  # reverse template
                            context = reverse_complement(context)
                            base = complement(base)

                        # Count 4SU specific mutations, and write to molecule later
                        if context[1] == 'T' and base == 'C':
                            n_4su_mutations += 1

                        conversions_per_library[molecule.library][(context,
                                                                   base)] += 1

                    # Write 4su modification to molecule
                    molecule.set_meta('4S', n_4su_mutations)
                    molecule.set_meta('4c', n_4su_contexts)

                    # Set read color based on conversion rate:
                    if n_4su_contexts > 0:
                        # The max color value will be 10% modification rate
                        conversion_rate = n_4su_mutations / n_4su_contexts
                        cfloat = colormap(
                            np.clip(10 * conversion_rate, 0, 1))[:3]
                    else:
                        # No candidate positions: rate undefined
                        cfloat = bad_rgb
                    molecule.set_meta(
                        'YC', ','.join(str(int(x * 255)) for x in cfloat))

                    molecule.write_tags()

                    # Collect read groups which have not been seen before
                    for fragment in molecule:
                        rgid = fragment.get_read_group()
                        if rgid not in read_groups:
                            read_groups[rgid] = fragment.get_read_group(
                                True)[1]

                    # Write tagged molecule to output file
                    molecule.write_pysam(out)

        except KeyboardInterrupt:
            # This allows you to cancel the analysis (CTRL+C) and get the current result
            pass

        return conversions_per_library, n_molecules_per_library, contig, temp_bam_path