def check_already_decontaminated(sample, fragment, PCR):
    """Check from the summary and output file whether decontamination has been done"""
    import os
    from hivwholeseq.utils.mapping import get_number_reads

    if (fragment == "F4") and (sample.name in maj_contnames):
        return True

    fn_sum = get_decontaminate_summary_filename(sample.patient, sample.name, fragment, PCR=PCR)
    fn_in = sample.get_mapped_filtered_filename(fragment, PCR=PCR, decontaminated=False)
    fn_out = sample.get_mapped_filtered_filename(fragment, PCR=PCR, decontaminated=True)

    if not all(map(os.path.isfile, [fn_sum, fn_in, fn_out])):
        print sample.patient, sample.name, fragment, PCR
        return False

    n_reads_in = get_number_reads(fn_in)
    n_reads_out = get_number_reads(fn_out)
    (n_reads_good, n_reads_cont) = get_number_reads_summary(fn_sum)

    if n_reads_out != n_reads_good:
        print sample.patient, sample.name, fragment, PCR, n_reads_in, n_reads_out, n_reads_good, n_reads_cont
        return False

    if n_reads_good < 0.5 * n_reads_in:
        print sample.patient, sample.name, fragment, PCR, n_reads_in, n_reads_out, n_reads_good, n_reads_cont
        return False

    return True
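
A small driver around this check can list which combinations still need decontamination. The sketch below is illustrative only: the `samples` iterable, the fragment list, and PCR=1 are assumptions, not part of the original module.

def collect_pending_decontamination(samples,
                                    fragments=('F1', 'F2', 'F3', 'F4', 'F5', 'F6'),
                                    PCR=1):
    '''List (sample name, fragment) pairs that still need decontamination'''
    pending = []
    for sample in samples:
        for fragment in fragments:
            if not check_already_decontaminated(sample, fragment, PCR):
                pending.append((sample.name, fragment))
    return pending
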
Example #2
def check_already_decontaminated(sample, fragment, PCR):
    '''Check from the summary and output file whether decontamination has been done'''
    import os
    from hivwholeseq.utils.mapping import get_number_reads

    if (fragment == 'F4') and (sample.name in maj_contnames):
        return True

    fn_sum = get_decontaminate_summary_filename(sample.patient,
                                                sample.name,
                                                fragment,
                                                PCR=PCR)
    fn_in = sample.get_mapped_filtered_filename(fragment,
                                                PCR=PCR,
                                                decontaminated=False)
    fn_out = sample.get_mapped_filtered_filename(fragment,
                                                 PCR=PCR,
                                                 decontaminated=True)

    if not all(map(os.path.isfile, [fn_sum, fn_in, fn_out])):
        print sample.patient, sample.name, fragment, PCR
        return False

    n_reads_in = get_number_reads(fn_in)
    n_reads_out = get_number_reads(fn_out)
    (n_reads_good, n_reads_cont) = get_number_reads_summary(fn_sum)

    if n_reads_out != n_reads_good:
        print sample.patient, sample.name, fragment, PCR, n_reads_in, n_reads_out, n_reads_good, n_reads_cont
        return False

    if n_reads_good < 0.5 * n_reads_in:
        print sample.patient, sample.name, fragment, PCR, n_reads_in, n_reads_out, n_reads_good, n_reads_cont
        return False

    return True
Example #3
def get_coallele_counts_from_file(bamfilename,
                                  length,
                                  qual_min=30,
                                  maxreads=-1,
                                  VERBOSE=0,
                                  use_tests=False):
    '''Get counts of joint occurrence of two alleles'''
    from .utils.mapping import (test_read_pair_exotic_cigars,
                                test_read_pair_exceed_reference)

    if VERBOSE >= 1:
        print 'Getting coallele counts'

    # Precompute conversion tables
    alpha_mapping = {a: alphal.index(a) for a in alphal}
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = dict()
    for letter in xrange(0, 255):
        q_mapping[chr(letter)] = letter - SANGER_SCORE_OFFSET

    if VERBOSE >= 2:
        print 'Initializing matrix of cocounts'

    # NOTE: we are ignoring fwd/rev and read1/2
    counts = np.zeros((len(alpha), len(alpha), length, length), int)
    posall = np.zeros(1000, dtype=[('pos', int), ('aind', int)])

    if VERBOSE >= 2:
        from hivwholeseq.utils.mapping import get_number_reads
        print 'Scanning read pairs (' + str(
            get_number_reads(bamfilename) // 2) + ')'

    # NOTE: the reads should already be filtered of unmapped stuff at this point
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for ir, reads in enumerate(pair_generator(bamfile)):
            if ir == maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break

            if (VERBOSE >= 2) and (not ((ir + 1) % 10)):
                if (VERBOSE == 2) and (ir + 1 != 10):
                    sys.stdout.write("\x1b[1A")
                print(ir + 1)

            if use_tests:
                if test_read_pair_exotic_cigars(reads):
                    raise ValueError('Exotic CIGAR block type not recognized')

                if test_read_pair_exceed_reference(reads, length):
                    raise ValueError('Read pair exceeds reference length of ' +
                                     str(length))

            # Temp structures
            posall[:] = (-1, -1)
            iall = 0

            # Collect alleles
            for read in reads:
                alleles_ind = np.fromiter((alpha_mapping[x] for x in read.seq),
                                          np.uint8, len(read.seq))
                allqual_ind = np.fromiter((q_mapping[x] for x in read.qual),
                                          np.uint8, len(read.qual))

                pos_ref = read.pos
                pos_read = 0
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl
                    elif bt == 2:
                        # NOTE: no CIGAR can start NOR end with a deletion
                        qual_deletion = allqual_ind[pos_read:pos_read +
                                                    2].min()
                        if qual_deletion >= qual_min:
                            posall['pos'][iall:iall + bl] = np.arange(
                                pos_ref, pos_ref + bl)
                            posall['aind'][iall:iall + bl] = 4
                            iall += bl
                        pos_ref += bl
                    else:
                        alleles_indb = alleles_ind[pos_read:pos_read + bl]
                        allqual_indb = allqual_ind[pos_read:pos_read + bl]
                        for i in xrange(len(alpha)):
                            aitmp = (alleles_indb
                                     == i) & (allqual_indb >= qual_min)
                            aitmp = aitmp.nonzero()[0] + pos_ref
                            aitmplen = len(aitmp)
                            posall['pos'][iall:iall + aitmplen] = aitmp
                            posall['aind'][iall:iall + aitmplen] = i
                            iall += aitmplen

                        pos_read += bl
                        pos_ref += bl

            # Avoid doubles (paired reads are twice the same biological molecule)
            posall.sort(order=('pos', 'aind'))
            iall = (posall['pos'] != -1).nonzero()[0][0]
            while iall < len(posall) - 1:
                if posall['pos'][iall + 1] == posall['pos'][iall]:

                    # If both reads agree at an overlapping allele, take a single call
                    if posall['aind'][iall + 1] == posall['aind'][iall]:
                        posall[iall + 1] = (-1, -1)

                    # else, pick one at random (FIXME: pick the highest phred)
                    else:
                        ibin = np.random.randint(2)
                        posall[iall + ibin] = (-1, -1)

                    iall += 1
                iall += 1

            # Add allele cocounts to the matrix
            # NOTE: this already takes care of the symmetry
            poss = [
                posall['pos'][posall['aind'] == i1]
                for i1 in xrange(len(alpha))
            ]
            for i1 in xrange(len(alpha)):
                poss1 = poss[i1]
                if not len(poss1):
                    continue
                for i2 in xrange(len(alpha)):
                    poss2 = poss[i2]
                    if not len(poss2):
                        continue

                    # Raveling voodoo for efficiency
                    cobra = counts[i1, i2].ravel()
                    ind = poss1.repeat(len(poss2)) * length + np.tile(
                        poss2, len(poss1))
                    cobra[ind] += 1

    return counts
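
A natural follow-up is to normalize the cocount matrix into pairwise frequencies. The helper below is a sketch under stated assumptions (the `counts` array returned by the function above, and an arbitrary coverage cutoff); it is not part of the original module.

import numpy as np

def cocounts_to_frequencies(counts, cov_min=100):
    '''Normalize cocounts of shape (alpha, alpha, L, L) into frequencies'''
    # Reads covering each pair of positions, summed over both alleles
    coverage = counts.sum(axis=(0, 1))
    # Masked everywhere, unmasked only where coverage is sufficient
    freqs = np.ma.masked_all(counts.shape, float)
    ind = coverage >= cov_min
    freqs[:, :, ind] = counts[:, :, ind] / coverage[ind].astype(float)
    return freqs
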
Example #4
def check_status(sample, step, detail=1):
    '''Check for a sample a certain step of the pipeline at a certain detail'''
    if detail == 1:
        if step == 'premapped':
            return [os.path.isfile(sample.get_premapped_filename())]
        elif step == 'divided':
            return [(fr, os.path.isfile(sample.get_divided_filename(fr)))
                    for fr in sample.regions_complete]
        elif step == 'consensus':
            return [(fr, os.path.isfile(sample.get_consensus_filename(fr)))
                    for fr in sample.regions_generic]
        elif step == 'mapped':
            return [
                (fr,
                 os.path.isfile(sample.get_mapped_filename(fr,
                                                           filtered=False)))
                for fr in sample.regions_generic
            ]
        elif step == 'filtered':
            return [
                (fr,
                 os.path.isfile(sample.get_mapped_filename(fr, filtered=True)))
                for fr in sample.regions_generic
            ]
        elif step == 'mapped_initial':
            return [(fr,
                     os.path.isfile(sample.get_mapped_to_initial_filename(fr)))
                    for fr in sample.regions_generic]
        elif step == 'mapped_filtered':
            # Check whether the mapped filtered is older than the mapped_initial
            from hivwholeseq.utils.generic import modification_date
            out = []
            for fr in sample.regions_generic:
                fn_mi = sample.get_mapped_to_initial_filename(fr)
                fn_mf = sample.get_mapped_filtered_filename(fr)
                if not os.path.isfile(fn_mf):
                    out.append((fr, False))
                    continue

                if not os.path.isfile(fn_mi):
                    out.append((fr, True))
                    continue

                md_mi = modification_date(fn_mi)
                md_mf = modification_date(fn_mf)
                if md_mf < md_mi:
                    out.append((fr, 'OLD'))
                else:
                    out.append((fr, True))
            return out

    elif detail == 2:
        if step in ('filtered', 'consensus'):
            return check_status(sample, step, detail=3)
        else:
            return check_status(sample, step, detail=1)

    elif detail == 3:
        if step == 'premapped':
            if os.path.isfile(sample.get_premapped_filename()):
                return [get_number_reads(sample.get_premapped_filename())]
            else:
                return [False]

        elif step == 'divided':
            stati = []
            for fr in sample.regions_complete:
                fn = sample.get_divided_filename(fr)
                if os.path.isfile(fn):
                    status = (fr, get_number_reads(fn))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati

        elif step == 'consensus':
            stati = []
            for fr in sample.regions_generic:
                fn = sample.get_consensus_filename(fr)
                if os.path.isfile(fn):
                    status = (fr, len(SeqIO.read(fn, 'fasta')))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati

        elif step == 'mapped':
            stati = []
            for fr in sample.regions_generic:
                fn = sample.get_mapped_filename(fr, filtered=False)
                if os.path.isfile(fn):
                    status = (fr, get_number_reads(fn))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati

        elif step == 'filtered':
            stati = []
            for fr in sample.regions_generic:
                fn = sample.get_mapped_filename(fr, filtered=True)
                if os.path.isfile(fn):
                    status = (fr, get_number_reads(fn))
                else:
                    status = (fr, False)
                stati.append(status)
            return stati

        # TODO: add mapped_to_initial and downstream
        elif step in ('mapped_initial', 'mapped_filtered'):
            return check_status(sample, step, detail=1)
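
check_status can be driven per sample to get a quick overview of the pipeline. The loop below is a sketch: the step names are the ones handled above, and the sample object is assumed to be of the same kind used there.

def print_sample_status(sample, detail=1):
    '''Print one status line per pipeline step for a single sample'''
    steps = ['premapped', 'divided', 'consensus', 'mapped', 'filtered',
             'mapped_initial', 'mapped_filtered']
    for step in steps:
        status = check_status(sample, step, detail=detail)
        print(step.ljust(16) + str(status))
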
def filter_contamination(
    bamfilename,
    bamfilename_out,
    contseqs,
    samplename,
    VERBOSE=0,
    deltascore_max_self=60,
    deltascore_max_other=24,
    maxreads=-1,
    **kwargs
):
    """Fish contaminated reads from mapped reads

    The function first checks the read's distance against the sample's own consensus;
    only if that distance exceeds the threshold does it compare against all other samples.
    
    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the 
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    """
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if "score_match" in kwargs:
        score_match = kwargs["score_match"]
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + "_trashed.bam"

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print "Scanning reads (" + str(get_number_reads(bamfilename) // 2) + ")"

    with pysam.Samfile(bamfilename, "rb") as bamfile:
        with pysam.Samfile(bamfilename_out, "wb", template=bamfile) as bamfileout, pysam.Samfile(
            bamfilename_trash, "wb", template=bamfile
        ) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write("\x1b[1A")
                        print irp + 1

                for read in reads:

                    # Look at the distance to the read's own consensus; if that's small, move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print "Read is very close to its own consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="ref", name2="read")
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print "Read is closest to its consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                    # The read may come from another consensus (contamination)
                    elif delta_read <= deltascore_max_other:
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print "Contaminated read found! Good:", n_good, "cont:", sum(
                                n_cont.itervalues()
                            ), "sources:", n_cont

                        if VERBOSE >= 3:
                            print "Read is contaminated by", contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="self", name2="read")
                            print ""
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                        if VERBOSE >= 2:
                            print ""

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print "Read is close to nothing really", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
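
The per-read decision inside filter_contamination can be read as a small rule on the score deltas collected in deltas_read. The sketch below restates that rule in isolation; the `deltas` dict (sample name to alignment-score delta) stands in for those precomputed values and is an assumption for illustration.

def classify_read(deltas, samplename,
                  deltascore_max_self=60, deltascore_max_other=24):
    '''Return 'good' or the name of the contaminating sample for one read'''
    # Very close to its own consensus: accept without looking further
    if deltas[samplename] <= deltascore_max_self:
        return 'good'
    # Otherwise find the nearest sequence; the own consensus wins ties
    contname = min(deltas, key=deltas.get)
    if (contname == samplename) or (deltas[samplename] == deltas[contname]):
        return 'good'
    # A close foreign consensus flags contamination; far from everything is accepted
    if deltas[contname] <= deltascore_max_other:
        return contname
    return 'good'
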
Example #6
def print_info_genomewide(p, title, name, method, VERBOSE=0, require_all=True):
    '''Pretty printer for patient pipeline info'''

    mod_dates = p.mod_dates

    def check_requisite_genomewide(md,
                                   name_requisite,
                                   samplename,
                                   mod_dates,
                                   require_all=True):
        '''Check requisites for genomewide observables'''
        stati = []
        fragments = ['F' + str(i + 1) for i in xrange(6)]
        for fragment in fragments:
            if (name_requisite, fragment, samplename) not in mod_dates:
                stati.append('MISS')
            elif md < mod_dates[(name_requisite, fragment, samplename)]:
                stati.append('OLD')
            else:
                stati.append('OK')

        if 'OLD' in stati:
            return 'OLD'
        else:
            if require_all:
                if 'MISS' in stati:
                    return 'MISS'
                else:
                    return 'OK'
            else:
                if 'OK' in stati:
                    return 'OK'
                else:
                    return 'MISS'

    def check_contamination_genomewide(sample):
        '''Check whether any of the fragment samples is contaminated'''
        fragments = ['F' + str(i + 1) for i in xrange(6)]
        for fragment in fragments:
            if 'contaminated' in sample[fragment]:
                return True
        return False

    import os, sys
    from hivwholeseq.patients.samples import SamplePat

    # NOTE: this function is used to check both entire patients and single samples
    if isinstance(p, SamplePat):
        sample_iter = [(p.name, p)]
    else:
        sample_iter = p.samples.iterrows()

    stati = set()
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    print line
    for samplename, sample in sample_iter:
        sample = SamplePat(sample)
        title = sample.name
        line = ('{:<' + str(title_len) + '}').format(title + ':')

        if isinstance(method, basestring) and hasattr(sample, method):
            fun = getattr(sample, method)
            fn = fun('genomewide')
        else:
            fn = method(sample.patient, samplename, 'genomewide')
        if os.path.isfile(fn):
            md = modification_date(fn)
            mod_dates[(name, 'genomewide', samplename)] = md

            if name is None:
                status = 'OK'

            elif check_contamination_genomewide(sample):
                status = 'CONT'

            else:
                status = check_requisite_genomewide(md,
                                                    name,
                                                    samplename,
                                                    mod_dates,
                                                    require_all=require_all)

        else:
            status = 'MISS'

        # Check the number of reads if requested
        if (status == 'OK') and (fn[-3:] == 'bam') and (VERBOSE >= 3):
            status = str(get_number_reads(fn))

        stati.add(status)
        line = line + ('{:<' + str(cell_len) + '}').format(status)
        print line

    if 'OLD' in stati:
        raise ValueError('OLD status found')
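
The OK/OLD/MISS logic above hinges on comparing the modification date of an output file against its requisite. A minimal stand-alone analogue is sketched below; the helper name and the direct use of os.path.getmtime are assumptions, not the module's own modification_date utility.

import os
import datetime

def file_status(fn_out, fn_requisite):
    '''Return 'OK', 'OLD', or 'MISS' by comparing modification times'''
    if not (os.path.isfile(fn_out) and os.path.isfile(fn_requisite)):
        return 'MISS'
    md_out = datetime.datetime.fromtimestamp(os.path.getmtime(fn_out))
    md_req = datetime.datetime.fromtimestamp(os.path.getmtime(fn_requisite))
    return 'OK' if md_out > md_req else 'OLD'
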
Example #7
def print_info(p, title, name, method, name_requisite=None, VERBOSE=0):
    '''Pretty printer for patient pipeline info'''
    import os, sys
    from hivwholeseq.patients.samples import SamplePat
    from hivwholeseq.utils.mapping import get_number_reads

    mod_dates = p.mod_dates

    # NOTE: this function is used to check both entire patients and single samples
    if isinstance(p, SamplePat):
        sample_iter = [(p.name, p)]
    else:
        sample_iter = p.samples.iterrows()

    fragments = ['F' + str(i + 1) for i in xrange(6)]

    stati = set()
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    print line
    for samplename, sample in sample_iter:
        sample = SamplePat(sample)
        title = sample.name
        line = ('{:<' + str(title_len) + '}').format(title + ':')

        for fragment in fragments:
            if isinstance(method, basestring) and hasattr(sample, method):
                fun = getattr(sample, method)
                fn = fun(fragment)
            else:
                fn = method(sample.patient, samplename, fragment)

            if os.path.isfile(fn):
                md = modification_date(fn)
                mod_dates[(name, fragment, samplename)] = md

                if name_requisite is None:
                    status = 'OK'

                elif ((name_requisite, fragment, samplename) in mod_dates):
                    if md > mod_dates[(name_requisite, fragment, samplename)]:
                        status = 'OK'
                    else:
                        status = 'OLD'
                        print fn, md, mod_dates[(name_requisite, fragment,
                                                 samplename)]

                elif ((name_requisite, fragment) in mod_dates):
                    if md > mod_dates[(name_requisite, fragment)]:
                        status = 'OK'
                    else:
                        status = 'OLD'

                        # NOTE: on Nov 13, 2014 I updated the mod dates of all
                        # references by mistake, without actually changing the
                        # sequences (ironically, probably testing a backup system
                        # for the refs themselves). So if the requisite is a ref
                        # seq and the date is this one, it's OK
                        if ((name_requisite == 'reference') and
                            mod_dates[(name_requisite, fragment)].date() == \
                            datetime.date(2014, 11, 13)):
                            status = 'OK'

                elif 'contaminated' in sample[fragment]:
                    status = 'CONT'

                else:
                    status = 'ERROR'

            else:
                status = 'MISS'

            # Check the number of reads if requested
            if (status == 'OK') and (fn[-3:] == 'bam') and (VERBOSE >= 3):
                status = str(get_number_reads(fn))

            stati.add(status)
            line = line+fragment+': '+\
                ('{:>'+str(cell_len - len(fragment) - 1)+'}').format(status)+'  '
        print line

    if 'OLD' in stati:
        raise ValueError('OLD status found')
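
print_info is typically called once per pipeline step, chaining each step's name as the next step's requisite so that stale outputs surface as 'OLD'. The driver below is hypothetical: the step labels and the filename methods passed as strings are assumptions for illustration.

def print_patient_overview(patient, VERBOSE=0):
    '''Print one status table per pipeline step, chaining requisites'''
    steps = [('Map to initial', 'map', 'get_mapped_to_initial_filename', None),
             ('Filter mapped', 'filter', 'get_mapped_filtered_filename', 'map')]
    for (title, name, method, requisite) in steps:
        print_info(patient, title, name, method,
                   name_requisite=requisite, VERBOSE=VERBOSE)
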
Example #8
def filter_contamination(bamfilename,
                         bamfilename_out,
                         contseqs,
                         samplename,
                         VERBOSE=0,
                         deltascore_max_self=60,
                         deltascore_max_other=24,
                         maxreads=-1,
                         **kwargs):
    '''Fish contaminated reads from mapped reads

    The function first checks the read's distance against the sample's own consensus;
    only if that distance exceeds the threshold does it compare against all other samples.
    
    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the 
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    '''
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(
            get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:

                    # Look at the distance to the read's own consensus; if that's small, move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1,
                     alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1,
                         ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(),
                                                 key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                    # The read may come from another consensus (contamination)
                    elif (delta_read <= deltascore_max_other):
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good, 'cont:', sum(
                                n_cont.itervalues()), 'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='self',
                                                      name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                        if VERBOSE >= 2:
                            print ''

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
def get_coallele_counts_from_file(bamfilename, length, qual_min=30,
                                  maxreads=-1, VERBOSE=0,
                                  use_tests=False):
    '''Get counts of joint occurrence of two alleles'''
    from .utils.mapping import (test_read_pair_exotic_cigars,
                                test_read_pair_exceed_reference)

    if VERBOSE >= 1:
        print 'Getting coallele counts'


    # Precompute conversion tables
    alpha_mapping = {a: alphal.index(a) for a in alphal}
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = dict()
    for letter in xrange(0, 255):
        q_mapping[chr(letter)] = letter - SANGER_SCORE_OFFSET
    
    if VERBOSE >= 2:
        print 'Initializing matrix of cocounts'

    # NOTE: we are ignoring fwd/rev and read1/2
    counts = np.zeros((len(alpha), len(alpha), length, length), int)
    posall = np.zeros(1000, dtype=[('pos', int), ('aind', int)])

    if VERBOSE >= 2:
        from hivwholeseq.utils.mapping import get_number_reads
        print 'Scanning read pairs ('+str(get_number_reads(bamfilename) // 2)+')'

    # NOTE: the reads should already be filtered of unmapped stuff at this point
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for ir, reads in enumerate(pair_generator(bamfile)):
            if ir == maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break
        
            if (VERBOSE >= 2) and (not ((ir +1) % 10)):
                if (VERBOSE == 2) and (ir + 1 != 10):
                    sys.stdout.write("\x1b[1A")
                print (ir+1) 

            if use_tests:
                if test_read_pair_exotic_cigars(reads):
                    raise ValueError('Exotic CIGAR block type not recognized')

                if test_read_pair_exceed_reference(reads, length):
                    raise ValueError('Read pair exceeds reference length of '+str(length))


            # Temp structures
            posall[:] = (-1, -1)
            iall = 0

            # Collect alleles
            for read in reads:
                alleles_ind = np.fromiter((alpha_mapping[x] for x in read.seq),
                                          np.uint8, len(read.seq))
                allqual_ind = np.fromiter((q_mapping[x] for x in read.qual),
                                          np.uint8, len(read.qual))

                pos_ref = read.pos
                pos_read = 0
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl
                    elif bt == 2:
                        # NOTE: no CIGAR can start NOR end with a deletion
                        qual_deletion = allqual_ind[pos_read: pos_read + 2].min()
                        if qual_deletion >= qual_min:
                            posall['pos'][iall: iall + bl] = np.arange(pos_ref,
                                                                       pos_ref + bl)
                            posall['aind'][iall: iall + bl] = 4
                            iall += bl
                        pos_ref += bl
                    else:
                        alleles_indb = alleles_ind[pos_read: pos_read + bl]
                        allqual_indb = allqual_ind[pos_read: pos_read + bl]
                        for i in xrange(len(alpha)):
                            aitmp = (alleles_indb == i) & (allqual_indb >= qual_min)
                            aitmp = aitmp.nonzero()[0] + pos_ref
                            aitmplen = len(aitmp)
                            posall['pos'][iall: iall + aitmplen] = aitmp
                            posall['aind'][iall: iall + aitmplen] = i
                            iall += aitmplen

                        pos_read += bl
                        pos_ref += bl

            # Avoid doubles (paired reads are twice the same biological molecule)
            posall.sort(order=('pos', 'aind'))
            iall = (posall['pos'] != -1).nonzero()[0][0]
            while iall < len(posall) - 1:
                if posall['pos'][iall + 1] == posall['pos'][iall]:

                    # If both reads agree at an overlapping allele, take a single call
                    if posall['aind'][iall + 1] == posall['aind'][iall]:
                        posall[iall + 1] = (-1, -1)

                    # else, pick one at random (FIXME: pick the highest phred)
                    else:
                        ibin = np.random.randint(2)
                        posall[iall + ibin] = (-1, -1)

                    iall += 1
                iall += 1

            # Add allele cocounts to the matrix
            # NOTE: this already takes care of the symmetry
            poss = [posall['pos'][posall['aind'] == i1] for i1 in xrange(len(alpha))]
            for i1 in xrange(len(alpha)):
                poss1 = poss[i1]
                if not len(poss1):
                    continue
                for i2 in xrange(len(alpha)):
                    poss2 = poss[i2]
                    if not len(poss2):
                        continue

                    # Raveling voodoo for efficiency
                    cobra = counts[i1, i2].ravel()
                    ind = poss1.repeat(len(poss2)) * length + np.tile(poss2, len(poss1))
                    cobra[ind] += 1

    return counts
def check_premap(data_folder, adaID, fragments, seq_run, samplename,
                 qual_min=30, match_len_min=10,
                 maxreads=-1, VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline() #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'


        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments], int).T

    else:
        frags_pos = None
    
    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(input_filename,
                                                             len(refseq),
                                                             qual_min=qual_min,
                                                             match_len_min=match_len_min,
                                                             maxreads=maxreads,
                                                             VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        title=', '.join(['run '+seq_run+' '+adaID,
                         'sample '+samplename,
                         'reads '+str(min(maxreads, n_reads))+'/'+str(n_reads),
                        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder+foldername_adapter(adaID)+'figures/coverage_premapped_'+samplename+'.png')

    return (counts, inserts)
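
A typical invocation of check_premap is sketched below; the run folder, adapter ID, run name and sample name are placeholders (assumptions for illustration), not values from the original pipeline.

if __name__ == '__main__':
    fragments = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6']
    counts, inserts = check_premap('/path/to/run_folder/', 'TS2',
                                   fragments, 'Tue28', 'sample_1',
                                   maxreads=10000, VERBOSE=1, savefig=False)
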
def print_info_genomewide(p, title, name, method, VERBOSE=0, require_all=True):
    '''Pretty printer for patient pipeline info'''

    mod_dates = p.mod_dates

    def check_requisite_genomewide(md, name_requisite, samplename, mod_dates,
                                   require_all=True):
        '''Check requisites for genomewide observables'''
        stati = []
        fragments=['F'+str(i+1) for i in xrange(6)]
        for fragment in fragments:
            if (name_requisite, fragment, samplename) not in mod_dates:
                stati.append('MISS')
            elif md < mod_dates[(name_requisite, fragment, samplename)]:
                stati.append('OLD')
            else:
                stati.append('OK')

        if 'OLD' in stati:
            return 'OLD'
        else:
            if require_all:
                if 'MISS' in stati:
                    return 'MISS'
                else:
                    return 'OK'
            else:
                if 'OK' in stati:
                    return 'OK'
                else:
                    return 'MISS'

    def check_contamination_genomewide(sample):
        '''Check whether any of the fragment samples is contaminated'''
        fragments=['F'+str(i+1) for i in xrange(6)]
        for fragment in fragments:
            if 'contaminated' in sample[fragment]:
                return True
        return False

    import os, sys
    from hivwholeseq.patients.samples import SamplePat

    # NOTE: this function is used to check both entire patients and single samples
    if isinstance(p, SamplePat):
        sample_iter = [(p.name, p)]
    else:
        sample_iter = p.samples.iterrows()

    stati = set()    
    line = ('{:<'+str(title_len)+'}').format(title+':')
    print line
    for samplename, sample in sample_iter:
        sample = SamplePat(sample)
        title = sample.name
        line = ('{:<'+str(title_len)+'}').format(title+':')
        
        if isinstance(method, basestring) and hasattr(sample, method):
            fun = getattr(sample, method)
            fn = fun('genomewide')
        else:
            fn = method(sample.patient, samplename, 'genomewide')
        if os.path.isfile(fn):
            md = modification_date(fn)
            mod_dates[(name, 'genomewide', samplename)] = md

            if name is None:
                status = 'OK'

            elif check_contamination_genomewide(sample):
                status = 'CONT'

            else:
                status = check_requisite_genomewide(md, name, samplename, mod_dates,
                                                    require_all=require_all)

        else:
            status = 'MISS'

        # Check the number of reads if requested
        if (status == 'OK') and (fn[-3:] == 'bam') and (VERBOSE >= 3):
            status = str(get_number_reads(fn))

        stati.add(status)
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line

    if 'OLD' in stati:
        raise ValueError('OLD status found') 
def print_info(p, title, name, method, name_requisite=None, VERBOSE=0):
    '''Pretty printer for patient pipeline info'''
    import os, sys
    from hivwholeseq.patients.samples import SamplePat
    from hivwholeseq.utils.mapping import get_number_reads

    mod_dates = p.mod_dates

    # NOTE: this function is used to check both entire patients and single samples
    if isinstance(p, SamplePat):
        sample_iter = [(p.name, p)]
    else:
        sample_iter = p.samples.iterrows()

    fragments=['F'+str(i+1) for i in xrange(6)]

    stati = set()
    line = ('{:<'+str(title_len)+'}').format(title+':')
    print line
    for samplename, sample in sample_iter:
        sample = SamplePat(sample)
        title = sample.name
        line = ('{:<'+str(title_len)+'}').format(title+':')
        
        for fragment in fragments:
            if isinstance(method, basestring) and hasattr(sample, method):
                fun = getattr(sample, method)
                fn = fun(fragment)
            else:
                fn = method(sample.patient, samplename, fragment)

            if os.path.isfile(fn):
                md = modification_date(fn)
                mod_dates[(name, fragment, samplename)] = md

                if name_requisite is None:
                    status = 'OK'

                elif ((name_requisite, fragment, samplename) in mod_dates):
                    if md > mod_dates[(name_requisite, fragment, samplename)]:
                        status = 'OK'
                    else:
                        status = 'OLD'
                        print fn, md, mod_dates[(name_requisite, fragment, samplename)]

                elif ((name_requisite, fragment) in mod_dates):
                    if md > mod_dates[(name_requisite, fragment)]:
                        status = 'OK'
                    else:
                        status = 'OLD'

                        # NOTE: on Nov 13, 2014 I updated the mod dates of all
                        # references by mistake, without actually changing the
                        # sequences (ironically, probably testing a backup system
                        # for the refs themselves). So if the requisite is a ref
                        # seq and the date is this one, it's OK
                        if ((name_requisite == 'reference') and
                            mod_dates[(name_requisite, fragment)].date() == \
                            datetime.date(2014, 11, 13)):
                            status = 'OK'


                elif 'contaminated' in sample[fragment]:
                    status = 'CONT'
                
                else:
                    status = 'ERROR'

            else:
                status = 'MISS'

            # Check the number of reads if requested
            if (status == 'OK') and (fn[-3:] == 'bam') and (VERBOSE >= 3):
                status = str(get_number_reads(fn))

            stati.add(status)
            line = line+fragment+': '+\
                ('{:>'+str(cell_len - len(fragment) - 1)+'}').format(status)+'  '
        print line


    if 'OLD' in stati:
        raise ValueError('OLD status found') 
Example #13
def check_premap(data_folder,
                 adaID,
                 fragments,
                 seq_run,
                 samplename,
                 qual_min=30,
                 match_len_min=10,
                 maxreads=-1,
                 VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline()  #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(
                        fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(
                        fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'

        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T

    else:
        frags_pos = None

    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename,
        len(refseq),
        qual_min=qual_min,
        match_len_min=match_len_min,
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        title = ', '.join([
            'run ' + seq_run + ' ' + adaID,
            'sample ' + samplename,
            'reads ' + str(min(maxreads, n_reads)) + '/' + str(n_reads),
        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder + foldername_adapter(adaID) +
                    'figures/coverage_premapped_' + samplename + '.png')

    return (counts, inserts)