Пример #1
0
    def createPrimers(self,db,bowtie='bowtie2', delete=True, tags={}, tmThreshold=50.0, endMatch=6, maxAln=20):
        """Align the primers in ``self.file`` against ``db`` and return them with target loci.

        The FASTA file ``self.file`` is mapped with bowtie into
        ``self.file + '.sam'`` (the run is skipped when that file already
        exists). One ``Primer`` is created per FASTA record, and every mapped
        alignment that passes the Tm and end-match filters is attached to its
        primer through ``addTarget``.

        FASTA record names of the form ``name|chrom:start-end:strand`` also
        yield a ``Locus`` for the intended target; any other name produces a
        primer without a target locus.

        Args:
            db: value handed to bowtie's ``-x`` option.
            bowtie: name or path of the bowtie executable.
            delete: remove the SAM file once it has been read.
            tags: mapping of primer name to tag; primers without an entry get
                ``None``. The mapping is only read, never modified.
            tmThreshold: an alignment is kept only if its heterodimer Tm is
                strictly greater than this value.
            endMatch: number of trailing bases of query and reference that
                must be identical; both sequences must be longer than this.
            maxAln: handed to bowtie's ``-k`` option; a primer with at least
                this many accepted loci has its loci list emptied.

        Returns:
            ``primers.values()`` - the created ``Primer`` objects.

        Raises:
            subprocess.CalledProcessError: if bowtie exits with a non-zero status.
        """
        # run bowtie (report at most maxAln alignments per read)
        mapfile = self.file + '.sam'
        if not os.path.exists(mapfile):
            # Every option and its value is a separate argv entry, and the SAM
            # output is requested with -S: without a shell a '>' is not a
            # redirect, it is just one more positional argument for bowtie.
            command = [
                bowtie, '-f', '--end-to-end', '-p', '2',
                '-k', str(maxAln), '-L', '10', '-N', '1', '-D', '20', '-R', '3',
                '-x', db, '-U', self.file, '-S', mapfile,
            ]
            try:
                subprocess.check_call(command)
            except subprocess.CalledProcessError:
                # do not leave a partial SAM file that the next call would reuse
                if os.path.exists(mapfile):
                    os.unlink(mapfile)
                raise

        # Read fasta file (Create Primer)
        primers = {}
        with pysam.FastaFile(self.file) as fasta:
            for s in fasta.references:
                sequence = fasta.fetch(s)
                # parse target locus from the record name
                primername, targetLocus = s, None
                fields = s.split('|')
                if len(fields) == 2:
                    # the SAM pass below looks primers up by this prefix
                    primername = fields[0]
                    reTargetposition = re.match(r'(\w+):(\d+)-(\d+):([+-])', fields[1])
                    # a locus field that does not parse leaves the primer without a target
                    if reTargetposition is not None:
                        # create stranded targetlocus
                        chrom, start, end, strand = reTargetposition.groups()
                        tm = primer3.calcTm(sequence)  # assume targetlocus is full match
                        targetLocus = Locus(chrom, int(start), int(end) - int(start), strand == '-', tm)
                # create primer (with target locus)
                primers[primername] = Primer(primername, sequence, targetLocus, tag=tags.get(primername))

        # read SAM OUTPUT and filter alignments; the handle is closed before
        # the file is (optionally) deleted below
        with pysam.Samfile(mapfile, 'r') as mappings:
            for aln in mappings:
                if aln.is_unmapped:
                    continue
                primername = aln.qname.split('|')[0]
                ## get reference sequence
                qry = aln.query_sequence.upper()
                ref = aln.get_reference_sequence().upper()
                # reference translated with the module-level revcmp table, then reversed
                refrc = ref.translate(revcmp)[::-1]
                aln_tm = primer3.calcHeterodimerTm(qry, refrc)
                # TmThreshold and mismatches in 3'end check
                if (aln_tm > tmThreshold
                        and len(qry) > endMatch and len(ref) > endMatch
                        and all(q == r for q, r in zip(qry[-endMatch:], ref[-endMatch:]))):
                    primers[primername].addTarget(mappings.getrname(aln.reference_id), aln.pos, aln.is_reverse, aln_tm)

        # remove primer locations for those that have hit maximum
        # NOTE(review): the cap is compared with the number of stored loci,
        # not with the raw alignment count - confirm this is intended.
        for primer in primers.values():
            if len(primer.loci) >= maxAln:
                primer.loci = []
        # cleanup
        if delete:
            os.unlink(mapfile)  # delete mapping FILE
        return primers.values()
Пример #2
0
 def _fwdStrand():
     """Find the best off-target hit of the primer via ``genome_rc_str``.

     Computes the rolling Hamming distance of ``primer_str`` along
     ``genome_rc_str``, masks the primer's own footprint, and evaluates the
     heterodimer Tm at every position whose distance is below the
     ``hamming_percentile`` threshold. The highest Tm, its index (``None``
     when there was no candidate) and the constant ``1`` are put on
     ``strand_results`` as one tuple.
     """
     fwd_hamming_distances = seqstr.rollingHammingDistance(primer_str,
                                                           genome_rc_str)
     fwd_hd_thresh = np.percentile(fwd_hamming_distances, hamming_percentile)
     # Offsets are counted from the end. An end offset of 0 has to become
     # None, because x[-n:-0] is x[-n:0] (empty) and would leave the primer's
     # own site unmasked when primer_idx == 0.
     fwd_primer_footprint = slice(-(primer_idx + primer_length),
                                  -primer_idx or None)
     # overwrite the footprint with the maximum distance so it is never a hotspot
     fwd_hamming_distances[fwd_primer_footprint] = primer_length
     fwd_hotspots, = np.where((fwd_hamming_distances < fwd_hd_thresh))
     highest_tm_idx = None
     highest_tm = -100
     for idx in fwd_hotspots:
         # same end-offset rule: idx == 0 must select the last primer_length
         # bases of genome_str rather than an empty string
         tm = primer3.calcHeterodimerTm(
             primer_str, genome_str[-(idx + primer_length):-idx or None],
             **params['thermo_params'])
         if tm > highest_tm:
             highest_tm_idx = idx
             highest_tm = tm
     strand_results.put((highest_tm, highest_tm_idx, 1))
Пример #3
0
    def _revStrand():
        """Find the best off-target hit of the primer via ``genome_str``.

        Computes the rolling Hamming distance of ``primer_str`` along
        ``genome_str``, masks the primer's own footprint, and evaluates the
        heterodimer Tm at every position whose distance is below the
        ``hamming_percentile`` threshold. The highest Tm, its index (``None``
        when there was no candidate) and the constant ``0`` are put on
        ``strand_results`` as one tuple.
        """
        distances = seqstr.rollingHammingDistance(primer_str, genome_str)
        cutoff = np.percentile(distances, hamming_percentile)

        # overwrite the primer's own site with the maximum distance so it is
        # never reported as a hotspot
        footprint_start = primer_idx
        footprint_stop = primer_idx + primer_length
        distances[footprint_start:footprint_stop] = primer_length
        (candidates,) = np.where(distances < cutoff)

        # NOTE(review): distances index genome_str, but the Tm window is cut
        # from genome_rc_str at the same offset - confirm the coordinates agree.
        thermo_params = params['thermo_params']
        best_tm, best_idx = -100, None
        for candidate in candidates:
            window = genome_rc_str[candidate:candidate + primer_length]
            candidate_tm = primer3.calcHeterodimerTm(primer_str, window,
                                                     **thermo_params)
            if candidate_tm > best_tm:
                best_tm, best_idx = candidate_tm, candidate
        strand_results.put((best_tm, best_idx, 0))
Пример #4
0
def screenPadlockArms(  p_l_seq: str,
                        p_r_seq: str,
                        loop_seq: str,
                        p_params: dict,
                        do_print: bool = False) -> Tuple[bool, dict]:
    """Screen the two arms of a padlock probe against the limits in ``p_params``.

    All checks are always run; a failing check only clears the returned flag,
    so the report is filled in completely.

    Args:
        p_l_seq: left arm sequence.
        p_r_seq: right arm sequence.
        loop_seq: loop sequence placed between the arms.
        p_params: limits and settings. Keys read here: ``'thermo_params'``
            (keyword arguments for the Tm functions), ``'arm_gc_min'``,
            ``'arm_gc_max'``, ``'arm_tm_min'``, ``'exclude_seqs'`` and
            ``'structure_tm_max'``.
        do_print: print a short message for every failed check.

    Returns:
        ``(is_good, report)``. ``is_good`` is ``True`` only if every check
        passed. ``report`` holds the measured values; ``'l_clamp'`` is
        ``True`` when the left-arm clamp check passed, ``'ex_seq'`` lists the
        first excluded sequence found (if any), and the ``'tm_hairpin_*'``
        entries keep their initial ``0`` because no hairpin Tm is computed here.
    """
    is_good = True
    tp = p_params['thermo_params']
    report = {
        'arm_gc_min_l': 0,
        'arm_gc_max_l': 0,
        'arm_gc_min_r': 0,
        'arm_gc_max_r': 0,
        'l_clamp': True,
        'tm_arm_min_l': 0,
        'tm_arm_min_r': 0,
        'ex_seq': [],
        'tm_hairpin_l': 0,
        'tm_hairpin_r': 0,
        'tm_hetero_0': 0,
        'tm_hetero_1': 0,
        'tm_hetero_2': 0
    }

    # 1. GC content checks
    p_l_gc_content = gcContent(p_l_seq)
    p_r_gc_content = gcContent(p_r_seq)
    if p_l_gc_content < p_params['arm_gc_min']:
        if do_print:
            print("\tgc content L min fail %0.3f" % p_l_gc_content)
        is_good = False
    if p_r_gc_content < p_params['arm_gc_min']:
        if do_print:
            print("\tgc content R min fail %0.3f" % p_r_gc_content)
        is_good = False
    if p_l_gc_content > p_params['arm_gc_max']:
        if do_print:
            print("\tgc content L max fail %0.3f" % p_l_gc_content)
        is_good = False
    if p_r_gc_content > p_params['arm_gc_max']:
        if do_print:
            print("\tgc content R max fail %0.3f" % p_r_gc_content)
        is_good = False
    # the min and max entries both carry the measured GC content of the arm
    report['arm_gc_min_l'] = p_l_gc_content
    report['arm_gc_min_r'] = p_r_gc_content
    report['arm_gc_max_l'] = p_l_gc_content
    report['arm_gc_max_r'] = p_r_gc_content

    # 2. GC clamp checks
    l_3p_check = padlockLeftArmGCClamp(p_l_seq)
    l_clamp_failed = l_3p_check > 3
    if l_clamp_failed:
        if do_print:
            print("\tl clamp fail")
        is_good = False
    # record the outcome of the check instead of an unconditional False
    report['l_clamp'] = not l_clamp_failed

    # 3. Arm Tm check
    p_arm_tm_l = calcTm(p_l_seq, **tp)
    p_arm_tm_r = calcTm(p_r_seq, **tp)
    if p_arm_tm_l < p_params['arm_tm_min']:
        if do_print:
            print("\tArm L fail %2.3f" % p_arm_tm_l)
        is_good = False
    report['tm_arm_min_l'] = p_arm_tm_l
    if p_arm_tm_r < p_params['arm_tm_min']:
        if do_print:
            print("\tArm R fail %2.3f" % p_arm_tm_r)
        is_good = False
    report['tm_arm_min_r'] = p_arm_tm_r

    # 4. Check for excluded seqs in the assembled probe (right arm + loop + left arm)
    p_seq = p_r_seq + loop_seq + p_l_seq
    for ex_seq in p_params['exclude_seqs']:
        if ex_seq in p_seq:
            # only the first hit is reported
            report['ex_seq'].append(ex_seq)
            is_good = False
            break

    # 5. Secondary structure / primer dimer checks
    p_het_tm_0 = calcHeterodimerTm(p_l_seq, p_r_seq, **tp)
    p_het_tm_1 = calcHeterodimerTm(p_l_seq, loop_seq, **tp)
    p_het_tm_2 = calcHeterodimerTm(p_r_seq, loop_seq, **tp)
    if p_het_tm_0 > p_params['structure_tm_max']:
        if do_print:
            print("\thetero 0 fail")
        is_good = False
    report['tm_hetero_0'] = p_het_tm_0
    if p_het_tm_1 > p_params['structure_tm_max']:
        if do_print:
            print("\thetero 1 fail")
        is_good = False
    report['tm_hetero_1'] = p_het_tm_1
    if p_het_tm_2 > p_params['structure_tm_max']:
        if do_print:
            print("\thetero 2 fail")
        is_good = False
    report['tm_hetero_2'] = p_het_tm_2
    return is_good, report