Exemplo n.º 1
0
def complex_enough(seq):
    """Return True when ``seq`` passes both the complexity and the GC filter.

    The sequence is upper-cased and rejected when ``lcc_simp`` yields a value
    below 1.25, or when ``GC`` yields a value below 15 or above 95.
    """
    upper_seq = seq.upper()
    # Complexity is tested first, so GC is never computed for rejected input.
    if lcc_simp(upper_seq) < 1.25:
        return False
    gc_value = GC(upper_seq)
    return not (gc_value < 15 or gc_value > 95)
Exemplo n.º 2
0
 def seq_checksums(self, seq_str, exp_crc32, exp_crc64, exp_gcg, exp_seguid,
                   exp_simple_LCC, exp_window_LCC):
     """Assert the checksum and LCC values of ``seq_str`` in three forms.

     The same expectations are checked for the plain string, for a ``Seq``
     and for a ``MutableSeq`` built from it. Exact equality is required for
     the four checksums; the LCC values are compared to two decimal places.
     """
     as_seq = Seq(seq_str, single_letter_alphabet)
     as_mutable = MutableSeq(seq_str, single_letter_alphabet)
     for candidate in (seq_str, as_seq, as_mutable):
         self.assertEqual(exp_crc32, u_crc32(candidate))
         self.assertEqual(exp_crc64, crc64(candidate))
         self.assertEqual(exp_gcg, gcg(candidate))
         self.assertEqual(exp_seguid, seguid(candidate))
         self.assertAlmostEqual(exp_simple_LCC, lcc_simp(candidate), places=2)
         # Windowed LCC is computed with a window size of 20.
         window_values = lcc_mult(candidate, 20)
         self.assertEqual(len(exp_window_LCC), len(window_values))
         for expected, actual in zip(exp_window_LCC, window_values):
             self.assertAlmostEqual(expected, actual, places=2)
def biochemical_properties(sequence: str) -> Dict[str, Any]:
    """Collect a set of computed properties for ``sequence`` into a dict.

    The values come from a ``ProteinAnalysis`` object, from the helpers
    ``h_bonding_percent`` and ``melting_temp``, and from ``lcc.lcc_simp``.
    Keys are human-readable property names.
    """
    protein = ProteinAnalysis(sequence)
    # NOTE(review): the next two objects are built but never read below. They
    # are kept so that whatever their constructors do still happens; confirm
    # whether they can be dropped.
    _descriptor = PyPro.GetProDes(sequence)
    _sequence_obj = Seq(sequence)

    # TODO(Ahmed): Verify that all these calculations are actually returning
    # reasonable values. For example, it says the percent composition of every
    # amino acid is zero when I run
    # calculate_biochem_properties.biochemical_properties('qwertyipasdfghklcvnm')
    properties: Dict[str, Any] = {}
    properties['Isoelectric point'] = protein.isoelectric_point()
    # Unit of the molecular weight is not established here (Daltons? Amu? g/mol?).
    properties['Molecular weight'] = protein.molecular_weight()
    properties['Aromaticity'] = protein.aromaticity()
    properties['Instability index'] = protein.instability_index()
    properties['GRAVY'] = protein.gravy()
    properties['H-bonding percent'] = h_bonding_percent(sequence)
    properties['Melting temp'] = melting_temp(sequence)
    properties['LCC'] = lcc.lcc_simp(sequence)
    return properties
Exemplo n.º 4
0
 def complexity(seq):
     """Return the value ``lcc_simp`` computes for ``seq``."""
     lcc_value = lcc_simp(seq)
     return lcc_value
Exemplo n.º 5
0
 def simple_LCC(s):
     """Return ``lcc_simp(s)`` formatted as text with two decimal places."""
     lcc_value = lcc_simp(s)
     return "%0.2f" % lcc_value
Exemplo n.º 6
0
 def simple_LCC(s):
     """Format the ``lcc_simp`` value of ``s`` as a two-decimal string."""
     lcc_value = lcc_simp(s)
     return "%0.2f" % lcc_value
Exemplo n.º 7
0
def simple_LCC(s):
    """Return the ``lcc_simp`` value of ``s`` as a two-decimal string."""
    lcc_value = lcc_simp(s)
    # Convert explicitly so the printed float looks the same on every platform.
    return "%0.2f" % lcc_value
Exemplo n.º 8
0
def simple_LCC(s):
    """Compute ``lcc_simp(s)`` and render it with two decimal places."""
    # Explicit conversion avoids platform differences in how floats print.
    lcc_value = lcc_simp(s)
    return "%0.2f" % lcc_value
Exemplo n.º 9
0
def findIR(q, args, l_lock, candidates, perc_seq, last_perc_seq):
    """Consume sequence chunks from ``q`` and record inverted-repeat candidates.

    Each queue item is unpacked as ``(seq, seq_fs, split_index, record_id,
    seq_len, count)``. A chunk that passes ``complex_enough`` is aligned with
    ``blastn`` against its own reverse complement. Every reported hit is then
    filtered by alignment length, total span, complexity and the presence of
    a matching pair of short sub-sequences (TSD) at its two ends. Surviving
    hits are stored in ``candidates`` under the key ``"<record_id>_<count>"``.

    Args:
        q: Work queue. The loop ends when ``q.get`` times out after 15 seconds.
        args: Options object. Reads ``jobname``, ``align_min_len``,
            ``mite_max_len``, ``mite_min_len``, ``tsd_max_len``,
            ``tsd_min_len`` and ``FSL``.
        l_lock: Context manager held while ``candidates`` is inspected and
            modified.
        candidates: Mapping from key to a list of candidate dicts; modified
            in place.
        perc_seq: Mapping from record id to the latest computed percentage.
        last_perc_seq: Mapping from record id to the last logged percentage.

    Returns:
        None. Results are delivered through ``candidates`` and ``makelog``.
    """
    while True:
        try:
            seq, seq_fs, split_index, record_id, seq_len, count = q.get(timeout=15)
        except queue.Empty:
            # Nothing arrived within the timeout: this worker is done.
            break
        splited_len = len(seq)
        seq_rc = str(Seq(seq).reverse_complement())
        # Low-complexity chunks are skipped without running BLAST.
        if not complex_enough(seq):
            q.task_done()
            continue

        # Write the chunk and its reverse complement to temporary FASTA files.
        record_q = SeqRecord(Seq(seq), id=record_id)
        record_s = SeqRecord(Seq(seq_rc), id=record_id + "_rc")
        tmp_tag = str(record_id + "_" + str(split_index))
        query_filename = "results/" + args.jobname + "/tmp/query" + tmp_tag + ".tmp"
        subject_filename = "results/" + args.jobname + "/tmp/subject" + tmp_tag + ".tmp"
        SeqIO.write(record_q, query_filename, "fasta")
        SeqIO.write(record_s, subject_filename, "fasta")

        # Options currently disabled: -max_target_seqs 1, -ungapped,
        # -evalue 140, -soft_masking false, -dust no.
        cmd_list = [
            'blastn',
            '-query', query_filename,
            '-subject', subject_filename,
            '-reward', '2',
            '-penalty', '-4',
            '-word_size', '4',
            '-strand', 'plus',
            '-outfmt', "6 sstart send qstart qend score length mismatch gaps gapopen nident",
        ]
        p = Popen(cmd_list, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
        if err:
            print(split_index, record_id, seq_len)
            makelog("BLASTN error: %s" % (err, ))
        os.remove(query_filename)
        os.remove(subject_filename)

        for row in out.splitlines():
            # Column order follows the -outfmt specification above; only the
            # columns that are actually used are parsed.
            fields = row.split()
            sstart = int(fields[0])
            send = int(fields[1])
            qstart = int(fields[2])
            qend = int(fields[3])
            length = int(fields[5])

            # Discard alignments that are too short.
            if length < args.align_min_len:
                continue

            # The subject was the reverse complement, so map its coordinates
            # back onto the forward chunk.
            sstart = splited_len - sstart
            send = splited_len - send

            # Sequences of the two arms of the inverted repeat.
            seq_q = seq[qstart:qend]
            seq_q_prime = seq[send:sstart]

            qstart, qend = min(qstart, qend), max(qstart, qend)
            sstart, send = min(sstart, send), max(sstart, send)

            # Outermost positions covered by the two arms.
            ir_start = min(qstart, qend, sstart, send)
            ir_end = max(qstart, qend, sstart, send)
            ir_len = ir_end - ir_start

            # Length constraints on the whole element.
            if ir_len > args.mite_max_len:
                continue
            if ir_len < args.mite_min_len:
                continue

            # Both arms must be complex enough on their own.
            if not complex_enough(seq_q):
                continue
            if not complex_enough(seq_q_prime):
                continue

            # Look for a TSD just outside the arms, longest first.
            # NOTE(review): positions are shifted by args.FSL when indexing
            # seq_fs; presumably seq_fs is the chunk with FSL extra bases of
            # flanking sequence in front - confirm against the producer.
            valid_tsd = False
            for i in range(args.tsd_max_len, args.tsd_min_len - 1, -1):
                tsd_one = seq_fs[ir_start - i + args.FSL:ir_start + args.FSL]
                tsd_two = seq_fs[ir_end + args.FSL:ir_end + i + args.FSL]
                if tsd_one.lower() == tsd_two.lower():
                    valid_tsd = True
                    mite_pos_one = ir_start + args.FSL
                    mite_pos_two = ir_end + args.FSL
                    tsd_in = 'no'
                    break

            # Otherwise look for a TSD just inside the arms; the element
            # boundaries are then moved inwards by the TSD length.
            if not valid_tsd:
                for i in range(args.tsd_max_len, args.tsd_min_len - 1, -1):
                    tsd_one = seq_fs[ir_start + args.FSL:ir_start + i + args.FSL]
                    tsd_two = seq_fs[ir_end - i + args.FSL:ir_end + args.FSL]
                    if tsd_one.lower() == tsd_two.lower():
                        valid_tsd = True
                        mite_pos_one = ir_start + args.FSL + i
                        mite_pos_two = ir_end + args.FSL - i
                        tsd_in = 'yes'
                        break

            # No TSD found in either position.
            if not valid_tsd:
                continue

            ir_seq = seq_fs[mite_pos_one:mite_pos_two]
            if not complex_enough(ir_seq):
                continue
            ir_len = mite_pos_two - mite_pos_one

            # Both flanks must be available at full length.
            fs_start = max(0, mite_pos_one - args.FSL)
            flanking_seq_left = seq_fs[fs_start:mite_pos_one]
            flanking_seq_right = seq_fs[mite_pos_two:mite_pos_two + args.FSL]
            if len(flanking_seq_right) < args.FSL or len(flanking_seq_left) < args.FSL:
                continue

            # Positions relative to the full sequence.
            mite_start_full = mite_pos_one + split_index - args.FSL
            mite_end_full = mite_pos_two + split_index - args.FSL

            new_element = {
                'start': mite_start_full,
                'end': mite_end_full,
                'seq': ir_seq,
                'record': record_id,
                'mite_len': ir_len,
                'tir1_start': mite_start_full,
                'tir1_end': mite_start_full + length,
                'tir2_start': mite_end_full - length,
                'tir2_end': mite_end_full,
                'tir1_seq': seq_q,
                'tir2_seq': seq_q_prime,
                'tsd': tsd_one,
                'tsd_in': tsd_in,
                'fs_left': flanking_seq_left,
                'fs_right': flanking_seq_right,
                'tir_len': length,
            }
            with l_lock:
                index = "%s_%i" % (record_id, count)
                # Overlapping TIRs are not wanted: keep the broader element.
                is_nested = False
                has_nested = False
                # Compare against this bucket and its two neighbours.
                for curr_count in range(count - 1, count + 2):
                    curr_index = "%s_%i" % (record_id, curr_count)
                    if curr_index not in candidates:
                        continue
                    # Iterate over a snapshot: removing from the list that is
                    # being iterated would skip the element after each removal.
                    for candidate in list(candidates[curr_index]):
                        same_record = new_element['record'] == candidate['record']
                        # The new element starts and ends inside the arms of
                        # an existing candidate.
                        if (same_record
                                and candidate['tir1_start'] <= new_element['start'] <= candidate['tir1_end']
                                and candidate['tir2_start'] <= new_element['end'] <= candidate['tir2_end']):
                            is_nested = True
                        # An existing candidate starts and ends inside the
                        # arms of the new element: drop the existing one.
                        if (same_record
                                and new_element['tir1_start'] <= candidate['start'] <= new_element['tir1_end']
                                and new_element['tir2_start'] <= candidate['end'] <= new_element['tir2_end']):
                            has_nested = True
                            candidates[curr_index].remove(candidate)
                if has_nested or not is_nested:
                    if index not in candidates:
                        candidates[index] = []
                    candidates[index].append(new_element)

            # Progress logging, throttled to steps of at least 10 percent.
            # NOTE(review): this runs only for hits that reach this point,
            # not once per chunk - confirm that is intended.
            curr_perc = int(split_index * 100 / seq_len)

            if record_id not in perc_seq or record_id not in last_perc_seq:
                perc_seq[record_id] = curr_perc
                last_perc_seq[record_id] = curr_perc
                makelog(record_id + " " + str(curr_perc) + "%")

            if perc_seq[record_id] - last_perc_seq[record_id] >= 10:
                makelog(record_id + " " + str(curr_perc) + "%")
                last_perc_seq[record_id] = curr_perc

            perc_seq[record_id] = curr_perc

        q.task_done()
Exemplo n.º 10
0
 def test_lcc_simp(self):
     """Check ``lcc_simp`` on a fixed 8-letter sequence to four decimals."""
     expected_lcc = 0.9528
     observed_lcc = lcc_simp("ACGATAGC")
     self.assertAlmostEqual(observed_lcc, expected_lcc, places=4)
Exemplo n.º 11
0
#!/usr/bin/env python3
# Print one "<record id>\t<LCC value>" line for every record of a FASTA file.

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('fasta', type=argparse.FileType('r'))
args = parser.parse_args()

# Imported after the command line has been parsed, so a usage error is
# reported before the Bio imports are attempted.
from Bio import SeqIO
from Bio.SeqUtils import lcc

row_template = "{}\t{}"
for entry in SeqIO.parse(args.fasta, 'fasta'):
    print(row_template.format(entry.id, lcc.lcc_simp(entry.seq)))
Exemplo n.º 12
0
import sys
from Bio import AlignIO
from Bio import SeqIO
from Bio.SeqUtils import GC
import numpy as np
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult

# Alternative: take the list file from the command line (infile = sys.argv[1]).

infile = 'goodalignments.txt'

# Each line of the list file names one FASTA alignment to process.
with open(infile) as f:
    reflist = f.read().splitlines()

for i in reflist:
    # The first three rows of each alignment are measured.
    fasta = AlignIO.read(i, "fasta")
    lcc_values_lv = lcc_simp(fasta[0].seq)
    lcc_values_He = lcc_simp(fasta[1].seq)
    lcc_values_Ht = lcc_simp(fasta[2].seq)
    # Append one tab-separated row per alignment. The context manager closes
    # the handle on every iteration instead of leaking one per input file.
    with open('LCC_content.tab', 'a') as LCC_content:
        LCC_content.write('%s\t%.2f\t%.2f\t%.2f\n' %
                          (i, lcc_values_lv, lcc_values_He, lcc_values_Ht))