def __init__(self, prot_sequence, monoisotopic=False):
    """Store the protein sequence and reset the cached composition data.

    Args:
        prot_sequence: Protein sequence as a string of one-letter codes.
            Any mix of upper and lower case is accepted.
        monoisotopic: Stored unchanged on the instance as
            ``self.monoisotopic``.
    """
    # Upper-case unconditionally. The previous ``islower()`` test is False
    # for mixed-case input such as "AcDe", which left lowercase residues in
    # the stored sequence.
    self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
    # Composition caches start empty (None).
    self.amino_acids_content = None
    self.amino_acids_percent = None
    self.length = len(self.sequence)
    self.monoisotopic = monoisotopic
def run_minimus(fasta, outroot=None, restore_singletons=True, contig_prefix='', qual=None):
    '''given a fasta file and an optional output root (otherwise use fasta base)
    generates an assembly using minimus from the amos package

    assembly saved as <outroot>.minimus.fasta
    optionally restores singleton reads in <outroot>.all.fasta

    fasta: path to the input reads
    outroot: prefix for every generated file; defaults to fasta without
        its last extension
    restore_singletons: if True, reads not named in the contig file are
        written together with the contigs to <outroot>.all.fasta
    contig_prefix: string inserted in front of every contig name
    qual: optional quality file handed to toAmos with -q

    if restore_singletons=True, returns path to .all.fasta,
    otherwise returns path to minimus.fasta'''
    if outroot is None:
        outroot = fasta.rsplit('.', 1)[0]

    # NOTE(review): the commands are interpolated into shell strings, so
    # paths containing spaces or shell metacharacters will break; exit
    # statuses are not checked.
    if qual:
        sys.stderr.write('qualities invoked (%s)\n' % qual)
        os.system('toAmos -s %s -q %s -o %s.afg' % (fasta, qual, outroot))
    else:
        os.system('toAmos -s %s -o %s.afg' % (fasta, outroot))
    os.system('minimus -D TGT=%s.afg %s.minimus' % (outroot, outroot))

    contig_file = outroot + '.minimus.contig'
    minimus_fasta = outroot + '.minimus.fasta'

    if contig_prefix:
        _prefix_marked_lines(contig_file, '##', contig_prefix)

    if restore_singletons:
        # Raw string so that \( is a regex escape, not a string escape.
        with open(contig_file) as handle:
            in_assem = re.findall(r'#(.+?)\(', handle.read())
        reads = Seq.Fasta(fasta)
        for read_name in in_assem:
            del reads[read_name]
        all_fasta = outroot + '.all.fasta'
        assem = Seq.Fasta(minimus_fasta)
        allseq = Seq.Fasta()
        allseq.update(dict((contig_prefix + k, v) for k, v in assem.items()))
        allseq.update(reads)
        allseq.write_to_file(all_fasta)
        return all_fasta

    if contig_prefix:
        _prefix_marked_lines(minimus_fasta, '>', contig_prefix)
    return minimus_fasta


def _prefix_marked_lines(path, marker, prefix):
    '''rewrite path in place, inserting prefix right after marker on every
    line that starts with marker; other lines are copied unchanged'''
    with open(path) as handle:
        lines = handle.readlines()
    with open(path, 'w') as handle:
        for line in lines:
            if line.startswith(marker):
                handle.write(marker + prefix + line[len(marker):])
            else:
                handle.write(line)
def trim_align(seqlist, align_data):
    """Return trimmed deep copies of the sequences found in align_data.

    Assumes align_data is keyed by seq.getName(). Each entry must provide
    'al_start', 'al_stop' and 'fa_frame'. The kept region is
    seq[al_start - 1:al_stop], i.e. al_start is treated as 1-based and
    al_stop as inclusive. When 'fa_frame' is 'r' the trimmed region is
    reverse-complemented with Seq.reverse_complement.

    Sequences whose name is missing from align_data are logged and skipped.
    The input objects are not modified; each result is a deepcopy whose
    sequence was replaced through setSeq().
    """
    trimmed_seqs = []
    for seq in seqlist:
        name = seq.getName()

        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if name not in align_data:
            log.info('the sequence %s was not found in the alignment data', name)
            continue

        these_results = align_data[name]
        start = these_results['al_start']
        stop = these_results['al_stop']
        substr = seq[start - 1:stop]
        log.debug('name: %s start: %s stop: %s', name, start, stop)

        if these_results['fa_frame'] == 'r':
            substr = Seq.reverse_complement(substr)
            log.debug('seq %s frame=%s, reverse complementing seq:\n%s',
                      name, these_results['fa_frame'], substr)

        newseq = copy.deepcopy(seq)
        newseq.setSeq(substr)
        trimmed_seqs.append(newseq)

    return trimmed_seqs
def test_find_exec(self):
    """Seq.find_exec('ls') must return a truthy path that runs with exit status 0."""
    cmd = "ls"
    path = Seq.find_exec(cmd)
    # assertTrue raises on a falsy path, so no extra guard is needed below.
    self.assertTrue(path)
    # Close the devnull handle deterministically instead of leaking it.
    with open(os.devnull, "w") as devnull:
        out = subprocess.call([path], stdout=devnull)
    # assertEqual reports the actual exit status on failure.
    self.assertEqual(out, 0)
def ungap(seq):
    """Return a new Seq.Seq holding seq.data without the gap characters.

    The gap symbol is read from seq.gap_char and the result is built with
    seq.alphabet.alphabet as its alphabet.
    """
    # TODO - Fix this? It currently assumes the outmost AlphabetEncoder
    # is for the gap. Consider HasStopCodon(Gapped(Protein())) as a test case.
    gap_symbol = seq.gap_char
    kept = "".join(residue for residue in seq.data if residue != gap_symbol)
    return Seq.Seq(kept, seq.alphabet.alphabet)
def main():
    """Build a Cal object for a fixed sequence pair and print its Tm and DeltaG."""
    # The earlier 'TATACTTT' / Seq.complement() pair was overwritten before
    # use, so only the pair that actually reaches Cal is kept.
    qseq = 'GGACTGACG'
    sseq = 'CCTGGCTGC'

    # Name each concentration once and pass the names through; previously
    # these locals were assigned but the literals were repeated in the call.
    mono = 50
    diva = 1.5
    oligo = 50
    dntp = 0.25

    seq = Cal(qseq, sseq, mono_conc=mono, diva_conc=diva,
              oligo_conc=oligo, dntp_conc=dntp)

    # Two spaces after the colon reproduce the original "print 'Tm: ', x"
    # output; the one-element tuple keeps %-formatting safe for any value.
    print('Tm:  %s' % (seq.Tm,))
    print('DeltaG:  %s' % (seq.DeltaG,))
def reduce_sequence(seq, reduction_table, new_alphabet=None):
    """Given an amino-acid sequence, return it in reduced alphabet form
    based on the letter-translation table passed. Some "standard" tables
    are in Alphabet.Reduced.

    seq: a Seq.Seq type sequence
    reduction_table: a dictionary whose keys are the "from" alphabet, and
        values are the "to" alphabet
    new_alphabet: alphabet object attached to the result. When None, a
        copy of Alphabet.single_letter_alphabet is used, with ``letters``
        set to the concatenated table keys and ``size`` to their count.

    Returns a new Seq.Seq. Raises KeyError for a letter of ``seq`` that is
    not a key of ``reduction_table``.
    """
    if new_alphabet is None:
        import copy

        # Work on a private copy: assigning letters/size directly on
        # Alphabet.single_letter_alphabet would alter that shared
        # module-level object for every other user.
        new_alphabet = copy.copy(Alphabet.single_letter_alphabet)
        # NOTE(review): letters are taken from the table *keys* (the "from"
        # alphabet), as before; confirm whether the values were intended.
        new_alphabet.letters = ''.join(reduction_table)
        new_alphabet.size = len(new_alphabet.letters)

    # Translate in one pass and build the Seq once, instead of growing a
    # Seq object letter by letter.
    reduced = ''.join(reduction_table[letter] for letter in seq)
    return Seq.Seq(reduced, new_alphabet)
import sys import os import unittest import logging import config import Seq log = logging module_name = os.path.split(sys.argv[0])[1].rstrip('.py') outputdir = config.outputdir datadir = config.datadir clustalw_path = Seq.find_exec('clustalw') class TestClustalwInstalled(unittest.TestCase): def test1(self): if not clustalw_path: log.error('clustalw could not be found - skipping tests in this module') if clustalw_path is not None: class TestRunClustalw(unittest.TestCase): def setUp(self): self.file1 = os.path.join(datadir, '10patients.fasta') self.funcname = '_'.join(self.id().split('.')[-2:]) self.outfile = os.path.join(outputdir,self.funcname) def test1(self):
csv_writer = writer(write_obj) csv_writer.writerow(rowy) return [frame, rowx] def nothing(val): pass # # # cv2.namedWindow('image') cv2.createTrackbar('threshold', 'image', 42, 255, nothing) vs = VideoCapture.MyVideoCapture() sp = seqpose.SEQP() et = EyeTracker(vid=vs, seqp=sp) s = Seq.SEQ() while True: thresh_val = cv2.getTrackbarPos('threshold', 'image') et.pupil_thresh = thresh_val # print(pyautogui.position()) save = win32api.GetAsyncKeyState(0x20) # start_time = time.time() f = et.mainloop(save) if len(f) != 0: print(np.shape(f[1])) p = s.predict(np.array([f[1]])) x = p[0][0] * 1919 y = p[0][1] * 1079 pyautogui.moveTo(x, y) print("model ", x, y) # print("real ",pyautogui.position().x,pyautogui.position().y)
import os
import subprocess


class RfamSearch():
    """Run the ``cmscan`` command-line program against ``Rfam.cm``.

    The query is written to QUERY_PATH, cmscan is run with RFAM_DIR as its
    working directory, and its standard output is captured in OUTPUT_PATH.
    """

    # Directory that must contain Rfam.cm; cmscan runs with this as cwd.
    RFAM_DIR = '/home/magnus/work/rfamdb'
    # Fixed scratch files (shared by all callers on the machine).
    QUERY_PATH = '/tmp/ss.fa'
    OUTPUT_PATH = '/tmp/cmscan.txt'

    def __init__(self):
        pass

    def cmscan(self, seq):
        """Scan ``seq`` with cmscan and return the program's text output.

        seq: object whose ``seq`` attribute is the sequence string; the
            object itself is printed first.

        The output is also stored on ``self.output``.
        """
        print(seq)

        # make tmp file
        with open(self.QUERY_PATH, 'w') as query:
            query.write('>test\n')
            query.write(seq.seq)

        # subprocess.call blocks until cmscan exits, so the output file is
        # complete before it is read (Popen returned immediately). ``cwd``
        # replaces the chdir/chdir-back pair, which was not restored when
        # an exception occurred in between.
        with open(self.OUTPUT_PATH, 'w') as out:
            subprocess.call(
                ['cmscan', '-E', '1', 'Rfam.cm', self.QUERY_PATH],
                stdout=out, cwd=self.RFAM_DIR)

        with open(self.OUTPUT_PATH) as result:
            self.output = result.read()
        return self.output


#main
if __name__ == '__main__':
    import Seq
    seq = Seq.Seq("GGCGCGGCACCGUCCGCGGAACAAACGG")
    rs = RfamSearch()
    rs.cmscan(seq)
import sys import os import unittest import logging import config import Seq log = logging module_name = os.path.split(sys.argv[0])[1].rstrip('.py') outputdir = config.outputdir datadir = config.datadir hmmbuild_path = Seq.find_exec('hmmbuild') class TestHmmerInstalled(unittest.TestCase): def test1(self): if hmmbuild_path is None: log.error('hmmer software could not be found - skipping tests in this module') if hmmbuild_path is not None: class TestRunHmmer(unittest.TestCase): def setUp(self): self.file1 = os.path.join(datadir, 's_trimmed.aln') self.funcname = '_'.join(self.id().split('.')[-2:]) self.outfile = os.path.join(outputdir,self.funcname) def test1(self):
import os
import sys
import unittest
import logging
import pprint

import config
import Seq

log = logging

# Script file name without its '.py' suffix. str.rstrip('.py') strips any
# trailing '.', 'p' or 'y' characters (e.g. "test_copy.py" -> "test_co"),
# so only the literal suffix is removed here. ``sys`` was previously used
# without being imported.
module_name = os.path.split(sys.argv[0])[1]
if module_name.endswith('.py'):
    module_name = module_name[:-len('.py')]

outputdir = config.outputdir
datadir = config.datadir

# None when the fasta35 executable cannot be located.
fasta_path = Seq.find_exec('fasta35')


class TestFastaInstalled(unittest.TestCase):
    """Report (via the log) when the fasta35 executable is unavailable."""

    def test1(self):
        # Only logs; this test never fails.
        if fasta_path is None:
            log.error('fasta35 could not be found - skipping tests in this module')


# The tests below are only defined when fasta35 was found.
if fasta_path is not None:
    class TestRunFasta(unittest.TestCase):
        """Tests that need the fasta35 executable."""

        def setUp(self):
            self.file1 = os.path.join(datadir, '10patients.fasta')
            # "<ClassName>_<methodName>", taken from the unittest test id
            self.funcname = '_'.join(self.id().split('.')[-2:])
            self.outfile = os.path.join(outputdir, self.funcname)
class ProteinAnalysis(object):
    """Class containing methods for protein analysis.

    The constructor takes two arguments.
    The first is the protein sequence as a string, which is then converted to a
    sequence object using the Bio.Seq module. This is done just to make sure
    the sequence is a protein sequence and not anything else. The sequence is
    upper-cased before it is stored.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to False (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.
    """

    def __init__(self, prot_sequence, monoisotopic=False):
        # Upper-case unconditionally: the former islower() test is False for
        # mixed-case input ("AcDe"), which left lowercase residues that the
        # per-letter tables below do not know.
        self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)
        self.monoisotopic = monoisotopic

    def count_amino_acids(self):
        """Count standard amino acids, returns a dict.

        Counts the number times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number}.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        """
        if self.amino_acids_content is None:
            self.amino_acids_content = dict(
                (aa, self.sequence.count(aa))
                for aa in IUPACData.protein_letters)
        return self.amino_acids_content

    def get_amino_acids_percent(self):
        """Calculate the amino acid content as fractions of the sequence.

        The same as count_amino_acids, except that each count is divided by
        the sequence length. The values are therefore fractions between 0
        and 1 (they are not multiplied by 100).
        Returns a dictionary of {AminoAcid:fraction}.

        The return value is cached in self.amino_acids_percent.

        Raises ZeroDivisionError for an empty sequence.
        """
        if self.amino_acids_percent is None:
            aa_counts = self.count_amino_acids()
            total = float(self.length)
            self.amino_acids_percent = dict(
                (aa, aa_counts[aa] / total) for aa in aa_counts)
        return self.amino_acids_percent

    def molecular_weight(self):
        """Calculate MW from Protein sequence"""
        # make local dictionary for speed
        if self.monoisotopic:
            water = 18.01
            iupac_weights = IUPACData.monoisotopic_protein_weights
        else:
            iupac_weights = IUPACData.protein_weights
            water = 18.02

        # remove a molecule of water from the amino acid weight
        aa_weights = dict((aa, iupac_weights[aa] - water)
                          for aa in iupac_weights)

        total_weight = water  # add just one water molecule for the whole sequence
        for aa in self.sequence:
            total_weight += aa_weights[aa]
        return total_weight

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        aa_percentages = self.get_amino_acids_percent()
        return sum(aa_percentages[aa] for aa in 'YWF')

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).
        """
        index = ProtParamData.DIWV
        score = 0.0
        # Sum the table value of every pair of adjacent residues.
        for i in range(self.length - 1):
            first, second = self.sequence[i:i + 2]
            score += index[first][second]
        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for a
        window=9. The parameters used are optimized for determining the flexibility.

        Returns a list with one score for every full 9-residue window of the
        sequence (an empty list when the sequence is shorter than 9).
        """
        flexibilities = ProtParamData.Flex
        window_size = 9
        # weights[0..3] apply to the four residue pairs from the window edge
        # inwards; the last entry (1) is the weight of the central residue.
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        half = window_size // 2
        scores = []

        # "+ 1" so that the last full window is scored as well, matching
        # protein_scale(); without it a 9-residue sequence gave no score.
        for i in range(self.length - window_size + 1):
            subsequence = self.sequence[i:i + window_size]
            score = 0.0

            for j in range(half):
                front = subsequence[j]
                back = subsequence[window_size - j - 1]
                score += (flexibilities[front] + flexibilities[back]) * weights[j]

            # The centre of a 9-residue window is index 4. The previous
            # index (window_size // 2 + 1) counted residue 5 twice and
            # never looked at residue 4.
            middle = subsequence[half]
            score += flexibilities[middle]

            # 5.25 is the sum of all nine weights: 2 * 2.125 + 1.
            scores.append(score / 5.25)

        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle."""
        total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)
        return total_gravy / self.length

    def _weight_list(self, window, edge):
        """Makes a list of relative weight of the
        window edges compared to the window center. The weights are linear.
        it actually generates half a list. For a window of size 9 and edge 0.4
        you get a list of [0.4, 0.55, 0.7, 0.85].

        A window narrower than 2 has no edge positions and yields [].
        """
        half = window // 2
        if half == 0:
            # window == 1 used to divide by (window - 1) == 0.
            return []
        unit = 2 * (1.0 - edge) / (window - 1)
        return [edge + unit * i for i in range(half)]

    def protein_scale(self, param_dict, window, edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each type of
        amino acid. The most frequently used scales are the hydrophobicity or
        hydrophilicity scales and the secondary structure conformational parameters
        scales, but many other scales exist which are based on different chemical and
        physical properties of the amino acids.  You can set several parameters that
        control the computation  of a scale profile, such as the window size and the
        window edge relative weight value.

        WindowSize: The window size is the length
        of the interval to use for the profile computation. For a window size n, we
        use the i-(n-1)/2 neighboring residues on each side to compute
        the score for residue i. The score for residue i is the sum of the scaled values
        for these amino acids, optionally weighted according to their position in the
        window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the same
        weight, but you can make the residue at the center of the window  have a
        larger weight than the others by setting the edge value for the  residues at
        the beginning and end of the interval to a value between 0 and 1. For
        instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
        1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to
        view the change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Residues missing from param_dict contribute nothing to a window's
        score; a warning is written to sys.stderr for each occurrence.

        Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
        """
        # generate the weights
        #   _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4]
        #   what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done
        #   in the loop.
        weights = self._weight_list(window, edge)
        scores = []

        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weights) * 2 + 1

        for i in range(self.length - window + 1):
            subsequence = self.sequence[i:i + window]
            score = 0.0

            for j in range(window // 2):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
                try:
                    front = param_dict[subsequence[j]]
                    back = param_dict[subsequence[window - j - 1]]
                    score += weights[j] * front + weights[j] * back
                except KeyError:
                    sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
                                     (subsequence[j], subsequence[window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[window // 2]
            if middle in param_dict:
                score += param_dict[middle]
            else:
                sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (middle))

            scores.append(score / sum_of_weights)

        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        aa_content = self.count_amino_acids()  # dictionary: {AA:number present}

        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return ie_point.pi()

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Returns the fraction of amino acids which tend
        to be in Helix, Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three floats (Helix, Turn, Sheet), each the sum of
        the per-residue fractions from get_amino_acids_percent().
        """
        aa_percentages = self.get_amino_acids_percent()

        helix = sum(aa_percentages[r] for r in 'VIYFWL')
        turn = sum(aa_percentages[r] for r in 'NPGS')
        sheet = sum(aa_percentages[r] for r in 'EMAL')

        return helix, turn, sheet
def build_fasta_from_scaff_gff(infasta_s, gff, contig_prefix='', include_singletons=True, ol_minID=0.9, outfile=None, mum_len='4'):
    '''takes scaffolding information from gff of the form generated by get_scaff_from_minimus
    builds a single assembly for all scaffold instructions pertaining to seqids in infasta

    infasta_s is either a path (str), loaded with Seq.Fasta, or an existing
    fasta object, which is deep-copied and left untouched

    if include_singletons is True, adds all sequences from infasta not included in scaffolds
    along with the scaffolded sequence in the returned assembly

    ol_minID is the minimum %ID accepted for overlaps in contigs
    mum_len is passed to Aln.mum as its '-l' argument
    if outfile is given the assembly is also written there

    returns the assembly (a Seq.Fasta)
    raises IndexError when one contig needs more split fragments than there
    are suffixes
    '''
    if isinstance(infasta_s, str):
        infasta = Seq.Fasta(infasta_s)
    else:
        infasta = deepcopy(infasta_s)

    suffixes = ['', 'b', 'c', 'd', 'e', 'f']
    current_suffix = ''

    #use only scaffolding info relevant to the specified infasta
    in_ids = set(infasta.seq_names())
    this_gff = [r for r in gff if r['seqid'] in in_ids]

    #use only scaffolding info that joins 2 or more seqs
    contigs = {}.fromkeys([r['attribute_contig'] for r in this_gff], 0)
    for r in this_gff:
        contigs[r['attribute_contig']] += 1

    #get final ordered scaffolding layout
    this_gff = sorted(
        [r for r in this_gff if contigs[r['attribute_contig']] > 1],
        key=lambda r: (r['attribute_contig'], int(r['attribute_cstart']), int(r['attribute_cend'])))

    #extract sequences and orient for scaffolding
    assem_frags = infasta.substr_from_gff(this_gff, plus_strand=True, name_key=None)

    assem = Seq.Fasta()
    for k, v in contigs.items():
        if v > 1:
            assem[contig_prefix + k] = Seq.Sequence('')

    if this_gff:
        for i, r in enumerate(this_gff[:-1]):
            # 'nxt' rather than 'next', which shadowed the builtin
            nxt = this_gff[i + 1]
            frag = assem_frags[r['seqid']]
            key = contig_prefix + r['attribute_contig'] + current_suffix

            if r['attribute_contig'] == nxt['attribute_contig']:
                s1, e1, s2, e2 = [
                    int(n) for n in [
                        r['attribute_cstart'], r['attribute_cend'],
                        nxt['attribute_cstart'], nxt['attribute_cend']
                    ]
                ]
                print >> sys.stderr, 'scaffolding %s %s %s %s:\n\t%s\n\t%s' % (
                    s1, e1, s2, e2, r, nxt)

                if e1 > s2:
                    # this fragment runs past the start of the next one
                    tail = frag[(s2 - s1):]
                    head = assem_frags[nxt['seqid']][:(e1 - s2 + 1)]
                    print >> sys.stderr, 'OVERLAP:\n\t%s\n\t%s' % (tail, head)
                    fa1 = Seq.Fasta()
                    fa2 = Seq.Fasta()
                    fa1['seq1'] = tail
                    fa2['seq2'] = head
                    shorter = min(len(fa1['seq1']), len(fa2['seq2']))
                    mums = Aln.mum(fa1, fa2, mumargs={'-l': '%s' % int(mum_len)})[0]
                    match = float(sum([mumr['score'] for mumr in mums]))

                    # the first test also protects match/shorter from shorter == 0
                    if (shorter <= 2*int(mum_len) + math.ceil((1-ol_minID)*shorter)) or \
                       (match/shorter >= ol_minID) or \
                       (fa1['seq1'][:shorter] == fa2['seq2'][:shorter]) or \
                       (Seq.is_simple(fa1['seq1']) or Seq.is_simple(fa2['seq2'])):
                        # NOTE(review): the overlap itself is frag[(s2 - s1):]
                        # but the part kept here is frag[:s2]; confirm the
                        # coordinate convention - frag[:(s2 - s1)] may have
                        # been intended.
                        assem[key] += frag[:s2]
                    else:
                        #implement record of splitting into a/b/etc fragments!
                        print >> sys.stdout, fa1, '\n', fa2, '\n', mums
                        suffix_pos = suffixes.index(current_suffix) + 1
                        if suffix_pos >= len(suffixes):
                            # same exception type as the bare list lookup
                            # raised, now with an explanation
                            raise IndexError(
                                'more than %d unresolved overlaps in contig %s' %
                                (len(suffixes) - 1, r['attribute_contig']))
                        current_suffix = suffixes[suffix_pos]
                        print >> sys.stderr, 'overlap of %s bp %0.2f %%id unresolved (min %0.2f)\nstarting %s' % (
                            e1 - s2, match / (e1 - s2), ol_minID, current_suffix)
                        # this fragment starts the new, suffixed contig
                        assem[contig_prefix + r['attribute_contig'] + current_suffix] = frag
                else:
                    # pad the gap up to the next fragment with 'n'
                    spacer = Seq.Sequence('n' * (s2 - e1))
                    assem[key] += frag + spacer
            else:
                # last fragment of its contig
                assem[key] += frag
                current_suffix = ''

        # the loop stops one short; append the final fragment
        assem[contig_prefix + this_gff[-1]['attribute_contig'] + current_suffix] += assem_frags[this_gff[-1]['seqid']]

    if include_singletons:
        # build the id set once instead of a fresh list for every sequence
        scaffolded_ids = set(r['seqid'] for r in this_gff)
        singletons = dict((k, v) for k, v in infasta.items()
                          if k not in scaffolded_ids)
        assem.update(singletons)

    if outfile:
        assem.write_to_file(outfile)

    return assem
#########
######### Test Seq.py
#########

sys.path.append("/Share/home/zhangqf8/lipan/python_utils/PyPsBL")
import Seq

#####################
# reverse_comp(sequence)
#####################

# Inputs in order: plain, mixed case, with '-' characters, with N runs.
for _sequence in (
        "TAGCTAGCTGGTTAGTTCTATC",
        "TAGCTAatgcatTAGTTCTATC",
        "TAGCTAGCT---TAGTTC--TC",
        "TANNNNNNNNGTTAGTTCTATC"):
    print(Seq.reverse_comp(_sequence))

#####################
# flat_seq(sequence, lineLen=60)
#####################

import General
seqFn = "test_seq.fasta"
fasta = General.load_fasta(seqFn, rem_tVersion=False)

# First without lineLen, then with two explicit lineLen values.
_transcript = fasta['ENST00000580210.5']
print(Seq.flat_seq(_transcript))
for _width in (10, 100):
    print(Seq.flat_seq(_transcript, lineLen=_width))
print(Seq.flat_seq("ACAGATTGTT"))

#####################
# format_gene_type(gene_type)
def aTest(self, filename, expectedValue):
    """Check that a fresh Seq reports expectedValue from solutionExiste().

    The Seq instance is prepared by calling chargerGrille(filename) and
    then trouvePuits() before solutionExiste() is queried. (The method
    names are French: "load grid", "find wells", "solution exists".)
    """
    solver = Seq()
    solver.chargerGrille(filename)
    solver.trouvePuits()
    verdict = solver.solutionExiste()
    self.assertEqual(verdict, expectedValue)
def build_fasta_from_scaff_gff(infasta_s,gff,contig_prefix='',include_singletons=True,ol_minID=0.9,outfile=None,mum_len='4'):
    '''takes scaffolding information from gff of the form generated by get_scaff_from_minimus
    builds a single assembly for all scaffold instructions pertaining to seqids in infasta

    infasta_s is either a path (str), loaded with Seq.Fasta, or an existing
    fasta object, which is deep-copied and left untouched

    if include_singletons is True, adds all sequences from infasta not included in scaffolds
    along with the scaffolded sequence in the returned assembly

    ol_minID is the minimum %ID accepted for overlaps in contigs
    mum_len is passed to Aln.mum as its '-l' argument
    if outfile is given the assembly is also written there

    returns the assembly (a Seq.Fasta)
    raises IndexError when one contig needs more split fragments than there
    are suffixes
    '''
    if isinstance(infasta_s,str):
        infasta = Seq.Fasta(infasta_s)
    else:
        infasta = deepcopy(infasta_s)

    suffixes = ['','b','c','d','e','f']
    current_suffix = ''

    #use only scaffolding info relevant to the specified infasta
    in_ids = set(infasta.seq_names())
    this_gff = [r for r in gff if r['seqid'] in in_ids]

    #use only scaffolding info that joins 2 or more seqs
    contigs = {}.fromkeys([r['attribute_contig'] for r in this_gff],0)
    for r in this_gff:
        contigs[r['attribute_contig']] += 1

    #get final ordered scaffolding layout
    this_gff = sorted([r for r in this_gff if contigs[r['attribute_contig']] > 1],
                      key=lambda r: (r['attribute_contig'],int(r['attribute_cstart']),int(r['attribute_cend'])))

    #extract sequences and orient for scaffolding
    assem_frags = infasta.substr_from_gff(this_gff,plus_strand=True,name_key=None)

    assem = Seq.Fasta()
    for k,v in contigs.items():
        if v>1:
            assem[contig_prefix+k] = Seq.Sequence('')

    if this_gff:
        for i,r in enumerate(this_gff[:-1]):
            # 'nxt' rather than 'next', which shadowed the builtin
            nxt = this_gff[i+1]
            frag = assem_frags[r['seqid']]
            key = contig_prefix+r['attribute_contig']+current_suffix

            if r['attribute_contig'] == nxt['attribute_contig']:
                s1,e1,s2,e2 = [int(n) for n in [r['attribute_cstart'],r['attribute_cend'],nxt['attribute_cstart'],nxt['attribute_cend']]]
                print >> sys.stderr, 'scaffolding %s %s %s %s:\n\t%s\n\t%s' % (s1,e1,s2,e2,r,nxt)

                if e1 > s2:
                    # this fragment runs past the start of the next one
                    tail = frag[(s2-s1):]
                    head = assem_frags[nxt['seqid']][:(e1-s2+1)]
                    print >> sys.stderr, 'OVERLAP:\n\t%s\n\t%s' % (tail,head)
                    fa1 = Seq.Fasta()
                    fa2 = Seq.Fasta()
                    fa1['seq1'] = tail
                    fa2['seq2'] = head
                    shorter = min(len(fa1['seq1']),len(fa2['seq2']))
                    mums = Aln.mum(fa1,fa2,mumargs={'-l':'%s' % int(mum_len)})[0]
                    match = float(sum([mumr['score'] for mumr in mums]))

                    # the first test also protects match/shorter from shorter == 0
                    if (shorter <= 2*int(mum_len) + math.ceil((1-ol_minID)*shorter)) or \
                       (match/shorter >= ol_minID) or \
                       (fa1['seq1'][:shorter] == fa2['seq2'][:shorter]) or \
                       (Seq.is_simple(fa1['seq1']) or Seq.is_simple(fa2['seq2'])):
                        # NOTE(review): the overlap itself is frag[(s2-s1):]
                        # but the part kept here is frag[:s2]; confirm the
                        # coordinate convention - frag[:(s2-s1)] may have
                        # been intended.
                        assem[key] += frag[:s2]
                    else:
                        #implement record of splitting into a/b/etc fragments!
                        print >> sys.stdout,fa1,'\n',fa2,'\n',mums
                        suffix_pos = suffixes.index(current_suffix)+1
                        if suffix_pos >= len(suffixes):
                            # same exception type as the bare list lookup
                            # raised, now with an explanation
                            raise IndexError('more than %d unresolved overlaps in contig %s' %
                                             (len(suffixes)-1,r['attribute_contig']))
                        current_suffix = suffixes[suffix_pos]
                        print >>sys.stderr, 'overlap of %s bp %0.2f %%id unresolved (min %0.2f)\nstarting %s' % (e1-s2,match/(e1-s2),ol_minID,current_suffix)
                        # this fragment starts the new, suffixed contig
                        assem[contig_prefix+r['attribute_contig']+current_suffix] = frag
                else:
                    # pad the gap up to the next fragment with 'n'
                    spacer = Seq.Sequence('n'*(s2-e1))
                    assem[key] += frag + spacer
            else:
                # last fragment of its contig
                assem[key] += frag
                current_suffix = ''

        # the loop stops one short; append the final fragment
        assem[contig_prefix+this_gff[-1]['attribute_contig']+current_suffix] += assem_frags[this_gff[-1]['seqid']]

    if include_singletons:
        # build the id set once instead of a fresh list for every sequence
        scaffolded_ids = set(r['seqid'] for r in this_gff)
        singletons = dict((k,v) for k,v in infasta.items() if k not in scaffolded_ids)
        assem.update(singletons)

    if outfile:
        assem.write_to_file(outfile)

    return assem
#!/usr/bin/env python #-*- coding:utf-8 -*- from PyQt4.QtGui import * from PyQt4.QtCore import * from Seq import * PDPI = 0.0 DPI = 0.0 DPMM = 0.0 symbols = dict() icons = dict() shortcuts = dict() defaultTextStyles = [] docName = QString() dataPath = QString() mscoreGlobalShare = QString("share\\") mscore = 0 gscore = 0 seq = Seq() recentScores = QStringList() revision = QString() instrumentGroups = list() articulation = list() actions = dict()
def run(query, target, e_val=10, outfile=None, fastapath=None, format=10, cleanup=True):
    """Returns a dict keyed by (seqname1,seqname2) pairs containing
    alignment data.

    * query - Seq object or filename of fasta format sequences
    * target - list of Seq objects or filename of fasta format sequences
    * e_val - (see FASTA3* documentation). NOTE(review): accepted but not
      currently passed to the command line (no -E option is emitted);
      confirm the intended behaviour before wiring it in.
    * outfile - name of output file; when omitted, a random name inside
      TEMPDIR is used. Input files that have to be written are placed in
      the same directory as the output file.
    * fastapath - name of directory containing fasta3* executable
    * format - (see FASTA3* documentation)
    * cleanup - if True, delete the fasta output file and any input file
      written by this call

    Every alignment entry gets 'file_q', 'file_t' and 'file_out' keys with
    the paths used, or None for all three when cleanup is True.

    Raises OSError when fasta35 cannot be found and Seq.ExecutionError when
    the command did not produce the output file.
    """

    # see http://helix.nih.gov/apps/bioinfo/fasta3x.txt
    # format = 10 for machine-readable alignments, 0 for traditional aligns

    if outfile:
        outdir = os.path.abspath(os.path.split(outfile)[0])
    else:
        outdir = TEMPDIR
        outfile = os.path.join(outdir, randomname(12) + ALIGN_SUFFIX)

    query_file, query_is_file = get_path_or_write_file(query, outdir)
    target_file, target_is_file = get_path_or_write_file(target, outdir)

    fasta_prog = Seq.find_exec('fasta35', fastapath)
    if fasta_prog is None:
        raise OSError('fasta35 could not be found')

    # -A Force Smith-Waterman alignment
    # -H Omit Histogram
    # -q Quiet - does not prompt for any input.
    # -m format
    # -z 0 estimates the significance of the match from the mean and standard deviation of the library scores, without correcting for library sequence length.
    # -d number of sequences to display
    # -E maximum expect value to display
    # NOTE(review): the command is run through a shell; paths containing
    # spaces or shell metacharacters will break it.
    cmd = '%s -A -H -q -z 0 -m %s -O %s %s %s' % (
        fasta_prog, format, outfile, query_file, target_file)

    log.info(cmd)
    cmd_output = commands.getoutput(cmd)
    log.debug(cmd_output)

    # check for successful execution
    if not os.access(outfile, os.F_OK):
        log.critical('The following command failed:')
        log.critical(cmd)
        log.critical('...with output:')
        log.critical(cmd_output)
        raise Seq.ExecutionError(cmd_output)

    # parse the data; the handle is closed before the file may be removed
    with open(outfile) as result:
        data = parseFasta(result.read())

    if cleanup:
        # only remove input files that this call wrote itself
        if not query_is_file:
            os.remove(query_file)
        if not target_is_file:
            os.remove(target_file)
        os.remove(outfile)
        query_file = target_file = outfile = None

    for entry in data.values():
        entry['file_q'] = query_file
        entry['file_t'] = target_file
        entry['file_out'] = outfile

    return data
def posterior(seq, emission_mat, transition_mat, k_counter, seeds, rec_num, counter):
    """
    picks the highest-scoring state for every inner base of seq and records it
    :param seq: sequence; the first and last positions are not classified
    :param emission_mat: passed through to forward() and backward()
    :param transition_mat: passed through to forward() and backward()
    :param k_counter: num of states
    :param seeds: the first three entries give the lengths of motifs 0, 1, 2
    :param rec_num: passed to the Seq constructor
    :param counter: passed to print_statistics
    :return: None; results go through the Seq object's print_statistics()
             and save_to_file()
    """
    n_positions = len(seq)
    fwd = forward(seq, emission_mat, transition_mat, k_counter)
    bwd = backward(seq, emission_mat, transition_mat, k_counter)
    posterior_table = fwd + bwd

    seq_obj = Seq(n_positions - 2, rec_num)

    # Consecutive state ranges (motif index, first state, last state) for
    # the three motifs, starting at FIRST_MOTIF_STATE.
    motif_ranges = []
    range_start = FIRST_MOTIF_STATE
    for motif_idx in range(3):
        range_end = range_start + len(seeds[motif_idx]) - 1
        motif_ranges.append((motif_idx, range_start, range_end))
        range_start = range_end + 1

    # decide states
    for pos in range(1, n_positions - 1):
        state = int(np.argmax(posterior_table[:, pos]))
        base = seq[pos]

        # Motif ranges are tested first, in motif order; the for/else falls
        # through to the single-state cases only when no range matched.
        for motif_idx, lo, hi in motif_ranges:
            if lo <= state <= hi:
                seq_obj.add_motif_base(motif_idx, (base, state - lo), pos - 1)
                break
        else:
            if state == 2:
                seq_obj.add_telo_background(base, pos - 1)
            elif state == 1:
                seq_obj.add_pre_telo((base, state))
            else:
                seq_obj.add_normal_dna_base((base, state))

    seq_obj.print_statistics(doc=None, counter=counter)
    seq_obj.save_to_file()
    return
import os
import sys
import unittest
import logging
import pprint

import config
import Seq

log = logging

# Script file name without its '.py' suffix. str.rstrip('.py') strips any
# trailing '.', 'p' or 'y' characters (e.g. "test_copy.py" -> "test_co"),
# so only the literal suffix is removed here. ``sys`` was previously used
# without being imported.
module_name = os.path.split(sys.argv[0])[1]
if module_name.endswith('.py'):
    module_name = module_name[:-len('.py')]

outputdir = config.outputdir
datadir = config.datadir

# Falsy when the cmbuild executable cannot be located.
cmbuild_path = Seq.find_exec('cmbuild')


class TestInfernalInstalled(unittest.TestCase):
    """Report (via the log) when the cmbuild executable is unavailable."""

    def test1(self):
        # Only logs; this test never fails.
        if not cmbuild_path:
            log.error('Infernal software could not be found - skipping tests in this module')


class Test_Run(unittest.TestCase):
    """Fixture with one output path that contains spaces and one that does not."""

    def setUp(self):
        # "<ClassName>_<methodName>", taken from the unittest test id
        self.funcname = '_'.join(self.id().split('.')[-2:])
        self.outfile = os.path.join(outputdir, self.funcname)

        self.has_space = os.path.join(outputdir, 'name with spaces')
        self.no_space = os.path.join(outputdir, 'nameWithoutSpaces')

        # Create the file whose name contains spaces, through the shell.
        os.system('echo `date` has space > "%s"' % self.has_space)