Example #1
import shutil
import sys
from os import path

import fasta  # project-local fasta helper module

def write_fasta_file(cs_file, target, args, configs):

	from cs import TalosCSFile
	tab=TalosCSFile()
	tab.read_file( cs_file )
	sequence=tab.sequence
	fasta_file=target+'.fasta'

	if args.fasta:
		print 'read fasta sequence from %s'%args.fasta
		target_fasta=fasta.read_fasta(args.fasta)
		print target_fasta
		if not fasta.compare_fasta( target_fasta, sequence, strict_length=True ):
			sys.exit("\n".join([
						'fasta-sequence does not match sequence in chemical shift file!!! ',
						'FASTA: '+target_fasta,
						'CS:    '+sequence]))

		sequence=fasta.fill_gaps(target_fasta,sequence)

	if path.exists( fasta_file ):
		target_fasta=fasta.read_fasta(fasta_file)
		if not '-' in sequence and target_fasta!=sequence:
			print "inconsistent fasta sequence: between chemical shifts %(cs_file)s and fasta file %(fasta_file)s"%locals()
			print "will overwrite fasta file, and create backup of original fasta file .bak"
			shutil.copy(fasta_file,fasta_file+".bak")
			fasta.write_fasta(fasta_file, sequence, configs.target_name )

	if '-' in sequence:
		exit('require full sequence information, provide fasta file with -fasta')

	fasta.write_fasta(fasta_file, sequence, configs.target_name )

	return fasta_file
Example #2
 def testGcContentPercentageFromFile(self):
     with open('data/gcContent.fasta') as fp:
         gcContents = {name: gcContentPercentage(seq) 
                            for name, seq in read_fasta(fp)}
         maxName, maxGc = max(gcContents.iteritems(), key=operator.itemgetter(1))
         self.assertEqual('Rosalind_6020', maxName)
         self.assertAlmostEqual(51.881994, maxGc, places=5)
Example #3
def main(m5, fa, method, trim_suffix=False):
  read_names = None
  if method in ["sprai"]:
    if fa is None:
      raise Exception("A matching uncorrected fasta file is required to convert sprai indexed names back to their original")
    import fasta
    _, read_names = fasta.read_fasta(fa)

  for line in open(m5):
    fields = line.strip().split()
    n = fields[0]

    if '/' in n and trim_suffix:
      n = n[:n.rindex('/')]

    if method is not None:
      if method in ["ectools"]:
        n = n[:n.rindex("_corrected")]
      elif method == "sprai":
        n = read_names[int(n[:n.index('/')]) - 1] # !! reads are 1-indexed as of sprai v0.9.9.23
      elif method in ["nanocorr"]:
        n = n[:n.rindex("_consensus")]
      elif method in ["lorma"]:
        if '_' in n:
          n = n[:n.rindex("_")] # lorma adds a "_<index>" to each read, which increments from 1 for each subread it was split into
    fields[0] = n
    print ' '.join(fields)
Example #4
 def testGcContentPercentageFromFile(self):
     with open('data/gcContent.fasta') as fp:
         gcContents = {
             name: gcContentPercentage(seq)
             for name, seq in read_fasta(fp)
         }
         maxName, maxGc = max(gcContents.iteritems(),
                              key=operator.itemgetter(1))
         self.assertEqual('Rosalind_6020', maxName)
         self.assertAlmostEqual(51.881994, maxGc, places=5)
Example #5
	def setUpClass(cls):
	#setup code that really should only run once
		print 'initialize ScoreFunctionTestCase...'
		cls.sequence=fasta.read_fasta( data_path+'gmr137.fasta' )
		cls.molecule=AtomTree.from_sequence( cls.sequence )
		cls.peak_collection=PeakCollection()
		peaks=['aroC.peaks','n.peaks']
		for p in peaks:
			file=data_path+'assigned/'+p
			name=path.splitext(path.basename(file))[0]
			cls.peak_collection.add_experiment( PeakList.read_from_stream( name, open(file,'r'), False ) )
		cls.assignments=AssignmentCollection.from_hard_assignments( cls.peak_collection, cls.molecule )
		cls.some_to_remove=list([ x for i,x in enumerate( cls.assignments ) if i<100])
Example #6
def GenerateLastNts(fasta_file, length=150):
    output_file = fasta_file
    output_file += ".-%d" % length
    fa = fasta.read_fasta(fasta_file)

    lastXnt_dic = {}

    fout = open(output_file, "w")
    for id in fa:
        lastXnt = fa[id][-length:].upper()
        print >> fout, ">%s\n%s" % (id, lastXnt)  #, lastXnt in lastXnt_dic
        lastXnt_dic[lastXnt] = id
    fout.close()

    return output_file
Example #7
def RNAMapping(fastq_file, fasta_file, output_file):
    '''
        Input format: fastq.gz, fa file
        Output format: Blast matrix format 
    
        python library_checker.py RNA_mapping ../2017-10-19_MiSeq/S1_W_R1.fastq.gz ../data/KRP_set.fa > ./output/2017-10-19_MiSeq/Blastn/S1_W_R1.exact 
        python library_checker.py RNA_mapping ../2017-10-19_MiSeq/S2_WH_R1.fastq.gz ../data/KRP_set.fa > ./output/2017-10-19_MiSeq/Blastn/S2_WH_R1.exact 
    '''
    #===============================
    # Read Fasta file (RNA) and make motifs for exact search
    #===============================
    # GCAGGCATGCAAGCTGCC
    #ggcagcttgcatgcctg
    #gctagaactagtggatccc
    # GCAGGCATGCAAGCTGCC TCCTCGTTCATGGGGAATAATTGCAATCCCCGATCCCCAT GGGATCCACTAGTTCTAGCCGG "
    prefix = "GCAGGCATGCAAGCTGCCCGGG"
    motifs = {}
    fa = fasta.read_fasta(fasta_file)  # RNA fasta file
    for id in fa:
        seq = fa[id].upper()
        rc_seq = ReverseComplement(seq)
        motif = prefix + rc_seq[:50]  # use only the first 50 nt in all cases (this can be changed)
        motifs[motif] = id

    #===============================
    # Read Fastq file (RNA, Read1) and Searching id with exact matching the pattern (motif)
    #===============================
    fo = open(output_file, "w")

    cnt_dic = {}
    read1 = ReadSequenceFile(fastq_file)
    for line in read1.stdout.xreadlines():
        id = line.split()[0]
        seq = read1.stdout.next()[:-1]
        read1.stdout.next()
        read1.stdout.next()
        for motif in motifs:
            if seq[:len(motif)] == motif:
                print >> fo, "%s\t%s\t100.00\t%d\t0\t0\t1\t50\t50\t1\t1e-10\t100.0" % (
                    id, motifs[motif], len(motif))
                # @M03766:67:000000000-BGPDM:1:1101:14936:1730	1NYB_A:B	100.00	24	0	0	5	28	24	1	9e-10	48.1
                # @M03766:67:000000000-BGPDM:1:1101:16561:1732	2PJP_A:B	100.00	23	0	0	5	27	23	1	3e-09	46.1
                cnt_dic[id] = cnt_dic.get(id, 0) + 1
                break
    fo.close()
    read1.stdout.close()
    read1.kill()
Example #8
						res_to_add.append(noesy.Resonance(atom=Atom(name,res.resid()),freq=res.freq(),error=res.error()))
	for res in res_to_add:
		#print res.atom().resid()
		resonances.add_resonance(res)

def initial_assign(peak,molecule,fm,known_dist):
	for match in random_items( peak.matches( molecule, frequency_matcher=fm, distance_matcher=known_dist ), 1 ):
		if match:
			return match
	return None

#ref_resonances = noesy.ResonanceList.read_from_stream( open(args.ref_prot,'r') )
resonances = noesy.ResonanceList.read_from_stream( open(args.prot,'r') )
unpack_unmethyl_atom_pool(resonances)
#resonances = filter_resonances( ref_resonances )
sequence=fasta.read_fasta(args.fasta)
resonances.set_sequence(sequence)
peaks = PeakCollection.from_peak_files( args.peaks, ignore=True )
molecule=AtomTree.from_sequence( resonances.sequence() )
state=AssignmentCollection( peaks, molecule )
scorefxn=ScoreFunction(bmrb=1,consistency=1,symmetry=1)

import random
from assignment import ConstantFreqMatcher
peak_order = [ peak for peak in peaks ]
#random.shuffle( peak_order )
fm=ConstantFreqMatcher( resonances )
known_dist=ScoreDistanceMatcher( ConformationDistanceScore(), abs(math.log(0.3)), 0 )
#known_dist.max_sequence_separation=9
count=0
for peak in peak_order:
Example #9
def main(unc, cor, fa, method, verbose=False):

    if verbose:
        print "Reading pacbio fasta"
    pacbio_reads, names = fasta.read_fasta(fa)

    cor_aligned = 0
    unc_aligned = 0

    tp = 0
    fp = 0
    fn = 0
    #ne = 0
    tn = 0
    '''
  From ec_toolkit compute-stats.py:

  errorStats['TP'] += len(errPreCorrect.difference(errPostCorrect))
  errorStats['FP'] += len(errPostCorrect.difference(errPreCorrect))
  errorStats['FN'] += len(errPreCorrect.intersection(errPostCorrect))
  errorStats['NE'] += getNumWrongBase(errPreCorrect,errPostCorrect)

  # apparently, NE is the number of bases changed, but still incorrect
  '''
    '''
  From Error Correction Toolkit paper:

  We use the following measures for each program:
  number of erroneous bases identified and
  successfully corrected (true positives, TP), correct
  bases wrongly identified as errors and changed
  (false positives, FP), and erroneous bases that were
  either uncorrected or falsely corrected (false negatives,
  FN). We report sensitivity and specificity for
  each program. Then, we combine these into the gain
  metric [21], defined by gain = (TP - FP) /
  (TP + FN), which is the percentage of errors
  removed from the data set by the error-correction
  program. A negative gain value indicates that more
  errors have been introduced due to false corrections,
  which is not captured by measures such as sensitivity
  and specificity.
  '''
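    # hypothetical worked example of the gain metric:
    #   TP=90, FP=10, FN=20  ->  gain = (90 - 10) / (90 + 20) = 0.727,
    #   i.e. roughly 73% of the original errors were removed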

    cor_iter = aln_formats.iter_m5(cor, 0, 0, 1000000000, by_query=True)
    uncor_iter = aln_formats.iter_m5(unc, 0, 0, 1000000000, by_query=True)

    cor_query = None
    uncor_query = None

    correct_uncorrected = 0
    incorrect_uncorrected = 0
    correct_corrected = 0
    incorrect_corrected = 0

    # ------ !! keep track of loci on the target sequence since coordinates on the query sequence will change dramatically ------
    # ------ !! although if sequences align to different places, everything will go to crap ------

    while True:
        if cor_query is not None and cor_query == uncor_query and cor_best_aln.target.name == uncor_best_aln.target.name:

            if verbose:
                print
                print "{} aligned for both uncorrected and corrected".format(
                    cor_query)
                print "{} errors in uncorrected read".format(
                    len(incorrect_loci_in_uncorrected))
                print "{} errors in corrected read".format(
                    len(incorrect_loci_in_corrected))

            read_tp = len(prev_incorrect & now_correct)
            read_fp = len(prev_correct & now_incorrect)
            read_fn = len(prev_incorrect & now_incorrect)
            read_tn = len(prev_correct & now_correct)

            # not obviously trivial to compute this using set operations
            #read_ne = 0

            tp += read_tp
            fp += read_fp
            fn += read_fn
            tn += read_tn

        if (uncor_query is None or cor_iter is None
                or uncor_query <= cor_query) and uncor_iter is not None:
            try:
                uncor_query, uncor_aln = uncor_iter.next()
            except:
                uncor_iter = None
                if cor_iter is None:
                    break
            unc_aligned += 1
            uncor_best_aln = sorted(
                uncor_aln,
                key=lambda al:
                (al.accuracy() * abs(al.query.end - al.query.start)))[
                    -1]  # keep only "best" alignment, by total correct bp
            incorrect_loci_in_uncorrected = []
            correct_loci_in_uncorrected = []

            # get all incorrect loci in uncorrected alignment
            tpos = uncor_best_aln.target.start
            for i in xrange(len(uncor_best_aln.alignment)):
                if uncor_best_aln.alignment[i] == '|':
                    correct_loci_in_uncorrected.append(tpos)
                else:
                    if uncor_best_aln.target.alignment[i] == '-':
                        incorrect_loci_in_uncorrected.append(
                            "{}i{}".format(
                                uncor_best_aln.query.alignment[i],
                                tpos))  # <nucleotide> inserted before tpos
                    else:
                        incorrect_loci_in_uncorrected.append(
                            "x{}".format(tpos)
                        )  # <nucleotide> mismatch or deleted at tpos
                if uncor_best_aln.target.alignment[i] != '-':
                    tpos += 1
            incorrect_loci_in_uncorrected.extend(
                range(uncor_best_aln.target.start - uncor_best_aln.query.start,
                      uncor_best_aln.target.start) +
                range(
                    uncor_best_aln.target.end, uncor_best_aln.target.end +
                    uncor_best_aln.query.length - uncor_best_aln.query.end)
            )  # finagle an estimate of the target regions that are supposed to be covered by the read

            prev_incorrect = set(incorrect_loci_in_uncorrected)
            prev_correct = set(correct_loci_in_uncorrected)
            incorrect_uncorrected += len(prev_incorrect)
            correct_uncorrected += len(prev_correct)

        else:
            try:
                cor_query, cor_aln = cor_iter.next()
            except:
                cor_iter = None
                if uncor_iter is None:
                    break
            cor_aligned += 1
            cor_best_aln = sorted(
                cor_aln,
                key=lambda al:
                (al.accuracy() * abs(al.query.end - al.query.start)))[
                    -1]  # keep only "best" alignment, by total correct bp
            incorrect_loci_in_corrected = []
            correct_loci_in_corrected = []

            # get all incorrect loci in corrected alignment
            tpos = cor_best_aln.target.start
            for i in xrange(len(cor_best_aln.alignment)):
                if cor_best_aln.alignment[i] == '|':
                    correct_loci_in_corrected.append(tpos)
                else:
                    if cor_best_aln.target.alignment[i] == '-':
                        incorrect_loci_in_corrected.append(
                            "{}i{}".format(
                                cor_best_aln.query.alignment[i],
                                tpos))  # <nucleotide> inserted before tpos
                    else:
                        incorrect_loci_in_corrected.append("{}".format(
                            tpos))  # <nucleotide> mismatch or deleted at tpos
                if cor_best_aln.target.alignment[i] != '-':
                    tpos += 1
            incorrect_loci_in_corrected.extend(
                range(cor_best_aln.target.start - cor_best_aln.query.start,
                      cor_best_aln.target.start) + range(
                          cor_best_aln.target.end, cor_best_aln.target.end +
                          cor_best_aln.query.length - cor_best_aln.query.end)
            )  # finagle an estimate of the target regions that are supposed to be covered by the read

            now_incorrect = set(incorrect_loci_in_corrected)
            now_correct = set(
                correct_loci_in_corrected
            )  # these should already be unique, but we need them to be sets to do set operations
            incorrect_corrected += len(now_incorrect)
            correct_corrected += len(now_correct)

    if tp + fn == 0:
        raise Exception("No read names matched (uncor: {}, cor: {})".format(
            uncor_query, cor_query))

    gain = float(tp - fp) / (tp + fn)

    print "Sample\tMethod\tUncorrected reads\tCorrected reads\tRead gain/loss\tUncorrected wrong bp\tUncorrected right bp\tCorrected wrong bp\tCorrected right bp\tTP\tFP\tTN\tFN\tSensitivity\tSpecificity\tGain"
    print "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%.4f\t%.4f\t%.4f" % (
        fa, cor, unc_aligned, cor_aligned, (cor_aligned - unc_aligned),
        incorrect_uncorrected, correct_uncorrected, incorrect_corrected,
        correct_corrected, tp, fp, tn, fn, (float(tp) / (tp + fn)),
        (float(tn) / (tn + fp)), gain)
Example #10
            r += 1

    return r

complement = {"A":"T","T":"A","G":"C","C":"G"}

def revcomp(s):
    r = ""
    for i in range(len(s)):
        r = complement[s[i]] + r

    return r

infile = "rosalind_corr.txt"
with open(infile,"r") as f:
    dnas,_ = fasta.read_fasta(f)
    count = {}
    for s in dnas.values():
        if count.get(revcomp(s),0) != 0:
            count[revcomp(s)] += 1
        else:
            count.setdefault(s,0)
            count[s] += 1

adjs = {}
for s in count.keys():
    adjs[s] = set()
    for s2 in count.keys():
        if s == s2:
            continue
Example #11
import fasta

infile = "rosalind_mult.txt"
with open(infile,"r") as f:
    dnas,keys = fasta.read_fasta(f)

ss = []
for k in keys:
    ss.append(dnas[k])

d = {}
p = {}

INF = 10000000

print(ss)

def score(c):
    r = 0
    for i in range(len(c)):
        for j in range(i+1,len(c)):
            if c[i] != c[j]:
                r -= 1

    return r


for i0 in range(-1,len(ss[0])):
    for i1 in range(-1,len(ss[1])):
        for i2 in range(-1,len(ss[2])):
            for i3 in range(-1,len(ss[3])):
Example #12
 def testReadFasta(self):
     with open('data/gcContent.fasta') as fp:
         seqs = list(read_fasta(fp))
         self.assertIsNotNone(seqs)
         self.assertEqual(7, len(seqs))
Example #13
#!/usr/bin/env python

import fasta
import rna_transcription
import protein_translation


def rna_splicing(dnas):
    s = dnas.popitem(False)[1]
    for sub in dnas.itervalues():
        s = s.replace(sub, '')
    return protein_translation.encode_strand(
        rna_transcription.transcribe_rna(s))


if __name__ == '__main__':
    import sys

    f = sys.stdin if len(sys.argv) == 1 else open(sys.argv[1])

    print rna_splicing(fasta.read_fasta(f))
Example #14
def motif_locations(fasta_file):
    data = fasta.read_fasta(fasta_file).popitem()[1]
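    # note: overlapped=True is a feature of the third-party 'regex' module;
    # the stdlib 're' module's finditer() does not accept this keyword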
    return [m.start(0) + 1 for m in n_glycosylation_motif.finditer(data, overlapped=True)]
Example #15
#!/usr/bin/env python

from __future__ import division

from fasta import read_fasta

def gc_percentage(strand):
    return sum(c in ['C', 'G'] for c in strand) / len(strand) * 100 \
        if strand else 0


if __name__ == '__main__':
    import sys

    strands = read_fasta(open(sys.argv[1]))
    gc_content = dict((name, gc_percentage(strands[name])) for name in strands)
    max_strand = max(gc_content, key=gc_content.get)
    print max_strand
    print "%2.6f%%" % gc_content[max_strand]
Example #16
#!/usr/bin/env python

import sys
import fasta

file = sys.argv[1]
temp = file.split('.')
filename_base = temp[0]
tag = temp[1]

sequences = fasta.read_fasta(open(file, 'r').readlines())

count = 1
for i in sequences:
    f = filename_base + '_' + str(count) + '.' + tag
    output = open(f, 'w')
    output.write(i.name + '\n')
    output.write(i.sequence)
    count += 1
Example #17
File: sam2tef.py  Project: wenmm/lrc_eval
def main(sam, tef, rename, fa):

  read_names = None
  if rename in ["sprai"]:
    if fa is None:
      raise Exception("A matching uncorrected fasta file is required to convert sprai indexed names back to their original")
    import fasta
    _, read_names = fasta.read_fasta(fa)

  fout = open(tef, 'w')
  l = 0

  for query_name, alignments in aln_formats.iter_maf(sam, 0, 0, 1000000000, by_query=True):
    al = sorted(alignments, key = lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[-1] # keep only "best" alignment, by total correct bp
    #fout.write('\n' + str(al))

    '''
TEF (format)

readid n-errors [pos tb wb ind]+

In the above format, the fields are described as below :

Fields    Description
readid    ID of the read corrected
n-errors  Integer. Number of errors corrected in the read.
pos       Position for fix (0 < = pos < length of the read)
tb        true value of the base at pos.
wb        wrong value of the base at pos.
          wb should be current base at read
          tb,wb is one of {0,1,2,3,4,5}
          0 = 'A', 1 = 'C', 2 = 'G', 3 = 'T', 5 = '-'
ind       indicates the type of error. one of {0,1,2}
          0 substitution (bad char in the read at pos) or
          1 deletion (missing char in the read after pos) or
          2 insertion (extra char in the read at pos)
    '''
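    # hypothetical example TEF line with two corrected errors:
    #   read_1 2 100 0 1 0 250 3 5 1
    # i.e. at pos 100 an 'A' (tb=0) was read as 'C' (wb=1), a substitution (ind=0),
    # and after pos 250 a 'T' (tb=3) was missing from the read (wb='-'=5, ind=1)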

    n_errs = 0
    err_strings = []
    q = al.query.start - 1
    t = al.target.start - 1
    # whole bunch of ambiguity codes will all map to N (they are present in the a_thaliana reference...)
    almap = {'A':0, 'C':1, 'G':2, 'T':3, 'N':4, '-':5, 'R':4, 'Y':4, 'S':4, 'W':4, 'K':4, 'M':4, 'B':4, 'V':4, 'D':4, 'H':4}
    for i in xrange(len(al.query.alignment)):
      qlocus = al.query.alignment[i]
      tlocus = al.target.alignment[i]
      if qlocus == tlocus:
        continue

      n_errs += 1
      if tlocus == '-':
        ind = 2
        q += 1
      elif qlocus == '-':
        ind = 1
        t += 1
      else:
        ind = 0
        q += 1
        t += 1
      err_strings.append("%i %i %i %i" % (q, almap[tlocus.upper()], almap[qlocus.upper()], ind))

    if rename is not None:
      if rename in ["ectools"]:
        query_name = query_name[:query_name.rindex("_corrected")]
      elif rename == "sprai":
        query_name = read_names[int(query_name[:query_name.index('/')])]
      elif rename in ["nanocorr"]:
        query_name = query_name[:query_name.rindex("_consensus")]

    tef_line = "%s %i %s" % (query_name, n_errs, ' '.join(err_strings))
    fout.write(('\n' if l > 0 else '') + tef_line)
    l += 1

  fout.close()
Example #18
#!/usr/bin/env python
'''
simple script to translate DNA into proteins
'''

#importing the dnatranslate module
import dnatranslate
import sys
import fasta

#opening and reading the file in one take
dna = fasta.read_fasta(open(sys.argv[1], 'r').readlines())

#iterate over the sequences and translate them
for item in dna:
    protein = dnatranslate.translate_dna(item.sequence)
    print item.name
    print protein
Example #19
# from fasta import read_fasta

import fasta
import codon



data = fasta.read_fasta('DNA.fasta')
nama    = data[0][0]
sekuens = data[0][1]



# transcription
mRNA = sekuens.replace('T','U')
print mRNA



# find the position of the start codon
start = mRNA.find('AUG')
print "Start codon is at position %d" %(start)


# split the mRNA into three-base codons, starting from the start codon
prot = ""
while start + 3 <= len(mRNA):
        kodon = mRNA[start:start+3]
        print kodon
        start += 3  # advance to the next codon

Example #20
 def testReadFasta(self):
     with open('data/gcContent.fasta') as fp:
         seqs = list(read_fasta(fp))
         self.assertIsNotNone(seqs)
         self.assertEqual(7, len(seqs))
Example #21
import fasta

infile = "rosalind_pmch.txt"
#infile = "rosalind_cat.txt"

with open(infile,"r") as f:
    rnas,key = fasta.read_fasta(f)
    rna = rnas[key[0]]

n = len(rna)
nA = len(filter(lambda x:x == "A",rna))
nG = len(filter(lambda x:x == "G",rna))

def fac(n):
    if n == 0:
        return 1
    return n*fac(n-1)

print(nA,nG)
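# each A must pair with a U and each G with a C, so the number of perfect
# matchings is nA! * nG! (this assumes #A == #U and #G == #C in the RNA)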
print(fac(nA)*fac(nG))
Example #22
#!/usr/bin/env python2.7
##-*- mode:python;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t;python-indent:2 -*-'
from sys import argv
import fasta
from PDB.Polypeptide import one_to_three
assert( len(argv)>3)
rdc_file = argv[1]
fasta_file = argv[2]
orientation = argv[3]

rdc_line=open(rdc_file,'r').readlines()
seq=fasta.read_fasta(fasta_file)

if int(orientation)==1:
	error=3.2
elif int(orientation)==2:
	error=4.5

print "#  First atom      Second atom           RDC   Error  Weight Orientation"
for line in rdc_line:
	tags=line.split()
	resid1=int(tags[0])
	atom1=tags[1]
	resid2=int(tags[2])
	atom2=tags[3]
	rdc_value=float(tags[4])
	aa1=one_to_three(seq[resid1-1])
	aa2=one_to_three(seq[resid2-1])
	print '%5d %4s %3s   %5d %4s %3s      %8.3f  %5.3f  1.000 %5d'%(resid1,aa1,atom1,resid2,aa2,atom2,rdc_value,error,int(orientation))
Example #23
#!/usr/bin/python

import fasta
import sys

fastafile = open("test.fas", "r").readlines()
my_sequences = fasta.read_fasta(fastafile)

#test_list = ['ATATAG', 'TATA', 'GGGTGA']

#to make a dictionary of all possible hexamers
all_6mer = {}
base = ['A','C','G','T']
for C1 in base:
	for C2 in base:
		for C3 in base:
			for C4 in base:
				for C5 in base:
					for C6 in base:
						all_6mer[''.join([C1,C2,C3,C4,C5,C6])] = 0
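# the six nested loops above are equivalent to:
#   for kmer in itertools.product(base, repeat=6): all_6mer[''.join(kmer)] = 0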

length = len(all_6mer)

keys = all_6mer.keys()

for i in my_sequences:
	ind_sequence = i.sequence
	print i.name
	for kmer in all_6mer:
		match = {}
		if kmer in ind_sequence:
Example #24
def load_fasta_db_into_proteins(
    proteins, fasta_db, clean_seqid=None, iso_leu_isomerism=False):
  seqids, fastas = fasta.read_fasta(fasta_db)
  load_fastas_into_proteins(proteins, fastas, clean_seqid, iso_leu_isomerism)
Example #25
import fasta  # project-local fasta helper module
from sys import argv

number_C_H={'A':1,
            'R':3,
            'N':1,
            'D':1,
            'C':1,
            'Q':2,
            'E':2,
            'G':0,
            'H':4,
            'I':4,
            'L':4,
            'K':4,
            'M':3,
            'F':6,
            'P':3,
            'S':1,
            'T':2,
            'W':6,
            'Y':5,
            'V':3}

assert( len(argv)>1)
fasta_file = argv[1]
sequence=fasta.read_fasta(fasta_file)
all_num=0
for aa in sequence:
   all_num+=number_C_H[aa]

print all_num
Example #26
def main(maf, tef, rename, fa, untef=None):

    # ------ stats ------
    if untef is not None:
        import fasta
        reads, names = fasta.read_fasta(fa)

        tp = 0
        fp = 0
        fn = 0
        tn = 0

        uncor = {}

        for line in open(untef):
            data = line.strip().split(' ')
            fields = [int(a) for a in data[1:]]
            assert fields[0] == (len(fields) -
                                 1) / 4, "Number of errors does not match list"
            uncor[data[0]] = [
                fields[i:i + 4] for i in xrange(1, len(fields), 4)
            ]

        cor_aligned = 0
    # -------------------

    read_names = None
    if rename in ["sprai"]:
        if fa is None:
            raise Exception(
                "A matching uncorrected fasta file is required to convert sprai indexed names back to their original"
            )
        import fasta
        _, read_names = fasta.read_fasta(fa)

    fout = open(tef, 'w')
    l = 0

    for query_name, alignments in aln_formats.iter_maf(maf,
                                                       0,
                                                       0,
                                                       1000000000,
                                                       by_query=True):
        #al = sorted(alignments, key = lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[-1] # keep only "best" alignment, by total correct bp

        n_errs = 0
        err_strings = []

        # ------ stats ------
        if untef is not None:
            cor_aligned += 1
            cpos = set()
        # -------------------

        for al in alignments:
            #fout.write('\n' + str(al))
            '''
TEF (format)

readid n-errors [pos tb wb ind]+

In the above format, the fields are described as below :

Fields    Description
readid    ID of the read corrected
n-errors  Integer. Number of errors corrected in the read.
pos       Position for fix (0 < = pos < length of the read)
tb        true value of the base at pos.
wb        wrong value of the base at pos.
          wb should be current base at read
          tb,wb is one of {0,1,2,3,4,5}
          0 = 'A', 1 = 'C', 2 = 'G', 3 = 'T', 5 = '-'
ind       indicates the type of error. one of {0,1,2}
          0 substitution (bad char in the read at pos) or
          1 deletion (missing char in the read after pos) or
          2 insertion (extra char in the read at pos)
      '''
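            # hypothetical example TEF line with two corrected errors:
            #   read_1 2 100 0 1 0 250 3 5 1
            # (substitution A->C at pos 100; deletion of a T after pos 250)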

            q = al.query.start - 1
            t = al.target.start - 1
            # whole bunch of ambiguity codes will all map to N (they are present in the a_thaliana reference...)
            almap = {
                'A': 0,
                'C': 1,
                'G': 2,
                'T': 3,
                'N': 4,
                '-': 5,
                'R': 4,
                'Y': 4,
                'S': 4,
                'W': 4,
                'K': 4,
                'M': 4,
                'B': 4,
                'V': 4,
                'D': 4,
                'H': 4
            }
            for i in xrange(len(al.query.alignment)):
                qlocus = al.query.alignment[i]
                tlocus = al.target.alignment[i]
                if qlocus == tlocus:
                    continue

                n_errs += 1
                if tlocus == '-':
                    ind = 2
                    q += 1
                elif qlocus == '-':
                    ind = 1
                    t += 1
                else:
                    ind = 0
                    q += 1
                    t += 1
                err_strings.append(
                    "%i %i %i %i" %
                    (q, almap[tlocus.upper()], almap[qlocus.upper()], ind))

                # ------ stats ------
                if untef is not None:
                    cpos.add(q)
                # -------------------

        if rename is not None:
            if rename in ["ectools"]:
                query_name = query_name[:query_name.rindex("_corrected")]
            elif rename == "sprai":
                query_name = read_names[int(
                    query_name[:query_name.index('/')])]
            elif rename in ["nanocorr"]:
                query_name = query_name[:query_name.rindex("_consensus")]

        tef_line = "%s %i %s" % (query_name, n_errs, ' '.join(err_strings))
        fout.write(('\n' if l > 0 else '') + tef_line)
        l += 1

        # ------ stats ------
        if untef is not None:
            if uncor.has_key(query_name):
                un = uncor[query_name]

                upos = set([u[0] for u in un])

                read_tp = len(upos - cpos)
                read_fp = len(cpos - upos)
                read_fn = len(cpos & upos)
                read_tn = len(reads[query_name]) - read_tp - read_fp - read_fn

                tp += read_tp
                fp += read_fp
                fn += read_fn
                tn += read_tn
        # -------------------

    fout.close()

    # ------ stats ------
    if untef is not None:
        print tp, fp, tn, fn
        gain = float(tp - fp) / (tp + fn)

        print "Sample\tMethod\tUncorrected reads\tCorrected reads\tRead gain/loss\tTP\tFP\tTN\tFN\tGain\tSensitivity\tSpecificity"
        print "%s\t%s\tN/A\t%i\tN/A\t%i\t%i\t%i\t%i\t%.4f\t%.4f\t%.4f" % (
            fa, tef, cor_aligned, tp, fp, tn, fn, (float(tp) / (tp + fn)),
            (float(tn) / (tn + fp)), gain)
Example #27
	def __call__(self, infile, outfile, fasta_sequence=None ):
			target_fasta=0

			from cs import ProtCSFile
			tab=ProtCSFile()
			tab.read_stream( infile )
			sequence=tab.sequence

			if not sequence and fasta_sequence:
				sequence=fasta_sequence
				tab.set_sequence(sequence)
			if not sequence and self._args.fasta:
				sequence=fasta.read_fasta(self._args.fasta)
				tab.set_sequence(sequence)

			#combine atoms into QX if possible/necessary
			res_in=noesy.ResonanceList.read_from_prot( tab )
			self.clean_up_names( res_in )

			res_out=noesy.ResonanceList()
			res_out.set_sequence( sequence )
			cyana_ss_constraints = []
			for resid,resonances in res_in.iter_residues():
		#		print 'reso: ',"\n".join(["%s"%r for r in resonances])
				aa=res_in.sequence()[resid-1]

				#copy heavy atoms
				if self._args.v >= 2: print 'residue %d %s round 1...'%(resid,aa)
				pools = self.get_pools( resonances, aa )
				combined_pools, cya_ss = self.combine_pools( pools, aa )
				new_resonances = self.generate_combined_resonances( pools, combined_pools )

				if self._args.v >=2: print 'residue %d %s round 2...'%(resid,aa)
				pools = self.get_pools( new_resonances, aa )
				combined_pools, cya_ss = self.combine_pools( pools, aa )
				new_resonances = self.generate_combined_resonances( pools, combined_pools )
				cyana_ss_constraints += cya_ss

				for r in new_resonances:
					res_out.add_resonance( r )


			prot_data=res_out.generate_dict()
			floats=[]
			ambiguity=[]
			for r in res_out.itervalues():
				ambiguity.append( r.ambiguity )
				try:
					floats.append( r.float_partners_str() )
				except AttributeError as exc:
		#			print exc
					floats.append( None )
			if self._args.stereo:
				prot_data['STEREO']=floats
			if self._args.ambiguity:
				prot_data['AMBIGUITY']=ambiguity

		#	print floats
			nih_table = cs.NIH_table().from_dict( prot_data )
#			print nih_table.vars
#			print nih_table.table
#			print 'convert to ProtCS-File'
			prot_file = cs.ProtCSFile().from_table( nih_table )
			prot_file.write( outfile, header=self._args.header )

			if self._args.cyana_ssa:
				fd = open( self._args.cyana_ssa, 'w')
				for line in cyana_ss_constraints:
					fd.write('%s\n'%line)
Example #28
def cluster(read_fa,
            ref_fa,
            aln_file,
            out_prefix,
            verbosity=0,
            st=None,
            en=None):
    # load read and reference sequences
    reads, read_names = fasta.read_fasta(read_fa, split_at_space=True)
    ref, ref_names = fasta.read_fasta(ref_fa)
    ref_name = ref_names[0]
    ref_seq = ref[
        ref_name]  # we assume the ref has only one sequence, or the first is the primary
    ref_name = ref_name.split()[0]

    if st is None:
        st = 0
    if en is None:
        en = len(ref_seq) - 1
    if verbosity > 0:
        print("Assessing {}-{} of {} bp in {}".format(st, en, len(ref_seq),
                                                      ref_names[0]))

    # load or build distance matrix
    dist_matrix_file = "{}.pairwise_distance.npy".format(out_prefix)
    read_name_file = "{}.aligned_reads.txt".format(out_prefix)
    feature_file = "{}.features.npy".format(out_prefix)
    features = None
    try:
        print("Trying to load features and distance matrix...")
        dist = np.load(dist_matrix_file).astype('i4')
        np.save(dist_matrix_file, dist)
        aligned_read_names = open(read_name_file).read().strip().split('\n')
        features = np.load(feature_file)
    except Exception as e:
        print("Missing.")
        print("Computing features...")
        features, aligned_read_names = compute_features(aln_file,
                                                        read_names,
                                                        ref_seq,
                                                        ref_name,
                                                        feature_file,
                                                        binary=True,
                                                        st=st,
                                                        en=en)
        open(read_name_file, 'w').write('\n'.join(aligned_read_names) + '\n')
        print("Computing distances...")
        dist = distance(features)
        np.save(dist_matrix_file, dist)

    # only distances between aligned reads (filter out any rows/cols with any -2)
    aln_indices = [d for d in range(dist.shape[0]) if -2 not in dist[d, :]]
    aln_dist = dist[aln_indices, :][:, aln_indices]
    if aln_dist.shape[0] == 0:
        print("No reads aligned to {} ({} bp) from {} - {}".format(
            ref_name, len(ref_seq), st, en))
        return

    print("Plotting distance distribution...")
    plot_distance_distr(aln_dist, out_prefix)

    compressed_dist_matrix = spadist.squareform(aln_dist)

    print("Agglomerative clustering (linkage)...")
    # does pretty simple agglomerative hierarchical clustering (think neighbor-joining)
    linkage = sch.linkage(
        compressed_dist_matrix, method="ward",
        metric="euclidean")  # same as ward(compressed_dist_matrix)
    np.save("{}.linkage.npy".format(out_prefix), linkage)

    # convert hierarchical clustering (from linkage) to flat clustering:
    n_clusters, cutoff = get_cutoff(linkage,
                                    aln_dist,
                                    out_prefix,
                                    threshold=1000)

    print("Cutoff: {}".format(cutoff))
    cluster_indices = sch.fcluster(
        linkage, cutoff - 1,
        'distance')  # this is the default behavior of dendrogram
    #print(list(cluster_indices))
    ai = np.array(aln_indices)
    np.save("{}.aligned_indices.npy".format(out_prefix), ai)
    np.save("{}.cluster_indices.npy".format(out_prefix), cluster_indices)

    print("Drawing heatmap...")
    draw_heatmap(aln_dist, linkage, out_prefix, cutoff)

    n_indices = len(set(cluster_indices))
    print("{} clusters (indices) found".format(n_indices))

    # ------ PCoA and plot colored by cluster_indices ------
    d = DistanceMatrix(aln_dist)
    pcoa_result = pcoa(d)
    if verbosity > 1:
        print("Proportion explained:", pcoa_result.proportion_explained)
        print("Eigenvalues:", pcoa_result.eigvals)
        print("Samples:", pcoa_result.samples)
        print("Features:", pcoa_result.features)

    x = pcoa_result.samples["PC1"]
    for pc in [2, 3, 4]:
        y = pcoa_result.samples["PC{}".format(pc)]
        plt.clf()
        f, ax = plt.subplots(figsize=(8, 8))
        sn.despine(f)
        sn.scatterplot(x,
                       y,
                       hue=cluster_indices,
                       palette=sn.color_palette("husl", n_indices))
        lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.xlabel("PC1: {:.2f}%".format(
            pcoa_result.proportion_explained["PC1"] * 100))
        plt.ylabel("PC{}: {:.2f}%".format(
            pc, pcoa_result.proportion_explained["PC{}".format(pc)] * 100))
        plt.savefig("{}.pcoa_1_{}.png".format(out_prefix, pc),
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

    # ------ Generate cluster consensus seqs ------
    generate_cluster_seqs(ref_seq, ref_name, aligned_read_names, aln_file,
                          cluster_indices, out_prefix, st, en, verbosity)
Example #29
from collections import OrderedDict
from itertools import chain

from fasta import read_fasta

def directed_edges(node, nodes):
    edges = []
    name, strand = node
    for n, s in nodes:
        if strand[-3:] == s[:3]:
            edges.append(n)
    return edges

def all_adjacencies(nodes):
    adjacencies = OrderedDict()
    for name, strand in nodes.iteritems():
        adjacencies[name] = directed_edges((name, strand),
                                           ((n, s) for n, s in nodes.iteritems()
                                            if n != name))
    return adjacencies


if __name__ == '__main__':
    import sys
    
    nodes = read_fasta(open(sys.argv[1]))
    adjacencies = all_adjacencies(nodes)
    for tail in (name for name in adjacencies if adjacencies[name]):
        print '\n'.join("%s %s" % (tail, head) for head in adjacencies[tail])
Example #30
#input:
lines = open( args.infile,'r').readlines();

#output:
verbose=1
if args.outfile=="stdout":
	outfile=sys.stdout
	verbose=0
else:
	outfile=open(args.outfile,'w');
	library.hello( __file__ )

fasta=None
if args.fasta:
    fasta=fasta_lib.read_fasta(args.fasta)

sequence="";
end=args.end
if end<=0:
	end=1000000
start=args.start
format="%5d %5s %5s   %5d %5s %5s  %8.3f     %s\n"
try:
	if fasta:
		upl_fasta=fasta_lib.upl2fasta( lines )
		offset=fasta_lib.find_fasta_offset( upl_fasta, fasta, verbose )
		start=offset+1
		end=start+len(fasta)-1

	for line in lines:
Example #31
def BLASTN_RNA(fasta1, fasta2, exact_file1, blastn_file2, output_file):
    '''
        parse blastn output and make ppi
    '''

    ppi_cnt_dic = {}
    read1_dic = {}
    read2_dic = {}
    '''
    ['M03766:33:000000000-AT3T3:1:1101:21081:6509', '113', 'NMI', '880', '36', '54S41M', 'IGF2', '452', '0', 'ATTTTGATCATATGACTGCTCTGTTTCATTTTTTTCAATAAACCCTTTACAATTAAGTGTTCTCTAGGTCAACCTCACATAGCATACTTTGAAGA', 'HHFFHHHHFDHHHGHHHHHHHHEHHHHHGGHFHGBHGHHGHHEG4GHHHHHHHHHHHHHFFFG3GEBGBFHHHHGHHHHHGHHFHFHGHHGHHHH', 'AS:i:82', 'XN:i:0', 'XM:i:0', 'XO:i:0', 'XG:i:0', 'NM:i:0', 'MD:Z:41', 'YS:i:174', 'YT:Z:DP']
    ['M03766:33:000000000-AT3T3:1:1101:21081:6509', '177', 'IGF2', '452', '36', '5S87M', 'NMI', '880', '0', 'TCTCTAGGCCAAACGTCACCGTCCCCTGATTGCTCTACCCACCCAAGACCCCGCCCACGGGGGCGCCCCCCCAGAGATGGCCAGCAATCGGA', '/BBB/BBBFFFEFFFEEFAFB?FFFFBFFFFFFFEB;@-DFFFFFFD@FFFFEFFFFAFFFFDAFGCGGHGGHHHHHHHFFHHHGFEGFHHH', 'AS:i:174', 'XN:i:0', 'XM:i:0', 'XO:i:0', 'XG:i:0', 'NM:i:0', 'MD:Z:87', 'YS:i:82', 'YT:Z:DP']
    '''
    #if len( sys.argv ) < 2:
    #    print "python SAM.py ../data/roth2016_control_set_plus_control.fa output/2016-12-22_MiSeq/Friedrich/17543_S1.sam"
    #    sys.exit(0)
    total_cnt = 0
    RNA_fa = fasta.read_fasta(fasta1)
    fa = fasta.read_fasta(fasta2)
    filepath1 = exact_file1  # sys.argv[3] # read1 = bait
    filepath2 = blastn_file2  #sys.argv[4] # read2 = prey

    # Read1 = bait = RNA part
    PREV_QNAME = ""
    f = open(filepath1)
    read_cnt = 0
    for line in f.xreadlines():
        #if read_cnt % 10000 == 0: print read_cnt
        read_cnt += 1
        ## READ 1
        # @M03766:53:000000000-B63MG:1:1101:13982:1738	cask_p142	98.969	97	1	0	1	97	99	3	3.06e-50	184
        [
            QNAME, TARGET, PERCENT, LENGTH, MISMATCH, GAPOPEN, QSTART, QEND,
            SSTART, SEND, EVALUE, BITSCORE
        ] = line[:-1].split("\t")
        if QNAME == PREV_QNAME: continue
        if int(SEND) > int(SSTART):
            continue  # don't allow both directions for RNA
        read1_dic[QNAME] = TARGET
        PREV_QNAME = QNAME
    f.close()

    # Read2 = prey = Protein part
    PREV_QNAME = ""
    read_cnt = 0
    f = open(filepath2)
    for line in f.xreadlines():
        #if read_cnt % 10000 == 0: print read_cnt
        read_cnt += 1
        ## READ 2
        # @M03766:53:000000000-B63MG:1:1101:13982:1738  cask_p142       98.969  97      1       0       1       97      99      3       3.06e-50        184
        [
            QNAME, TARGET, PERCENT, LENGTH, MISMATCH, GAPOPEN, QSTART, QEND,
            SSTART, SEND, EVALUE, BITSCORE
        ] = line[:-1].split("\t")
        if QNAME == PREV_QNAME: continue
        if int(SEND) > int(SSTART): continue
        read2_dic[QNAME] = TARGET
        PREV_QNAME = QNAME
    f.close()

    for QNAME in read1_dic:
        TARGET2 = read2_dic.get(QNAME, "")
        if TARGET2 == "": continue
        TARGET1 = read1_dic[QNAME]
        ppi_cnt_dic[(TARGET1, TARGET2)] = ppi_cnt_dic.get(
            (TARGET1, TARGET2), 0) + 1
        total_cnt += 1

    RNA_id_list = RNA_fa.keys()
    RNA_id_list.sort()

    id_list = fa.keys()
    id_list.sort()

    fo = open(output_file, "w")
    print >> fo, "# This file is generated by BLASTN_RNA"
    print >> fo, "DB(Read 1) \ AD(Read 2)\t" + "\t".join(id_list)
    for RNA_id1 in RNA_id_list:
        output = RNA_id1
        for id2 in id_list:
            cnt = ppi_cnt_dic.get((RNA_id1, id2), 0)
            output += "\t%d" % cnt
        print >> fo, output

    fo.close()
Example #32
#!/usr/bin/env python

from __future__ import division

from fasta import read_fasta


def gc_percentage(strand):
    return sum(c in ['C', 'G'] for c in strand) / len(strand) * 100 \
        if strand else 0


if __name__ == '__main__':
    import sys

    strands = read_fasta(open(sys.argv[1]))
    gc_content = dict((name, gc_percentage(strands[name])) for name in strands)
    max_strand = max(gc_content, key=gc_content.get)
    print max_strand
    print "%2.6f%%" % gc_content[max_strand]
Example #33
from collections import OrderedDict
from itertools import chain

from fasta import read_fasta


def directed_edges(node, nodes):
    edges = []
    name, strand = node
    for n, s in nodes:
        if strand[-3:] == s[:3]:
            edges.append(n)
    return edges


def all_adjacencies(nodes):
    adjacencies = OrderedDict()
    for name, strand in nodes.iteritems():
        adjacencies[name] = directed_edges(
            (name, strand),
            ((n, s) for n, s in nodes.iteritems() if n != name))
    return adjacencies


if __name__ == '__main__':
    import sys

    nodes = read_fasta(open(sys.argv[1]))
    adjacencies = all_adjacencies(nodes)
    for tail in (name for name in adjacencies if adjacencies[name]):
        print '\n'.join("%s %s" % (tail, head) for head in adjacencies[tail])
Example #34
# from fasta import read_fasta

from __future__ import division  # true division, so the percentages below are not truncated to zero

import fasta
import codon

#sekuens = fasta.read_fasta('flu_A.fasta')

#print(sekuens)

data = fasta.read_fasta('flu_A.fasta')

nama = data[0][0]
sekuens = data[0][1]

# count the percentages of A, T, G, C

sum_basa_adenin = sekuens.count("A")
sum_basa_timin = sekuens.count("T")
sum_basa_guanin = sekuens.count("G")
sum_basa_citocin = sekuens.count("C")

total_basa = sum_basa_adenin + sum_basa_timin + sum_basa_guanin + sum_basa_citocin

print ("\n================Percentages of A, T, G, C===============")
print ("Percentage A = %.2f%%" %((sum_basa_adenin / total_basa) * 100))
print ("Percentage T = %.2f%%" %((sum_basa_timin / total_basa) * 100))
print ("Percentage G = %.2f%%" %((sum_basa_guanin / total_basa) * 100))
print ("Percentage C = %.2f%%" %((sum_basa_citocin / total_basa) * 100))

# transcription
Example #35
def main(unc, cor, fa, sorted=False, verbose=False):

    if verbose:
        print "Reading pacbio fasta"
    reads, names = fasta.read_fasta(fa)

    cor_aligned = 0
    unc_aligned = 0

    tp = 0
    fp = 0
    fn = 0
    #ne = 0
    tn = 0
    '''
  From ec_toolkit compute-stats.py:

  errorStats['TP'] += len(errPreCorrect.difference(errPostCorrect))
  errorStats['FP'] += len(errPostCorrect.difference(errPreCorrect))
  errorStats['FN'] += len(errPreCorrect.intersection(errPostCorrect))
  errorStats['NE'] += getNumWrongBase(errPreCorrect,errPostCorrect)

  # apparently, NE is the number of bases changed, but still incorrect
  '''
    '''
  From Error Correction Toolkit paper:

  We use the following measures for each program:
  number of erroneous bases identified and
  successfully corrected (true positives, TP), correct
  bases wrongly identified as errors and changed
  (false positives, FP), and erroneous bases that were
  either uncorrected or falsely corrected (false negatives,
  FN). We report sensitivity and specificity for
  each program. Then, we combine these into the gain
  metric [21], defined by gain = (TP - FP) /
  (TP + FN), which is the percentage of errors
  removed from the data set by the error-correction
  program. A negative gain value indicates that more
  errors have been introduced due to false corrections,
  which is not captured by measures such as sensitivity
  and specificity.
  '''
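    # hypothetical worked example: TP=90, FP=10, FN=20 gives
    # gain = (90 - 10) / (90 + 20) = 0.727, i.e. ~73% of errors removed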

    if not sorted:
        uncor = {}
        corr = {}

        if verbose:
            print "Reading uncorrected TEF"
        for line in open(unc):
            data = line.strip().split(' ')
            fields = [int(a) for a in data[1:]]
            assert fields[0] == (len(fields) -
                                 1) / 4, "Number of errors does not match list"
            uncor[data[0]] = [
                fields[i:i + 4] for i in xrange(1, len(fields), 4)
            ]

        if verbose:
            print "Reading corrected TEF"
        for line in open(cor):
            data = line.strip().split(' ')
            fields = [int(a) for a in data[1:]]
            assert fields[0] == (len(fields) -
                                 1) / 4, "Number of errors does not match list"
            corr[data[0]] = [
                fields[i:i + 4] for i in xrange(1, len(fields), 4)
            ]

        if verbose:
            print "Some uncorrected reads:"
            print uncor.keys()[:10]
            print
            print "Some corrected reads:"
            print corr.keys()[:10]

        for n in names:
            if not uncor.has_key(n) and not corr.has_key(n):
                continue
            if not uncor.has_key(n) and corr.has_key(n):
                cor_aligned += 1
                continue
            if uncor.has_key(n) and not corr.has_key(n):
                unc_aligned += 1
                continue
            cor_aligned += 1
            unc_aligned += 1
            un = uncor[n]
            co = corr[n]

            if verbose:
                print
                print n
                print "%i errors in uncorrected read" % len(un)
                print "%i errors in corrected read" % len(co)

            cpos = set([c[0] for c in co])
            upos = set([u[0] for u in un])

            read_tp = len(upos - cpos)
            read_fp = len(cpos - upos)
            read_fn = len(cpos & upos)
            read_tn = len(reads[n]) - read_tp - read_fp - read_fn

            # not obviously trivial to compute this using set operations
            #read_ne = 0

            tp += read_tp
            fp += read_fp
            fn += read_fn
            tn += read_tn

    else:  # sorted TEF
        cor_in = open(cor)
        uncor_in = open(unc)

        cor_aligned += 1
        cor_line = cor_in.readline()
        cor_data = cor_line.strip().split(' ')
        #cor_fields = [int(a) for a in cor_data[1:]]

        unc_aligned += 1
        uncor_line = uncor_in.readline()
        uncor_data = uncor_line.strip().split(' ')
        #uncor_fields = [int(a) for a in uncor_data[1:]]

        while len(cor_line) > 0 and len(uncor_line) > 0:

            if cor_data[0] == uncor_data[0]:
                n = cor_data[0]
                #co = [cor_fields[i:i+4] for i in xrange(1, len(cor_fields), 4)]
                #un = [uncor_fields[i:i+4] for i in xrange(1, len(uncor_fields), 4)]
                co = [cor_data[i] for i in xrange(1, len(cor_data), 4)]
                un = [uncor_data[i] for i in xrange(1, len(uncor_data), 4)]

                if verbose:
                    print
                    print n
                    print "%i errors in uncorrected read" % len(un)
                    print "%i errors in corrected read" % len(co)

                #cpos = set([c[0] for c in co])
                #upos = set([u[0] for u in un])
                cpos = set(co)
                upos = set(un)

                read_tp = len(upos - cpos)
                read_fp = len(cpos - upos)
                read_fn = len(cpos & upos)
                read_tn = len(reads[n]) - read_tp - read_fp - read_fn

                # not obviously trivial to compute this using set operations
                #read_ne = 0

                tp += read_tp
                fp += read_fp
                fn += read_fn
                tn += read_tn

            if uncor_data[0] <= cor_data[0]:
                unc_aligned += 1
                uncor_line = uncor_in.readline()
                if len(uncor_line) > 0:
                    uncor_data = uncor_line.strip().split(' ')
                #uncor_fields = [int(a) for a in uncor_data[1:]]

            else:  #if cor_data[0] < uncor_data[0]:
                cor_aligned += 1
                cor_line = cor_in.readline()
                if len(cor_line) > 0:
                    cor_data = cor_line.strip().split(' ')
                #cor_fields = [int(a) for a in cor_data[1:]]

        # readline() returns '' at EOF (never None), so test for empty strings
        if len(cor_line) == 0:
            while len(uncor_line) > 0:
                unc_aligned += 1
                uncor_line = uncor_in.readline()
        if len(uncor_line) == 0:
            while len(cor_line) > 0:
                cor_aligned += 1
                cor_line = cor_in.readline()
        cor_in.close()
        uncor_in.close()

    if tp + fn == 0:
        raise Exception("No read names matched (uncor: {}, cor: {})".format(
            uncor_data[0], cor_data[0]))

    gain = float(tp - fp) / (tp + fn)

    print "Sample\tMethod\tUncorrected reads\tCorrected reads\tRead gain/loss\tTP\tFP\tTN\tFN\tSensitivity\tSpecificity\tGain"
    print "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%.4f\t%.4f\t%.4f" % (
        fa, cor, unc_aligned, cor_aligned,
        (cor_aligned - unc_aligned), tp, fp, tn, fn, (float(tp) / (tp + fn)),
        (float(tn) / (tn + fp)), gain)
Example #36
def load_fasta_db_into_proteins(proteins,
                                fasta_db,
                                clean_seqid=None,
                                iso_leu_isomerism=False):
    seqids, fastas = fasta.read_fasta(fasta_db)
    load_fastas_into_proteins(proteins, fastas, clean_seqid, iso_leu_isomerism)
Example #37
if verbose:
    library.hello( __file__ )

try:
	target_fasta=0

	sequence="";
	start=args.start
	end=args.end

	from cs import ProtCSFile
	tab=ProtCSFile()
	tab.read_file( args.infile )
	sequence=tab.sequence
	if not sequence and args.correct_fasta:
		sequence=fasta.read_fasta(args.correct_fasta)
		tab.set_sequence(sequence)
	elif sequence and args.correct_fasta:
		sys.stderr.write('WARNING: overwriting sequence in .prot file with input from -correct_fasta; is this really intended?\n')
	if args.fasta:
		if start or end:
			exit('cannot choose -fasta together with -start and -end for trimming')
		if sequence:
			target_fasta=fasta.read_fasta(args.fasta)
			start=-fasta.find_fasta_offset(target_fasta,sequence,verbose)+1
			end=start+len(target_fasta)-1;
		else:
			exit('WARNING: cannot use fasta to trim since there is no sequence information in the .prot file')

	if args.rigid:
		if start or end: exit('cannot choose -rigid together with -start and -end for trimming')
Example #38
#!/usr/bin/env python

#import two modules
import dnatranslate
import fasta
import sys


#read the fasta file in one line: open the file, read the contents
#and send it to the fasta reading function
dna = fasta.read_fasta(open(sys.argv[1], 'r').readlines())

for item in dna:
    #translate the DNA
    protein = dnatranslate.translate_dna(item.sequence)
    print item.name
    #format and print the protein
    print fasta.format_output(protein, 60)
Example #39
    #search all entries of the pattern
    sites = searchpattern.findall(sequence)
    temppos = searchpattern.finditer(sequence)
    for i in temppos:
        begin, end = i.span()
        positions.append(begin)

    return sites, positions

#read the enzyme name
enzyme = sys.argv[1]
#read the list
enzymeset = read_enzymes(open('bionet.709', 'r'))
isname = check_enzyme(enzyme, enzymeset)

if isname:
    print 'Name found'
    #if we found the enzyme name we read the sequence file
    sequences = fasta.read_fasta(open(sys.argv[2], 'r').readlines())
    for item in sequences:
        #let's search
        sites, positions = find_sites(enzyme, enzymeset, item.sequence)
        #print the sequence name
        print item.name[:20]+'...'
        #and use the zip function to combine the lists and print
        for i in zip(sites,positions):
            print i[0], '->', i[1]
#if the name is not found, we bail out
else:
    print 'Enzyme name not found, please try again'
Example #40
                    elif star[i] == '-': # gap in star, so a gap is inserted into every curr
                        aligns[k][0] = insert_gap(aligns[k][0], i)
                        aligns[k][1] = insert_gap(aligns[k][1], i)
                        curr = aligns[k][0]
                    elif curr[i] == star[i]:
                        continue
                merged.append(aligns[k][1])

    return merged

if __name__ == "__main__":

    #seq = ["ATTGCCATT", "ATGGCCATT", "ATCCAATTTT", "ATCTTCTT", "ATTGCCGATT"]
    import fasta
    
    fa = fasta.read_fasta('h5n1.fasta')
    
    seq = [fa[0][1][:100],fa[1][1][:100],fa[2][1][:100]]
    
    sim_matrix = {
        ('A','A'): +2,
        ('G','A'): -1,  ('G','G'): +2,
        ('C','A'): -1,  ('C','G'): -1,  ('C','C'): +2,
        ('T','A'): -1,  ('T','G'): -1,  ('T','C'): -1,  ('T','T'): +2
    }
    gap_penalty = -1
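    # star alignment: each sequence is aligned pairwise against a chosen
    # centre ("star") sequence, and the pairwise alignments are then merged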

    star, aligns = all_pairs(seq, sim_matrix, gap_penalty)
    merged = merge_alignments(star, aligns)

    for m in merged:
Example #41
#output:
verbose=1
if args.outfile=="stdout":
	outfile=sys.stdout
	verbose=0
else:
	outfile=open(args.outfile,'w');

####### program start
if verbose:
	library.hello( __file__ )

try:
	sequence=None
	if args.fasta:
		sequence=read_fasta( args.fasta )
	if args.seq:
		sequence=read_aa3_sequence( args.seq)

	prot = ProtCSFile()
	prot.read_file( args.infile, sequence )

	if not sequence:
		sequence = prot.sequence

	talos = TalosCSFile()
	talos.from_table( prot, sequence=sequence )
	talos.write( outfile )


except library.LibException as inst:
Example #42
def main(unc, cor, fa):

    print "Reading pacbio fasta"
    reads, names = fasta.read_fasta(fa)

    uncor = {}
    corr = {}

    print "Reading uncorrected alignments"
    for query_name, alignments in aln_formats.iter_maf(unc,
                                                       0,
                                                       0,
                                                       1000000000,
                                                       by_query=True):
        al = sorted(alignments,
                    key=lambda al:
                    (al.accuracy() * abs(al.query.end - al.query.start)))[
                        -1]  # keep only "best" alignment, by total correct bp
        uncor[al.query.name] = al

    print "Reading corrected alignments"
    for query_name, alignments in aln_formats.iter_maf(cor,
                                                       0,
                                                       0,
                                                       1000000000,
                                                       by_query=True):
        al = sorted(alignments,
                    key=lambda al:
                    (al.accuracy() * abs(al.query.end - al.query.start)))[-1]
        corr[al.query.name] = al

    new_aligned = 0
    new_unaligned = 0

    for n in names:
        if not uncor.has_key(n) and not corr.has_key(n):
            continue
        if not uncor.has_key(n) and corr.has_key(n):
            new_aligned += 1
            continue
        if uncor.has_key(n) and not corr.has_key(n):
            new_unaligned += 1
            continue
        un = uncor[n]
        co = corr[n]
        print
        print n
        print un
        print co
        # tally errors in the uncorrected alignment; q and t track the current
        # query/target positions (initialised one before the alignment start)
        n_errs = 0
        q = un.query.start - 1
        t = un.target.start - 1
        for i in xrange(len(un.query.alignment)):
            qlocus = un.query.alignment[i]
            tlocus = un.target.alignment[i]
            if qlocus == tlocus:
                continue

            n_errs += 1
            if tlocus == '-':
                q += 1
            elif qlocus == '-':
                t += 1
            else:
                q += 1
                t += 1
Example #43
from utility import GaussianDistribution
from assignment.noesy import Resonance, Atom


parser = ExampleArgumentParser(prog=basename(__file__), description="make autoNOE-Rosetta readable chemical shift list from any column based format",
examples=['%(prog)s input.prot -fasta input.fasta | awk \'NF>1{print}\'| sort -n -k 5 > proper.prot'])
parser.add_argument("input", help="A shift file");
parser.add_argument("-check", help="check the CS of protons matching bmrb statistics or not, if not, delete.", action='store_true', default=False);
parser.add_argument("-threshold", help="threshold",type=float,default=0.1);
mutex=parser.add_mutually_exclusive_group()
mutex.add_argument("-fasta",help="figure out the sequence");
library.add_standard_args( parser )

args = parser.parse_args()

target_seq=fasta.read_fasta(args.fasta)
resonance_list=noesy.ResonanceList.read_from_stream( open(args.input,'r') )
resonance_list.set_sequence(target_seq)

if args.check:
	delete_res=[]
	data_library=AATypeShiftDistributionLibrary(target_seq)
	if 'csrosettaDir' not in environ:
		print 'Please setup csrosettaDir to your environment'
		exit()
	list=open(environ['csrosettaDir']+"/database/cs_distribution.txt",'r').readlines()
	for line in list:
		tags=line.split()
		if tags[0]=='Res': continue
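		# columns (assumed): tags[0] = 3-letter residue, tags[1] = atom name;
		# tags[6]/tags[7] presumably the mean and std-dev of the shift distribution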
		data_library.add_distribution(tags[1],three_to_one(tags[0]),GaussianDistribution(float(tags[6]),float(tags[7])))
	for resonance in resonance_list.itervalues():
Example #44
	verbose=0
else:
	outfile=open(args.outfile,'w');

####### program start
if verbose:
	library.hello( __file__ )


try:
	target=0
	start=args.start
	end=args.end
	pdb_fasta=fasta.pdb2fasta(args.infile)
	if args.fasta:
		target=fasta.read_fasta(args.fasta)
		start=-fasta.find_fasta_offset(target,pdb_fasta)+1
		end=start+len(target)-1;
		print pdb_fasta
		print '-'*(start-1)+target
		if verbose:
			print "worked out trimming from fasta-sequences: start: %d end: %d"%(start,end)

	if args.rigid:
		start,end=library.read_rigid_file( args.rigid )

	if verbose>0: print 'Will trim from %d to %d'%(start,end)

	if pdb_fasta:
		pdb_fasta, end=fasta.cut_sequence(pdb_fasta,start,end,verbose)
#input: