def test_write_sequences_to_fasta_file(self):
    """Round-trip test: sequences written to a FASTA file read back unchanged"""
    # Local imports keep this method self-contained.
    import os
    import tempfile

    seqs = st.read_sequences_from_fasta_file('testdata/fasta_test.fa')

    # A unique temporary file replaces the fixed '/tmp/fasta_tmp.fa' path,
    # which was shared between tests, not portable and never removed.
    handle, tmp_path = tempfile.mkstemp(suffix='.fa')
    os.close(handle)
    try:
        with open(tmp_path, 'w') as outputfile:
            st.write_sequences_to_fasta_file(outputfile, seqs)
        # Read back only after the file has been closed (and thus flushed).
        seqs2 = st.read_sequences_from_fasta_file(tmp_path)
    finally:
        os.remove(tmp_path)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(seqs, seqs2)
def test_write_sequences_to_fasta_file_empty_seqs(self):
    """Ensures that only non-empty sequences are written to a FASTA file"""
    # Local imports keep this method self-contained.
    import os
    import tempfile

    # 'seq2' has an empty sequence and is expected to be dropped on write.
    seqs = [['seq1', 'TATATA'], ['seq2', '']]

    # A unique temporary file replaces the fixed '/tmp/fasta_tmp.fa' path,
    # which was shared between tests, not portable and never removed.
    handle, tmp_path = tempfile.mkstemp(suffix='.fa')
    os.close(handle)
    try:
        with open(tmp_path, 'w') as outputfile:
            st.write_sequences_to_fasta_file(outputfile, seqs)
        # Read back only after the file has been closed (and thus flushed).
        seqs2 = st.read_sequences_from_fasta_file(tmp_path)
    finally:
        os.remove(tmp_path)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(1, len(seqs2))
    self.assertEqual(seqs[0][0], seqs2[0][0])
    self.assertEqual(seqs[0][1], seqs2[0][1])
def test_read_sequences_from_fasta_file(self):
    """Tests reading sequences from a file in FASTA format"""
    # The previous version also read the whole file into an unused local;
    # that dead read has been removed.
    seqs = st.read_sequences_from_fasta_file('testdata/fasta_test.fa')
    self.assertEqual(7, len(seqs))

    # Expected name and sequence of the last (7th) entry in the test file.
    seq = ("CCGAGGAAGACAGACGCAATTTCACATCGAACTCGTGTACGGCATCCTCT" +
           "TTATTGCCGGCTTTGCTTTTCTCGTCTTCCGCGTCGATCCCCGGGTGGCA" +
           "GCGTTCGAAGGAGGTCTCGTCATTGGTTACTTATTGAGAATTTAGGGGAA" +
           "AATGTCAATCTACGAGTGGA")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual('VNG6198H', seqs[6][0])
    self.assertEqual(seq, seqs[6][1])
def make_sequences(genome_fasta_file, gene_features_file,
                   outfile='sequences.csv',
                   distance={'upstream': 300, 'downstream': 100},
                   from_end=False, fasta=False):
    """Extract one flanking sequence per gene feature and write them to a file.

    Contigs are read from `genome_fasta_file` and features from
    `gene_features_file`. For every feature a sequence is cut out of the
    feature's contig and stored under the feature's id.

    Parameters:
    genome_fasta_file  -- FASTA file with the contig sequences
    gene_features_file -- file passed to st.read_features_from_file
    outfile            -- path of the output file (overwritten)
    distance           -- dict with the keys 'upstream' and 'downstream';
                          the default dict is only read, never modified
    from_end           -- if true, call st.extract_downstream with
                          (upstream, downstream); otherwise call
                          st.extract_upstream with (-downstream, upstream)
    fasta              -- if true, write FASTA through
                          st.write_sequences_to_fasta_file; otherwise write
                          one 'id,sequence' line per feature

    Raises KeyError if `distance` lacks one of its keys or if a feature
    refers to a contig that is not present in the FASTA file.
    """
    if from_end:
        span = (distance['upstream'], distance['downstream'])
    else:
        # WARNING: as of 2012-03-22, the st.extract functions used flipped
        # distances! e.g. distance[1] is the UPSTREAM distance and
        # distance[0] is the DOWNSTREAM.
        # CHECK YOUR SEQUENCES after running this!
        # Also, a negative number is expected for DOWNSTREAM. So, (-100,300)
        # must be passed to st.extract_upstream in order to get a sequence
        # from 300 upstream to 100 downstream. WEIRD!
        span = (-1 * distance['downstream'], distance['upstream'])

    # The reader returns (name, sequence) pairs; index them by contig name.
    contig_dict = dict(st.read_sequences_from_fasta_file(genome_fasta_file))
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('loaded %i contigs' % len(contig_dict))
    print(','.join(['%s: %ibp' % (name, len(seq))
                    for name, seq in contig_dict.items()]))

    features = st.read_features_from_file(gene_features_file)
    print('loaded %i features' % len(features))

    # The extraction function depends only on from_end, so pick it once.
    extract = st.extract_downstream if from_end else st.extract_upstream
    sequences = []
    for feature in features.values():
        location = feature.location()
        # Element [1] of the extract result is used as the sequence.
        extracted = extract(contig_dict[location.contig], location, span)[1]
        sequences.append((feature.id(), extracted))

    # 'with' guarantees the file is closed even if writing fails.
    with open(outfile, 'w') as outf:
        if fasta:
            st.write_sequences_to_fasta_file(outf, sequences)
        else:
            for seq_id, seq in sequences:
                outf.write('%s,%s\n' % (seq_id, seq))
def make_sequences(genome_fasta_file, gene_features_file,
                   outfile='sequences.csv',
                   distance={'upstream': 300, 'downstream': 100},
                   from_end=False, fasta=False):
    """Extract one flanking sequence per gene feature and write them to a file.

    Contigs are read from `genome_fasta_file` and features from
    `gene_features_file`. For every feature a sequence is cut out of the
    feature's contig and stored under the feature's id. This variant reads
    `feature.location` and `feature.id` as plain attributes.

    Parameters:
    genome_fasta_file  -- FASTA file with the contig sequences
    gene_features_file -- file passed to st.read_features_from_file
    outfile            -- path of the output file (overwritten)
    distance           -- dict with the keys 'upstream' and 'downstream';
                          the default dict is only read, never modified
    from_end           -- if true, call st.extract_downstream with
                          (upstream, downstream); otherwise call
                          st.extract_upstream with (-downstream, upstream)
    fasta              -- if true, write FASTA through
                          st.write_sequences_to_fasta_file; otherwise write
                          one 'id,sequence' line per feature

    Raises KeyError if `distance` lacks one of its keys or if a feature
    refers to a contig that is not present in the FASTA file.
    """
    if from_end:
        span = (distance['upstream'], distance['downstream'])
    else:
        # WARNING: as of 2012-03-22, the st.extract functions used flipped
        # distances! e.g. distance[1] is the UPSTREAM distance and
        # distance[0] is the DOWNSTREAM.
        # CHECK YOUR SEQUENCES after running this!
        # Also, a negative number is expected for DOWNSTREAM. So, (-100,300)
        # must be passed to st.extract_upstream in order to get a sequence
        # from 300 upstream to 100 downstream. WEIRD!
        span = (-1 * distance['downstream'], distance['upstream'])

    # The reader returns (name, sequence) pairs; index them by contig name.
    contig_dict = dict(st.read_sequences_from_fasta_file(genome_fasta_file))
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('loaded %i contigs' % len(contig_dict))
    print(','.join(['%s: %ibp' % (name, len(seq))
                    for name, seq in contig_dict.items()]))

    features = st.read_features_from_file(gene_features_file)
    print('loaded %i features' % len(features))

    # The extraction function depends only on from_end, so pick it once.
    extract = st.extract_downstream if from_end else st.extract_upstream
    sequences = []
    for feature in features.values():
        location = feature.location
        # Element [1] of the extract result is used as the sequence.
        extracted = extract(contig_dict[location.contig], location, span)[1]
        sequences.append((feature.id, extracted))

    # 'with' guarantees the file is closed even if writing fails.
    with open(outfile, 'w') as outf:
        if fasta:
            st.write_sequences_to_fasta_file(outf, sequences)
        else:
            for seq_id, seq in sequences:
                outf.write('%s,%s\n' % (seq_id, seq))
# NOTE(review): the line below is a whitespace-collapsed excerpt. It is kept
# byte-for-byte because its original indentation (and therefore the nesting
# of its statements) cannot be recovered from what is visible here.
#
# First part: the tail of an HTML-writing function whose `def` line is not
# visible. It writes an "Interesting motifs" <h3> heading to htmlfile, looks
# at patterns in buckets[i], calls output_pattern() (apparently only when
# seekpat.usable() is true -- confirm against the full source) and finally
# writes the closing </body></html> tags.
#
# Second part: the `__main__` driver (Python 2 print statements). Without a
# command-line argument it prints a usage message. Otherwise sys.argv[1] is
# taken as the FASTA file name: its sequences are read, the companion
# '<name>.mix' file is read with read_mixfile, and then make_buckets,
# compute_pattern_relationships, search_patterns and output_results are
# called in that order. `reverse` is always False here; the optional [S]
# argument shown in the usage text is not read in the visible code.
htmlfile.write('<h3>Interesting motifs (not highest-ranking) can also be:</h3>') has_advice = False if len(buckets[i]) > 0: pat_index = 0 seekpat = buckets[i][pat_index] if seekpat.score > 0: pat_index += 1 if seekpat.usable(): output_pattern(htmlfile, seekpat, seqs, reverse) htmlfile.write("</body></html>") if __name__ == '__main__': print "Adviser Python" if len(sys.argv) <= 1: print "usage: python adviser.py <fasta-file> [S]" else: basename = sys.argv[1] reverse = False print "Processing '%s'" % basename seqs = st.read_sequences_from_fasta_file(basename) mixfile = '%s.mix' % basename print "Running adviser on '%s'" % mixfile mix_entries = read_mixfile(mixfile) # seems correct buckets = make_buckets(mix_entries) # seems correct compute_pattern_relationships(buckets, reverse) # seems correct search_patterns(buckets, seqs, reverse) # fixed output_results(seqs, mix_entries, buckets, sys.argv[1], reverse)