def write_as_fasta(self, fh, n=None):
    """-----------------------------------------------------------------------------------------
    Write ORFs from self.orf to an open file in fasta format. If n is given, only the ORF at
    that index is written; otherwise every ORF in the list is written.

    Each record is written with 60 residues per line and is followed by a newline. The
    documentation line has the form 'len=.. strand=.. frame=.. begin=.. end=..', taken from
    the 'length', 'direction', 'frame', 'begin' and 'end' keys of the ORF.

    :param fh: open filehandle for writing
    :param n: integer, index of ORF to write (negative values count from the end of the
              list); write all if not specified. An index outside the list writes nothing.
    :return: integer, number of ORFs written
    -----------------------------------------------------------------------------------------"""
    if n is None:
        # write all ORFs
        selected = self.orf
    elif -len(self.orf) <= n < len(self.orf):
        # write only the selected ORF
        selected = [self.orf[n]]
    else:
        # index out of range in either direction: nothing to write
        selected = []

    # a single Fasta object is reused as a formatter for every record
    fasta = Fasta()
    nwritten = 0
    for orf in selected:
        fasta.id = orf['id']
        fasta.doc = 'len={} strand={} frame={} begin={} end={}'.format(
            orf['length'], orf['direction'], orf['frame'], orf['begin'], orf['end'])
        fasta.seq = orf['sequence']
        fh.write(fasta.format(linelen=60))
        fh.write('\n')
        nwritten += 1

    return nwritten
def getSequence():
    """-----------------------------------------------------------------------------------------
    Handle upload of one of the two sequences and render the dashboard.

    On a POST request, the uploaded file is taken from request.files under 'file1' (stored as
    sequence 0; sequence 1 is then marked 'next') or 'file2' (stored as sequence 1). The file
    is read as fasta, echoed to stdout, and stored in state['seq'][i]['fasta'] with status
    'loaded'. A POST that carries neither file leaves the sequence state untouched.

    state['params']['seqtype'] is reset to 'protein' on every call and changed to 'DNA' only
    when both sequences are loaded and both report isACGT().

    :return: rendered 'dashboard.html' template with the current state
    -----------------------------------------------------------------------------------------"""
    if request.method == 'POST':
        f = None
        sequence = None
        if 'file1' in request.files:
            f = request.files['file1']
            sequence = 0
            state['seq'][1]['status'] = 'next'
        elif 'file2' in request.files:
            f = request.files['file2']
            sequence = 1

        # only load when a file was actually uploaded; otherwise there is no filehandle to
        # read and no sequence slot to store the result in
        if f is not None:
            fasta = Fasta(fh=f)
            fasta.read()
            print(fasta.format())
            seq = state['seq'][sequence]
            seq['fasta'] = fasta
            seq['status'] = 'loaded'

    # if both sequences have been selected, check whether the sequences are DNA or protein
    state['params']['seqtype'] = 'protein'
    # compare status strings by value; identity ('is') only works by accident of interning
    both_loaded = (state['seq'][0]['status'] == 'loaded'
                   and state['seq'][1]['status'] == 'loaded')
    if both_loaded:
        if state['seq'][0]['fasta'].isACGT() and state['seq'][1]['fasta'].isACGT():
            state['params']['seqtype'] = 'DNA'

    return render_template('dashboard.html', state=state)
n_notmatch[fastafile] = 0 n_file += 1 while fasta.next(): n_sequence[fastafile] += 1 n_total += 1 if fasta.id in idlist or not idlist: # desired selected sequences fasta.trimDocByRegex(trim) seqlen = len(fasta.seq) if args.minlen and seqlen < args.minlen: # skip sequences shorter than minimum length, if specified continue out.write('{}\n'.format(fasta.format(linelen=args.linelen))) n_written += 1 n_match[fastafile] += 1 if fasta.id in n_found: n_found[fasta.id] += 1 else: n_found[fasta.id] = 1 else: # not selected sequence n_notmatch[fastafile] += 1 sys.stderr.write('files read: {}\n'.format(n_file)) sys.stderr.write('total sequences read: {}\n'.format(n_total)) sys.stderr.write('total sequences written: {}\n'.format(n_written))
"""---------------------------------------------------------------------------------------------------------------------
Remove the Trinity path information from the id line

usage
    fasta_reformat.py *.fasta

For every file matching the target pattern a new file named <input>.reformatted is written,
with any ' path=[...]' annotation removed from each sequence's documentation line.
---------------------------------------------------------------------------------------------------------------------"""
import glob
import sys
import re
from sequence.fasta import Fasta

# number of sequence characters per output line
linelen = 60

# default target file name, overridden by the first command line argument
target = '*.fasta'
if len(sys.argv) > 1:
    target = sys.argv[1]
print(' target file:', target)

for fastafile in glob.glob(target):
    # output file; the with block guarantees it is flushed and closed before the next input
    outfile = fastafile + '.reformatted'
    with open(outfile, 'w') as out:
        print(' input file:', fastafile, ' output file:', outfile)

        fasta = Fasta()
        fasta.open(fastafile)
        while fasta.next():
            # strip the ' path=[...]' annotation from the documentation
            fasta.doc = re.sub(r' path=\[[^]]+\]', '', fasta.doc)
            out.write(fasta.format(linelen=linelen))
# Collect the first occurrence of every sequence ID across all files matching target, report
# per-file and running counts on stderr, then write the unique sequences to stdout.
n_uniqueperfile = 0
n_total = 0
n_unique_total = 0
sys.stderr.write('{}\t{}\t\t{}\n'.format('file', 'per file', 'total'))

for fastafile in glob.glob(target):
    fasta = Fasta()
    fasta.open(fastafile)
    n_file += 1

    # per-file counters must start from zero for every file; without resetting the unique
    # count, the per-file column is cumulative and the running unique total is over-counted
    n_perfile = 0
    n_uniqueperfile = 0
    while fasta.next():
        n_perfile += 1
        if fasta.id in unique_seq:
            # ID already seen (in this or an earlier file): keep the first occurrence
            continue

        n_uniqueperfile += 1
        unique_seq[fasta.id] = fasta.format(linelen=100)

    n_total += n_perfile
    n_unique_total += n_uniqueperfile
    # columns: file number, file name, sequences in file, new unique in file,
    # total sequences so far, total unique so far
    sys.stderr.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(n_file, fastafile, n_perfile,
                                                       n_uniqueperfile, n_total,
                                                       n_unique_total))

# write out sequences
for seq in unique_seq:
    sys.stdout.write(unique_seq[seq])

exit(0)
diagonal[d] = filtered return nmatch # -------------------------------------------------------------------------------------------------- # Testing # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': print('\ntest 0: identity matching') print('\texpect 7 matches\n') fasta = Fasta() fasta.id = 'test0' fasta.doc = '5 letter DNA test' fasta.seq = 'ACAGT' print('{}\n'.format(fasta.format())) match = Match() match.s1 = fasta match.s2 = fasta nmatch = match.identityPos() print('matches: {}'.format(nmatch)) print('\ntest 1: identity matching, unequal length sequences') print('\texpect 11 matches\n') match = Match() fasta1 = Fasta() fasta1.id = 'test1.1' fasta1.doc = '5 letter DNA test' fasta1.seq = 'ACAGT'
print('in', n_current, 'sequences', end=' ') print('written to', outfilename) except NameError: pass n_out += 1 n_current = 0 base_current = 0 outfilename = '{0}.{1}.{2}'.format(outbase, n_out, outsuffix) outfile = open(outfilename, 'w') n_seq += 1 base_total += fasta.length() n_current += 1 base_current += fasta.length() outfile.write(fasta.format()) # report statistics for last file outfile.close() print(' ', base_current, 'bases/amino acids', end=' ') print('in', n_current, 'sequences', end=' ') print('written to', outfilename) # report overall statistics print('\n') print(base_total, 'characters from', n_seq, 'sequences written to', n_out, 'files\n')
nfeature += 1 elif info['Parent'] in flist: for k in info: if k not in flist[info['Parent']]: flist[info['Parent']][k] = info[k] else: # flist[info['ID']] = info sys.stderr.write('unknown feature {}\n'.format(info['feature'])) # write out sequences for gene in flist: thisgene = flist[gene] f = Fasta() f.id = thisgene['ID'] f.doc = '' for k in save: if k in thisgene: f.doc += ' {}:{}'.format(k, thisgene[k]) f.seq = seq[thisgene['seqname']][thisgene['begin'] - 1:thisgene['end']] if (thisgene['end'] - thisgene['begin'] > 100000): # coordinates cross origin f.seq = seq[thisgene['seqname']][thisgene['end'] - 1:] + seq[ thisgene['seqname']][:thisgene['begin']] if thisgene['strand'] == '-': f.seq = complement(f.seq) sys.stdout.write(f.format(linelen=100)) exit(0)
"phams":["56154"], "Start":15822, "Stop":16230, "Length":408, "Name":"24", "translation":"MTNVFTLDAMREETRKKYQPVKIGLSEDVTVELKPLLKLGKKAREAVADAVKEIEALPDEIDEDDEDSDELMDEVAEKICESIAKVFKLIATSPRKLLAELDTEEEPQIRAELYGAVLRTWMRET QLGEAAPSPN", "Orientation":"F", "Notes":"b'tail assembly chaperone'"} ... Michael Gribskov 10 April 2021 =================================================================================================""" import sys import json from sequence.fasta import Fasta # -------------------------------------------------------------------------------------------------- # main program # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': fp = open(sys.argv[1], 'r') phage = json.load(fp) for gene in phage['results']: f = Fasta() f.id = gene['GeneID'] f.seq = gene['translation'] f.doc = gene['Notes'][2:-1] print(f.format(linelen=100)) exit(0)
base = base.replace('.seq', '') sys.stdout.write('\n\tExpanded file: {}\n\tbasename: {}\n'.format( infilename, base)) outfilename = base + '.fasta' outfile = None try: outfile = open(outfilename, 'w') except: sys.stderr.write( 'Unable to open output file ({})\n'.format(outfilename)) exit(2) # process all sequences in the file n = 0 for seq in infile: fasta = Fasta() fasta.id = base + '_{}'.format(n) fasta.seq = seq.rstrip().upper() fasta.doc = 'length={}'.format(fasta.length()) outfile.write(fasta.format(linelen=100)) n += 1 infile.close() outfile.close() sys.stdout.write('\t{} sequences written to {}\n'.format( n, outfilename)) # end of loop over files exit(0)