def distance_comparison(dataframe, data_dir, test_name, samples=10000):
    simulated_dir = join(data_dir, test_name)
    for index, data in dataframe.iterrows():
        lengths = Counter()
        inner = []
        outer = []
        trainsets = glob(join(simulated_dir, index + '*', 'ref_seqs.fasta'))
        testsets = glob(join(simulated_dir, index + '*', 'query.fasta'))
        for train_fp, test_fp in zip(trainsets, testsets):
            train = list(map(str, io.read(train_fp, format='fasta')))
            test = list(map(str, io.read(test_fp, format='fasta')))
            if not train or not test:
                continue
            lengths.update(map(len, test))
            inner.extend(distances(train, train, samples))
            outer.extend(distances(train, test, samples))
        inner.sort()
        outer.sort()
        df = pd.DataFrame({'train/train': inner, 'train/test': outer})
        plt.figure()  # figsize=(width, height)
        ax = regplot('train/train', 'train/test', df, fit_reg=False)
        ax.set_title(index, fontsize=20)
        maxval = max((inner[-1], outer[-1]))
        plt.plot([0, maxval], [0, maxval], linewidth=2)
        plt.show()
def main():
    parser = argparse.ArgumentParser(
        description='This script will write out sequences based on \n'
                    'sequence identifiers in a label file.',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i', '--input_fasta', required=True, action='store',
                     help='Input fasta file.')
    req.add_argument('-l', '--input_sequence_labels', required=True,
                     action='store',
                     help='File in which the first item in each line is'
                          ' a sequence label / identifier.')
    req.add_argument('-o', '--output_fasta', required=True, action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-d', '--include_description', action='store_true',
                      help='Boolean. Keep the additional FASTA header '
                           'description text. [Default: False]')
    optp.add_argument('-r', '--remove_ids', action='store_true',
                      help='Boolean. Remove sequences with the corresponding '
                           'IDs, rather than keeping them. [Default: False]')
    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    # 'U' (universal newline) mode was removed in Python 3.11; plain 'r'
    # behaves the same here.
    input_labels = open(p.input_sequence_labels, 'r')
    output_fasta = open(p.output_fasta, 'w')
    remove_ids = p.remove_ids
    include_description = p.include_description

    seq_labels = parse_labels(input_labels)
    filter_seqs(input_fasta, output_fasta, seq_labels,
                remove_ids=remove_ids, desc=include_description)
    input_fasta.close()
    output_fasta.close()
def test_write_genes(self):
    genes = sample_genes(self.ortho_groups_fp, min_taxa_cutoff=4.0)
    write_genes(genes, self.ref_faa_dir, self.out_fa_dir, self.out_genes_fp)
    # test number of output FASTA files
    obs = sorted(listdir(self.out_fa_dir))
    exp = sorted(['%s.fa' % ogid for ogid in genes])
    self.assertListEqual(obs, exp)
    # test output gene list content
    with open(self.out_genes_fp, 'r') as f:
        obs = f.read()
    with open(self.write_genes_list, 'r') as f:
        exp = f.read()
    self.assertEqual(obs, exp)
    # test FASTA file content
    for seq in io.read(join(self.out_fa_dir, 'OG0000017.fa'),
                       format='fasta'):
        exp = 'GCF_000160655.1|WP_040356123.1'
        self.assertEqual(seq.metadata['id'], exp)
        exp = ('MYRKHYAADVTETLDGQTVQVAGWVHRRRDHGGVIFIDLRDRSGLVQIVIDPDTADAF'
               'ALAEQVRNEYCLAIEGRVRLRPAGTENPDLASGKIEILGKQLTVLSKSEPLPFQLDED'
               'NVSEEIRLKHRTIDLRRDVMQKNLILRSKVAASLRRYLDEHGFMDIETPMLTKATPEG'
               'ARDYLVPSRTHPGKFFALPQSPQLFKQMLMMSGFDRYYQIVRCFRDEDLRADRQPEFT'
               'QLDIETSFLEEEDILQIMEPMIRGIFKEHLGVELANPFPRMTYREAMRRYASDKPDLR'
               'IPLELVDIDDLVKNSGFKVFASVAAQDNGRVVALKIPGGAKLTRKEIDDYTAYVARYG'
               'AKGLAYIKVNDATNVEGLQSPIVKFLTTEGGAEGAIALDIIKRVDAQNGDLIFFGADK'
               'ASIVNDAIGALRIKVGHDLNMLTCDWAPLWVVDFPMFEYDEKDGRWYSMHHPFTQPKT'
               'ANLDELDTNPGDVLSRAYDMVLNGTEIGGGSIRIHRDDMQQRVFKSLGIGAEEAQEKF'
               'GFLLNALKYGCPPHGGIAFGLDRLIMLMAGAKSIRDVMAFPKTQTAWCPLTDAPSEAS'
               'EAQLRELHIRKRQVEKSE')
        self.assertEqual(str(seq), exp)
        break
def count_seq(filename):
    '''Count the number of sequences in the file.

    The file can be gzipped.
    '''
    i = 0  # handle an empty file, where the loop body never runs
    for i, s in enumerate(read(filename, format='fasta'), 1):
        pass
    return i
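# A minimal usage sketch for count_seq above; 'reads.fasta.gz' is a
# hypothetical (optionally gzipped) input path, not from the original.
n_seqs = count_seq('reads.fasta.gz')
print('%d sequences found' % n_seqs)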
def mv_seq(seq, opath, name_dict):
    seq = read(seq, format='fasta')
    with open(opath, 'w') as f1:
        for i in seq:
            pre_name = i.metadata['id']
            i.metadata['id'] = name_dict[pre_name]
            i.metadata['description'] = ''
            write(i, 'fasta', f1)
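# A usage sketch for mv_seq; the paths and the old-name -> new-name mapping
# below are illustrative, and name_dict must cover every ID in the input.
rename_map = {'contig_1': 'sampleA_contig_1'}
mv_seq('in.fasta', 'out.fasta', rename_map)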
def test_compute_gene_score(self):
    seqs = get_data_path('pfam.faa')
    for number, exp in [(1, 0.1), (102, 1)]:
        with NamedTemporaryFile() as faa:
            for i, seq in enumerate(read(seqs, format='fasta')):
                if i == number:
                    break
                write(seq, into=faa, format='fasta')
            faa.flush()
            obs = compute_gene_score(faa.name)
            self.assertEqual(obs, exp)
def loadRefSeqs(seqsDb, taxRef):
    reference_db = []
    for e in read(seqsDb, format='fasta', constructor=DNA):
        if e.has_degenerates():
            # For the purpose of this lesson, we're going to ignore
            # sequences that contain degenerate characters (i.e.,
            # characters other than A, C, G, or T).
            continue
        seq_tax = taxRef[e.metadata['id']]
        e.metadata['taxonomy'] = seq_tax
        reference_db.append(e)
    return reference_db
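# A usage sketch for loadRefSeqs; 'ref.fasta' and the taxonomy mapping below
# are illustrative stand-ins. taxRef maps the FASTA IDs in seqsDb to
# taxonomy strings.
tax_ref = {'seq1': 'd__Bacteria; p__Firmicutes',
           'seq2': 'd__Archaea; p__Nanoarchaeota'}
reference_db = loadRefSeqs('ref.fasta', tax_ref)
print('loaded %d non-degenerate reference sequences' % len(reference_db))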
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file",
                        help="location of biom table or fasta file")
    parser.add_argument("-o", "--output_file",
                        help="location of output biom table or fasta file")
    parser.add_argument(
        "-f", "--pynast_fasta",
        help="location of pynast failures fasta file to be removed")
    args = parser.parse_args()

    ids_to_toss = set()
    if os.stat(args.pynast_fasta).st_size != 0:
        ids_to_toss = {i.id for i in read(args.pynast_fasta, format='fasta')}

    if args.input_file.endswith(".biom"):
        table = load_table(args.input_file)
        set_to_toss = set(table.ids(axis="observation")) & ids_to_toss
        table.filter(set_to_toss, invert=True, axis="observation",
                     inplace=True)
        table.to_json("remove_pynast_failures.py",
                      open(args.output_file, 'w'))
    elif args.input_file.endswith(".fasta") or args.input_file.endswith(".fa"):
        if args.output_file is not None:
            sys.stdout = open(args.output_file, 'w')
        for seq in read(args.input_file, format='fasta'):
            if seq.id not in ids_to_toss:
                print('>%s\n%s' % (seq.id, str(seq)))
        if args.output_file is not None:
            sys.stdout.close()
    else:
        raise ValueError("Input file must be of type .biom, .fasta or .fa")
def parse(self):
    '''Parse the annotation and add it to interval metadata.

    Reads the GFF3 file produced by the MinCED prediction
    (``self.files['gff']``).
    '''
    self.result = {sid: imd for sid, imd in
                   read(self.files['gff'], format='gff3')}
def main():
    parser = argparse.ArgumentParser(
        description='This script will simply re-write FASTA files '
                    'without the description. \nIt will also convert all '
                    'Us to Ts and optionally convert "." to "-". '
                    'That is, this: \n'
                    '\t>seq1 H. Sapiens\n'
                    '\tACCGGUUGGCCGUUCAGGGUACAGGUUGGCCGUUCAGGGUAA\n'
                    'will be output as:\n'
                    '\t>seq1\n'
                    '\tACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA\n'
                    'Expected to be used with SILVA FASTA files.',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i', '--input_fasta', required=True, action='store',
                     help='Input fasta file.')
    req.add_argument('-o', '--output_fasta', required=True, action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-d', '--include_description', action='store_true',
                      help='Boolean. Keep the additional FASTA header '
                           'description text. [Default: False]')
    optp.add_argument('-g', '--convert_to_gap', action='store_true',
                      help='Boolean. Convert "." to "-". [Default: False]')
    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    output_fasta = open(p.output_fasta, 'w')
    convert_to_gap = p.convert_to_gap
    include_description = p.include_description

    parse_seqs(input_fasta, output_fasta, convg=convert_to_gap,
               desc=include_description)
    input_fasta.close()
    output_fasta.close()
def load_fasta_ids(path):
    """Read sequences from a fasta file and extract identifiers.

    Parameters
    ----------
    path : str
        fasta file containing contigs and gene identifiers

    Returns
    -------
    list of str
        fasta identifiers
    """
    fasta_ids = [seq.metadata['id'] for seq in io.read(path, format='fasta')]
    return fasta_ids
def main():
    parser = argparse.ArgumentParser(
        description='This script will simply degap FASTA files.\n'
                    'Optionally it can drop the description and/or convert '
                    'Us to Ts.\n'
                    'That is, this: \n'
                    '\t>seq1 H. Sapiens\n'
                    '\t...ACCGGUU---GGCCGUU CAGGGUACAGGUUGGCCGUUCAGGGUAA...\n'
                    'will be output as:\n'
                    '\t>seq1\n'
                    '\tACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA\n',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i', '--input_fasta', required=True, action='store',
                     help='Input fasta file.')
    req.add_argument('-o', '--output_fasta', required=True, action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-d', '--include_description', action='store_true',
                      help='Boolean. Keep the additional FASTA header '
                           'description text. [Default: False]')
    optp.add_argument('-u', '--convert_to_uracil', action='store_true',
                      help='Boolean. Convert "U" to "T". [Default: False]')
    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    output_fasta = open(p.output_fasta, 'w')
    convert_to_uracil = p.convert_to_uracil
    include_description = p.include_description

    parse_seqs(input_fasta, output_fasta, convu=convert_to_uracil,
               desc=include_description)
    input_fasta.close()
    output_fasta.close()
def align_sequences(seqs):
    import io
    from subprocess import run, PIPE
    from skbio.io import read, write
    from skbio.sequence import Sequence

    fasta = 'rational_designs.fa'
    seqs = (Sequence(x) for x in seqs)
    write(seqs, format='fasta', into=fasta)

    clustalo = 'clustalo', '-i', fasta
    stdout = run(clustalo, stdout=PIPE, encoding='utf8').stdout
    stdout_io = io.StringIO(stdout)
    msa = read(stdout_io, format='fasta')

    return [str(x) for x in msa]
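# A usage sketch for align_sequences, assuming the clustalo binary is on the
# PATH; the input sequences are illustrative. Note the function writes its
# input to 'rational_designs.fa' in the working directory.
aligned = align_sequences(['ACGTACGT', 'ACGTCGT', 'ACGACG'])
for row in aligned:
    print(row)  # aligned (gapped) sequences, all the same length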
def parse_msa_file(infile):
    """Read sequences from a multiple sequence alignment (MSA) file.

    Parameters
    ----------
    infile : str
        file path to input MSA file in A3M format (like FASTA format,
        but lowercase letters will be dropped)

    Returns
    -------
    skbio TabularMSA
    """
    seqs = []
    for seq in io.read(infile, format='fasta'):
        seqs.append(Protein(re.sub('[a-z]', '', str(seq)),
                            metadata=seq.metadata))
    return TabularMSA(seqs)
def runtime_make_test_data(seqs_in, results_dir, sampling_depths):
    '''Repeatedly subsample a fasta sequence file at multiple sampling
    depths to generate query/test data for testing method runtimes.

    seqs_in: path
        fasta format reference sequences.
    results_dir: path
        Output directory.
    sampling_depths: list of integers
        Number of sequences to subsample from seqs.
    '''
    if not exists(results_dir):
        makedirs(results_dir)
    seqs = [seq for seq in io.read(seqs_in, format='fasta')]
    for depth in sampling_depths:
        subset = sample(seqs, depth)
        tmpfile = join(results_dir, str(depth)) + '.fna'
        with open(tmpfile, "w") as output_fasta:
            for s in subset:
                s.write(output_fasta, format='fasta')
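# A usage sketch for runtime_make_test_data; the path and depths are
# illustrative, and each depth must not exceed the number of reference
# sequences (random.sample raises otherwise).
runtime_make_test_data('ref.fna', './runtime_data', [100, 1000, 10000])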
def main():
    parser = argparse.ArgumentParser(
        description='This script will read in a FASTA file and remove '
                    'any sequences that contain homopolymers or ambiguous '
                    'base calls. '
                    'That is, the following sequences would be removed: \n'
                    '\t>seq1-homopolymeric\n'
                    '\tACCGGTTGGCCGTTTTTTTTTCAGGGMACAGGTTVGCCGTTCAGGGTAA\n'
                    '\t>seq2-ambiguous-bases\n'
                    '\tACCGGTTGGCCVTGCCGMMTTCVVAGRGTAY\n',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i', '--input_fasta', required=True, action='store',
                     help='Input fasta file.')
    req.add_argument('-o', '--output_fasta', required=True, action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-p', '--n_homopolymer_length', action='store',
                      type=int, default=8,
                      help='Remove sequences that contain homopolymers of '
                           'greater than or equal to length n. \n'
                           '[Default: %(default)s]')
    optp.add_argument('-a', '--n_ambiguous_bases', action='store',
                      type=int, default=5,
                      help='Remove sequences that contain a number of IUPAC '
                           'ambiguous bases greater than or equal '
                           'to n. \n[Default: %(default)s]')
    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    output_fasta = open(p.output_fasta, 'w')
    n_homopolymer_length = p.n_homopolymer_length
    n_ambiguous_bases = p.n_ambiguous_bases

    filter_seqs(input_fasta, output_fasta,
                n_homopolymer_length=n_homopolymer_length,
                n_ambiguous_bases=n_ambiguous_bases)
    input_fasta.close()
    output_fasta.close()
def extract_sequences(infile, seqidx=0):
    """Extract sequence(s) from a multi-sequence FASTA file.

    Parameters
    ----------
    infile : str
        file path to input multi-sequence FASTA file
    seqidx : int (optional)
        n-th sequence of the input file to extract
        (default: 0 for all sequences)

    Returns
    -------
    list of skbio Sequence
    """
    seqs = []
    iseq = 0  # current sequence index (1-based)
    for seq in io.read(infile, format='fasta'):
        iseq += 1
        if 0 < seqidx != iseq:
            continue
        seqs.append(seq)
    return seqs
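# A usage sketch for extract_sequences above; 'proteins.faa' is a
# hypothetical multi-FASTA file. seqidx is 1-based, and the default of 0
# extracts every record.
all_seqs = extract_sequences('proteins.faa')
second_seq = extract_sequences('proteins.faa', seqidx=2)  # only record 2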
def make_tRNA_table(mature_fa, tRNA_structure, tRNA_fa, tablename,
                    prefix='TR'):
    tRNA_dict = tRNA_seq_dict(mature_fa)
    anticodon_dict = anticodon_pos(tRNA_structure)
    rows = []
    for record in read(tRNA_fa, 'fasta'):
        if record.metadata['id'].startswith(prefix):
            tRNA_id = tRNA_dict[str(record)]
            anticodon = record.metadata['id'].split('-')[1]
            pos, annotated_anticodon, aa = anticodon_dict[tRNA_id].split(',')
            tRNA_length = len(record)
            rows.append((record.metadata['id'], 0, tRNA_length, anticodon,
                         pos, aa, str(record)))
    pd.DataFrame(rows,
                 columns=['tRNA', 'start', 'end', 'anticodon',
                          'anticodon_pos', 'aa', 'seq']) \
        .assign(anticodon_start=lambda d: d.anticodon_pos.str
                .extract('^([0-9]+)-', expand=False).astype(int)) \
        .assign(anticodon_end=lambda d: d.anticodon_pos.str
                .extract('-([0-9]+)$', expand=False).astype(int)) \
        .assign(predicted_anticodon=lambda d: list(
            map(lambda x, y, z: x[(y - 1):z],
                d.seq, d.anticodon_start, d.anticodon_end))) \
        .to_csv(tablename, sep='\t', index=False)
    print('Written %s' % tablename)
def parse(self):
    '''Parse the annotation and add it to interval metadata.'''
    self.result = {sid: imd for sid, imd in
                   read(self.files['gff'], format='gff3')}
def tRNA_seq_dict(mature_fa):
    tRNA_dict = {}
    for r in read(mature_fa, 'fasta'):
        description = r.metadata['description'].split(' ')[2].strip(')')
        tRNA_dict[str(r).replace('U', 'T') + 'CCAA'] = description
    return tRNA_dict
def tax_acc(argv):
    """Computing accuracy of taxonomic classification across all ranks"""
    parser = argparse.ArgumentParser()
    parser.add_argument('lca', type=str,
                        help='aggregated ORF LCA output. See agg-orf command')
    parser.add_argument(
        'taxonomy', type=str,
        help='the preprocessed GTDB taxonomy file. See prep-meta command')
    parser.add_argument('fasta', type=str,
                        help='the input Fasta file with genomic sequences')
    parser.add_argument('-o', '--output', type=str,
                        help='the output file to save results to',
                        default=None)
    args = parser.parse_args(argv)

    logger = get_logger()

    seqs = list()
    accs = list()
    for seq in skio.read(args.fasta, format='fasta'):
        accession, length, seq_name = seq.metadata['id'].split('-')
        seqs.append(seq_name)
        accs.append(accession)
    input_df = pd.DataFrame({'accession': accs, 'seq_name': seqs})

    # accession,domain,phylum,class,order,family,genus,species,gtdb_genome_representative
    logger.info(f'Reading taxonomy from {args.taxonomy}')
    taxdf = pd.read_csv(args.taxonomy, index_col='accession')
    ar122 = (taxdf['domain'] == 'd__Archaea').values
    bac120 = (taxdf['domain'] == 'd__Bacteria').values
    logger.info(f' - found {ar122.sum()} Archaea genomes and '
                f'{bac120.sum()} Bacteria genomes')

    # accession,seq_name,domain,phylum,class,order,family,genus,species
    # GCA_000380905.1,AQYW01000001.1,d__Archaea,p__Nanoarchaeota,c__Nanoarchaeia,o__SCGC-AAA011-G17,f__SCGC-AAA011-G17,g__SCGC-AAA011-G17,s__SCGC-AAA011-G17 sp000402515
    logger.info(f'Reading LCA results from {args.lca}')
    lca_df = pd.read_csv(args.lca)
    lca_df = input_df.set_index('seq_name').join(
        lca_df.set_index('seq_name').drop('accession', axis=1))

    taxdf = taxdf.filter(lca_df['accession'], axis=0)

    results = {'accuracy': list(), 'pclfd': list(),
               'bac_accuracy': list(), 'bac_pclfd': list(),
               'ar_accuracy': list(), 'ar_pclfd': list()}

    ar122 = (taxdf['domain'] == 'd__Archaea').values
    bac120 = (taxdf['domain'] == 'd__Bacteria').values
    logger.info(f' - found {ar122.sum()} Archaea sequences and '
                f'{bac120.sum()} Bacteria sequences')
    ar122_tax = taxdf.index[ar122]
    bac120_tax = taxdf.index[bac120]
    logger.info(f' - found {len(set(ar122_tax))} Archaea genomes and '
                f'{len(set(bac120_tax))} Bacteria genomes')

    def get_results(tdf, ldf, col, sub=None):
        if sub is not None:
            tdf = tdf.iloc[sub]
            ldf = ldf.iloc[sub]
        mask = ldf[col].notna().values
        true = tdf[col][mask].values
        pred = ldf[col][mask].values
        eq = true == pred
        return mask.mean(), eq.mean()

    for col in taxlevels[1:]:
        logger.info(f'computing results for {col}')
        pclfd, acc = get_results(taxdf, lca_df, col)
        results['pclfd'].append(pclfd)
        results['accuracy'].append(acc)
        pclfd, acc = get_results(taxdf, lca_df, col, sub=bac120)
        results['bac_pclfd'].append(pclfd)
        results['bac_accuracy'].append(acc)
        pclfd, acc = get_results(taxdf, lca_df, col, sub=ar122)
        results['ar_pclfd'].append(pclfd)
        results['ar_accuracy'].append(acc)

    df = pd.DataFrame(data=results, index=taxlevels[1:])
    if args.output is not None:
        df.to_csv(args.output)
    print(df)
def write_genes(genes, input_faa_dir, output_fa_dir, output_genes_fp):
    """Write protein sequences of selected gene families to external files.

    Parameters
    ----------
    genes : dict of dict of set of str
        { ogid : { taxon : set(protein(s)) } }
        return value of sample_genes or filter_paralogs
    input_faa_dir : str
        directory of input protein sequences from the query genome and the
        selected taxa for comparison (FASTA format, one taxon per file)
    output_fa_dir : str
        directory to store output protein sequences (FASTA format, one gene
        family per file)
    output_genes_fp : str
        file to store a list of selected gene families and their members.
        format: gene1<tab>taxon1|protein1,taxon2|protein2,...
    """
    # Generate a taxon-to-protein-to-gene-family map. This looks convoluted
    # because it is optimized for the subsequent filesystem I/O.
    prots = {}
    for ogid in genes:
        for taxon in genes[ogid]:
            if taxon not in prots:
                prots[taxon] = {}
            for prot in genes[ogid][taxon]:
                if prot in prots[taxon]:
                    prots[taxon][prot].add(ogid)
                else:
                    prots[taxon][prot] = set([ogid])

    # Match taxa with faa filenames. This is needed because OrthoFinder
    # trims off the version number from an NCBI-style accession
    # (e.g., GCF_012345.1 becomes GCF_012345).
    taxon2file = {}
    for fname in os.listdir(input_faa_dir):
        if fname.endswith('.faa'):
            taxon = fname.split('.')[0]
            if taxon in prots:
                taxon2file[taxon] = fname

    # read protein sequences
    seqs = {}
    for taxon in taxon2file:
        for seq in io.read(os.path.join(input_faa_dir, taxon2file[taxon]),
                           format='fasta'):
            id = seq.metadata['id']
            if id in prots[taxon]:
                seqs[id] = str(seq)

    # write protein sequences per selected gene family
    for ogid in sorted(genes):
        members = []
        with open(os.path.join(output_fa_dir, '%s.fa' % ogid), 'w') as f:
            for taxon in sorted(genes[ogid]):
                # restore the taxon name that OrthoFinder truncated
                trutax = taxon2file[taxon][:-4]
                for prot in sorted(genes[ogid][taxon]):
                    if prot in seqs:
                        # sequence IDs are like: taxon|protein
                        f.write('>%s|%s\n%s\n' % (trutax, prot, seqs[prot]))
                        members.append('%s|%s' % (trutax, prot))
        with open(output_genes_fp, 'a') as f:
            f.write('%s\t%s\n' % (ogid, ','.join(members)))
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
                    format='%(asctime)s - %(message)s')
logger = logging.getLogger()

logger.info('loading data %s' % args.input)
io = get_hdf5io(args.input, 'r')
difile = io.read()
difile.set_raw()

tid = difile.seq_table[args.idx][3][1]
sid = difile.seq_table[args.idx][1]

fofin = open(args.fof, 'r')
for line in map(lambda x: x.strip(), fofin):
    if tid in line:
        fasta_file = line
        break
fofin.close()

print(sid, tid, fasta_file)

for seq in skbio.io.read(fasta_file, constructor=Protein, format='fasta'):
    if sid == seq.metadata['id']:
        # body truncated in the original; stop at the matched record
        break
import argparse
import os
import sys

from skbio import DNA
import skbio.io as skio

desc = ("fasta_path must be a file path from NCBI and have the genome "
        "assembly accession in it")
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('fasta_path', type=str,
                    help='the fasta file to append the prefix to')
args = parser.parse_args()

# the first 15 characters of an NCBI file name hold the assembly accession
args.prefix = os.path.basename(args.fasta_path)[:15]

seqs = list()
tmp_fa = sys.stdout
w = 100  # line width for wrapped FASTA output
for seq in skio.read(args.fasta_path, format='fasta', constructor=DNA):
    seq.metadata['id'] = (args.prefix + "-" + str(len(seq)) + "-" +
                          seq.metadata['id'])
    seqs.append(seq)
    tmp_fa.write('>')
    tmp_fa.write(seq.metadata['id'])
    tmp_fa.write('\n')
    for s in range(0, len(seq), w):
        tmp_fa.write(''.join(seq.values[s:s + w].astype('U')))
        tmp_fa.write('\n')
def mt_tRNA_tab(mature_fa):
    rows = []
    for record in read(mature_fa, 'fasta'):
        rows.append((record.metadata['id'], str(record), 0,
                     len(str(record))))
    return pd.DataFrame(rows, columns=['tRNA', 'seq', 'start', 'end'])
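# A usage sketch for mt_tRNA_tab; 'mt_tRNA.fa' is a hypothetical mature
# mitochondrial tRNA FASTA. The table has one row per record, spanning the
# full sequence (start=0, end=len).
tab = mt_tRNA_tab('mt_tRNA.fa')
print(tab.head())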
def main():
    parser = argparse.ArgumentParser(
        description='Using a minimum sequence length per taxonomic '
                    'group, \nthis script will read in a FASTA file and a '
                    'taxonomy file; \nany sequence that does not fit the '
                    'length criteria for a given \ntaxonomic group will be '
                    'discarded. For example, if the following \ncriteria '
                    'are specified:\n'
                    '\n\t\'{"d__Bacteria":1200, "d__Archaea":900}\'\n'
                    '\nthen any Bacterial sequences shorter than 1200 '
                    '\nbases, and any Archaeal sequences shorter than 900 '
                    '\nbases, will be discarded.',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i', '--input_sequences', required=True,
                     action='store', help='Input fasta file.')
    req.add_argument('-t', '--input_taxonomy', required=True,
                     action='store', help='Input taxonomy file.')
    req.add_argument('-o', '--output_sequences', required=True,
                     action='store', help='Output filtered FASTA file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument(
        '-g', '--taxonomic_groups', action='store',
        default='{"d__Bacteria":1200, "d__Archaea":900, '
                '"d__Eukaryota":1400}',
        help='Dict of taxonomic groups and associated minimum seq '
             '\nlengths; sequences greater than or equal to length n '
             '\nare kept. Tip: set to \'{}\' if you only want to use '
             '\nthe \'global_length_min\' option.'
             "\n[Default: '%(default)s']")
    optp.add_argument(
        '-m', '--global_length_min', action='store', default='1200',
        type=int,
        help='Any taxonomic groups not specified will have their '
             '\nsequences discarded if they do not fit this length '
             '\ncriteria. Set to a large value if you want to remove '
             '\nall unspecified taxonomic groups.'
             '\n[Default: %(default)s]')
    p = parser.parse_args()

    input_sequences = read(p.input_sequences, format='fasta')
    output_sequences = open(p.output_sequences, 'w')
    input_taxonomy = open(p.input_taxonomy, 'r')
    taxonomic_groups = p.taxonomic_groups
    global_length_min = p.global_length_min

    id_taxonomy_dict = make_taxonomy_dict(input_taxonomy)
    taxonomic_groups_dict = make_tax_group_dict(taxonomic_groups)
    filter_seqs_by_len_and_tax(input_sequences, output_sequences,
                               taxonomic_groups_dict, id_taxonomy_dict,
                               global_length_min=global_length_min)
    input_sequences.close()
    output_sequences.close()
        current = t.ids('observation')
        # materialize as a list: len() on a bare map object raises TypeError
        updated = [x.upper() for x in current]
        if len(set(updated)) != len(updated):
            print('************>', a.id, fp, '<**************')
        if set(current) ^ set(updated):
            print('Changing biom: ', a.id, fp)
            t.update_ids({i: i.upper() for i in t.ids('observation')},
                         axis='observation', inplace=True)
            with biom_open(fp, 'w') as f:
                t.to_hdf5(f, t.generated_by)
            checksum = compute_checksum(fp)
    elif fpt == 'preprocessed_fasta':
        changed = False
        tmp = fp + '.tmp'
        with open(tmp, 'w') as out:
            for seq in read(fp, format='fasta'):
                seq = str(seq)
                sequ = seq.upper()
                # these files use the sequence itself as the record header,
                # so both lines get the uppercased value
                out.write('>%s\n%s\n' % (sequ, sequ))
                if seq != sequ:
                    changed = True
        if changed:
            print('Changing fasta: ', a.id, fp)
            rename(tmp, fp)
            checksum = compute_checksum(fp)
        else:
            remove(tmp)
    if checksum is not None:
        TRN.add(sql, [checksum, _id])
        TRN.execute()
def extract_sequences(infile, identifiers=None):
    """Extract sequence(s) from a multi-sequence FASTA file.

    Parameters
    ----------
    infile : str
        file path to input multi-sequence FASTA file
    identifiers : int, str, list, or tuple (optional)
        int: sequence index (n-th sequence in the file)
        str: sequence ID (name) or index (a numeric str is treated as an
            index instead of an ID), comma-separated sequence IDs or
            indexes, file path to a sequence list (one ID or index per
            line), or a sequence index range as "start..end" (both
            included; start must be smaller than or equal to end)
        list of int: sequence indexes
        list of str: sequence IDs or indexes
        tuple of two ints: sequence index range as (start, end)
        if omitted, all sequences will be extracted

    Returns
    -------
    list of skbio Sequence
        extracted protein sequences

    Raises
    ------
    ValueError
        if tuple (index range) is not in (start, end) form
        if index range str is not formatted as "start..end"
        if the data type of identifiers is incorrect
    """
    l, ids, indexes = [], set(), set()
    if identifiers:
        # IDs or indexes as list
        if isinstance(identifiers, list):
            l = identifiers
        # start and end indexes as tuple of int
        elif isinstance(identifiers, tuple):
            if len(identifiers) == 2 \
                    and all(isinstance(n, int) for n in identifiers) \
                    and 0 < identifiers[0] <= identifiers[1]:
                l = list(range(identifiers[0], identifiers[1] + 1))
            else:
                raise ValueError('Error: Index range must be a tuple of '
                                 '(start, end).')
        elif isinstance(identifiers, str):
            # read from a file
            if os.path.isfile(identifiers):
                with open(identifiers, 'r') as f:
                    l = f.read().splitlines()
            # start and end indexes as str
            elif '..' in identifiers:
                l = identifiers.split('..')
                if len(l) == 2 \
                        and all(n.isdigit() for n in l) \
                        and 0 < int(l[0]) <= int(l[1]):
                    l = list(range(int(l[0]), int(l[1]) + 1))
                else:
                    raise ValueError('Error: Index range must be formatted '
                                     'as "start..end".')
            # IDs or indexes as str (single or comma-separated list)
            else:
                l = list(map(str.strip, identifiers.split(',')))
        # index as int
        elif isinstance(identifiers, int):
            l = [identifiers]
        else:
            raise ValueError('Error: Incorrect data type of identifiers.')
        for i in l:
            if isinstance(i, int):
                indexes.add(i)  # index of this protein in the file
            elif i.isdigit():
                indexes.add(int(i))
            else:
                ids.add(i)  # protein ID (name)
    seqs = []
    for i, seq in enumerate(io.read(infile, format='fasta')):
        if ids:
            if seq.metadata['id'] in ids:
                seqs.append(seq)
        elif indexes:
            if i + 1 in indexes:  # indexes start with 1, not 0
                seqs.append(seq)
        else:
            seqs.append(seq)
    return seqs
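# A sketch of the identifier forms accepted by extract_sequences above;
# 'proteins.faa' and the sequence IDs are illustrative only.
extract_sequences('proteins.faa', identifiers=3)         # 3rd record
extract_sequences('proteins.faa', identifiers='P1,P2')   # by sequence ID
extract_sequences('proteins.faa', identifiers='2..5')    # index range str
extract_sequences('proteins.faa', identifiers=(2, 5))    # same, as a tuple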
#!/usr/bin/env python

import re
import sys

import RNA
from skbio import io

if len(sys.argv) != 2:
    sys.exit('[usage] python %s <fasta file>' % sys.argv[0])

fa = sys.argv[1]
for r in io.read(fa, 'fasta'):
    seq = str(r)[20:-20]
    f, e = RNA.fold(seq.strip('N'))
    folded = RNA.b2C(f)
    is_cloverleaf = re.findall('[A-Z]', folded)
    is_tRNA = is_cloverleaf and 'HHH' in ''.join(is_cloverleaf)
    closed_end = folded.startswith('(') and folded.endswith(')')
    cloverleaf = 'cloverleaf' if is_tRNA and closed_end else 'hairpin'
    print(r.metadata['id'], cloverleaf, folded, seq.strip('N'))
def generate_cross_validated_sequences(read_taxa, simulated_reads_fp, index,
                                       iterations, cv_dir):
    '''Generates simulated community files (fasta and taxonomy) as subsets
    of simulated amplicons/taxa for cross-validated taxonomy assignment.

    Selects duplicated taxa names, evenly allocates these among subsets as
    query taxa (test set), generates ref taxa (training set) that do not
    match query fasta IDs, and creates fasta files to match each of these
    sets.

    read_taxa: list or path
        list or file of taxonomies corresponding to simulated_reads_fp
    simulated_reads_fp: path
        simulated amplicon reads (fasta format file)
    index: str
        reference database name
    iterations: int >= 2
        number of subsets to create
    cv_dir: path
        base output directory to contain simulated datasets
    '''
    if iterations < 2:
        raise ValueError('Must perform two or more iterations for '
                         'construction of cross-validated datasets.')

    # Stratify the data and form the CV data sets
    simulated_reads = list(io.read(simulated_reads_fp, format='fasta'))
    taxonomy = Artifact.import_data('FeatureData[Taxonomy]', read_taxa,
                                    view_type='HeaderlessTSVTaxonomyFormat')
    tree = build_tree(taxonomy, simulated_reads)
    strata = get_strata(tree, iterations)
    print(index + ': generating', iterations, 'folds on', len(strata),
          'strata')
    X, y = zip(*[(s, t) for t, ss in strata for s in ss])
    skf = StratifiedKFold(n_splits=iterations, shuffle=True, random_state=0)
    splits = []
    for train, test in skf.split(X, y):
        train_set = {X[i] for i in train}
        test_set = {X[i] for i in test}
        splits.append((train_set, test_set))

    # Output the CV data sets in the expected formats
    taxonomy_series = taxonomy.view(pd.Series)
    for iteration, (train, test) in enumerate(splits):
        db_iter_dir = join(cv_dir, '{0}-iter{1}'.format(index, iteration))
        if not exists(db_iter_dir):
            makedirs(db_iter_dir)
        query_taxa_fp = join(db_iter_dir, 'query_taxa.tsv')
        query_fp = join(db_iter_dir, 'query.fasta')
        ref_fp = join(db_iter_dir, 'ref_seqs.fasta')
        ref_taxa_fp = join(db_iter_dir, 'ref_taxa.tsv')

        # Output the taxa files
        train_series = taxonomy_series[train]
        train_series.to_csv(ref_taxa_fp, sep='\t')

        # If a taxonomy in the test set doesn't exist in the training set,
        # trim it until it does
        train_taxonomies = set()
        for taxonomy in train_series.values:
            taxonomy = taxonomy.split(';')
            for level in range(1, len(taxonomy) + 1):
                train_taxonomies.add(';'.join(taxonomy[:level]))
        test_list = []
        for sid in test:
            taxonomy = taxonomy_series[sid].split(';')
            for level in range(len(taxonomy), 0, -1):
                if ';'.join(taxonomy[:level]) in train_taxonomies:
                    test_list.append('\t'.join(
                        [sid, ';'.join(taxonomy[:level]).strip()]))
                    break
            else:
                raise RuntimeError('unknown kingdom in query set')
        export_list_to_file(test_list, query_taxa_fp)

        # Output the reference files
        with open(ref_fp, 'w') as ref_fasta:
            with open(query_fp, 'w') as query_fasta:
                for seq in simulated_reads:
                    if seq.metadata['id'] in train:
                        seq.write(ref_fasta, format='fasta')
                    else:
                        seq.write(query_fasta, format='fasta')

        # Encode as Artifacts for convenience
        artifact = Artifact.import_data('FeatureData[Sequence]', ref_fp)
        artifact.save(ref_fp[:-5] + 'qza')
        artifact = Artifact.import_data('FeatureData[Sequence]', query_fp)
        artifact.save(query_fp[:-5] + 'qza')
        artifact = Artifact.import_data(
            'FeatureData[Taxonomy]', ref_taxa_fp,
            view_type='HeaderlessTSVTaxonomyFormat')
        artifact.save(ref_taxa_fp[:-3] + 'qza')
from skbio.io import read, write

seqs = read("example.fna", qual="example.qual", format="fasta")
write(seqs, into="example.fastq", variant="illumina1.8", format="fastq")
def summarize_blast6(filename):
    df = read(filename, format="blast+6", into=pd.DataFrame,
              default_columns=True)
    df_best = filter_best(df)
    # assumed completion: the original snippet ended without a return
    return df_best
def loadTree(tree):
    with open(tree, 'r') as f:
        tree = read(f, format="newick", into=TreeNode)
    return tree
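# A minimal sketch of loadTree, assuming 'tree.nwk' is a hypothetical Newick
# file; requires `from skbio import TreeNode` and `from skbio.io import
# read`, as the function above does.
tree = loadTree('tree.nwk')
print(tree.ascii_art())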