def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Yield a throwaway reference package built from aligned sequences.

    A tree is inferred with FastTree, a new reference package called
    ``name`` is created inside a temporary directory, and the tree, the
    FastTree log (as the phylogenetic model) and FASTA / Stockholm copies of
    the alignment are registered in it. The package is yielded while the
    temporary-file and temporary-directory contexts are still open.

    :param sequences: iterable of aligned sequence records; it is
        materialised into a list because it is consumed three times
    :param name: name of the reference package inside the temporary directory
    :param threads: forwarded to ``fasttree``
    """
    records = list(sequences)

    # (resource key in the refpkg, SeqIO format name, temp-file suffix);
    # order matters: FASTA is written and registered before Stockholm.
    alignment_outputs = (
        ('aln_fasta', 'fasta', '.fasta'),
        ('aln_sto', 'stockholm', '.sto'),
    )

    with ntf(prefix='fasttree-', suffix='.log') as log_handle, \
            ntf(prefix='fasttree-', suffix='.tre') as tree_handle, \
            tempdir(prefix='refpkg') as refpkg_dir:
        # FastTree is given the log by path, so our own handle is closed
        # before it runs.
        log_path = log_handle.name
        log_handle.close()

        fasttree(records, log_path=log_path, output_fp=tree_handle,
                 gtr=True, threads=threads)
        tree_path = tree_handle.name
        tree_handle.close()

        rp = Refpkg(refpkg_dir(name), create=True)
        rp.update_metadata('locus', '')
        rp.update_phylo_model('FastTree', log_path)
        rp.update_file('tree', tree_path)

        # FASTA and Stockholm alignment
        for resource_key, seq_format, suffix in alignment_outputs:
            with ntf(suffix=suffix) as aln_handle:
                SeqIO.write(records, aln_handle, seq_format)
                aln_handle.close()
                rp.update_file(resource_key, aln_handle.name)

        logging.debug("Reference package written to %s", rp.path)
        yield rp
def build_index_refpkg(hrefpkg_paths, sequence_file, seqinfo, taxonomy,
                       dest='index.refpkg', **meta):
    """
    Build an index.refpkg from a set of hrefpkgs

    The taxonomy is deep-copied and its per-node ``sequence_ids`` are reset,
    then repopulated from the ``seq_info`` resource of every hrefpkg. Lineages
    without sequences are pruned, and the remaining sequence ids determine
    which sequences and which ``seqinfo`` rows go into the index package.

    :param hrefpkg_paths: iterable of paths to existing reference packages;
        each must provide a ``seq_info`` resource
    :param sequence_file: passed to ``wrap.esl_sfetch`` together with the
        selected sequence ids (presumably the file the sequences are fetched
        from -- confirm against ``wrap.esl_sfetch``)
    :param seqinfo: non-empty, indexable collection of dict-like rows with a
        ``seqname`` key; the keys of the first row become the CSV header
    :param taxonomy: taxonomy object; only a deep copy is modified
    :param dest: path at which the new reference package is created
    :param meta: extra metadata written to the package, one
        ``update_metadata`` call per key
    :returns: tuple ``(rp, sequence_ids)`` of the new ``Refpkg`` and the
        frozenset of sequence ids it contains
    :raises IndexError: if ``seqinfo`` is empty (raised by ``seqinfo[0]``)
    """
    # Clear taxonomy (on a private copy, so the caller's object is untouched)
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo: open one hrefpkg at a time and close its resource as soon
    # as it has been read.
    for path in hrefpkg_paths:
        f = Refpkg(path, create=False).open_resource('seq_info')
        with f:
            taxonomy.populate_from_seqinfo(f)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
            util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
            util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file: keep only rows whose sequence survived pruning
        rows = (row for row in seqinfo if row['seqname'] in sequence_ids)
        writer = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                                lineterminator='\n',
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        writer.writerows(rows)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        # The package is assembled while the temporary files are still in
        # scope, since update_file is given their paths.
        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids