def test_sequence_to_fastq_kwargs_passed(self):
    for constructor in [Sequence, DNA, RNA, Protein]:
        for components, kwargs_expected_fp in self.valid_files:
            for expected_kwargs, expected_fp in kwargs_expected_fp:
                observed_kwargs = {}
                # TODO: some of the test files contain characters which
                # are invalid for RNA, so don't validate for now. Need
                # to fix this.
                if constructor is RNA:
                    observed_kwargs['validate'] = False
                expected_kwargs['lowercase'] = 'introns'
                observed_kwargs['lowercase'] = 'introns'

                fh = io.StringIO()
                for c in components:
                    obj = constructor(
                        c[2],
                        metadata={'id': c[0], 'description': c[1]},
                        positional_metadata={'quality': c[3]},
                        **observed_kwargs)
                    write(obj, into=fh, format='fastq', **expected_kwargs)
                observed = fh.getvalue()
                fh.close()

                with io.open(expected_fp) as f:
                    expected = f.read()

                self.assertEqual(observed, expected)
def _filter_sequence_ids(in_fp, out_fp, ids, negate=False):
    '''Filter away the sequences with the specified IDs.

    If `negate` is True, keep only the sequences with the specified IDs.
    '''
    with open(out_fp, 'w') as out:
        for seq in read(in_fp, format='fasta', constructor=Sequence):
            seq_id = seq.metadata['id']
            # keep the seq unless its ID is listed; `negate` flips the test
            if (seq_id in ids) == negate:
                write(seq, format='fasta', into=out)
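# A usage sketch for _filter_sequence_ids, with hypothetical file names.
# It assumes scikit-bio's top-level read/write, as the snippet above does.
from skbio import Sequence, read, write

# drop two contigs from a hypothetical assembly file
_filter_sequence_ids('assembly.fna', 'filtered.fna', {'contig1', 'contig7'})
# keep *only* those two contigs instead
_filter_sequence_ids('assembly.fna', 'subset.fna', {'contig1', 'contig7'},
                     negate=True)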
def create_faa(seqs, out, genetic_code=11):
    '''Create a protein sequence file.

    It creates protein sequences based on the interval features
    with type of "CDS".

    Parameters
    ----------
    seqs : iterable of ``Sequence``
        The list of DNA/RNA sequences
    out : file object
        File object for output
    genetic_code : int
        The fallback genetic code to use
    '''
    for seq in seqs:
        for cds in seq.interval_metadata.query(metadata={'type': 'CDS'}):
            fna = DNA.concat([seq[start:end] for start, end in cds.bounds])
            if cds.metadata.get('strand', '.') == '-':
                fna = fna.reverse_complement()
            try:
                # if the translation table is not available in the metadata,
                # fall back to what is specified in the function parameter
                faa = fna.translate(
                    cds.metadata.get('transl_table', genetic_code))
                faa.metadata['description'] = cds.metadata.get('product', '')
                # CDS metadata must have the key 'ID'
                faa.metadata['id'] = cds.metadata['ID']
                write(faa, into=out, format='fasta')
            except NotImplementedError:
                logger.warning(
                    'This gene has degenerate nucleotides and will not '
                    'be translated.')
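# A usage sketch for create_faa with hypothetical file names. It assumes
# each sequence already carries interval features of type 'CDS' whose
# metadata includes an 'ID' key (e.g. merged in from a Prodigal GFF3 run).
from skbio import DNA, read

seqs = list(read('genome.fna', format='fasta', constructor=DNA))
# ... attach CDS interval metadata to each seq here ...
with open('proteins.faa', 'w') as out:
    create_faa(seqs, out, genetic_code=11)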
def test_annotate(self):
    config = {
        'structural_annotation': {
            'minced': {
                'params': '',
                'priority': 50,
                'output': 'minced',
                'threads': 1},
            'prodigal': {
                'params': '-p meta -f gff',
                'priority': 90,
                'output': 'prodigal',
                'threads': 1}},
        'protein': {},
        'bacteria': {},
        'general': {
            'metadata': 'foo.sqlite'}}
    config_fp = join(self.tmpd, 'config.yaml')
    with open(config_fp, 'w') as f:
        yaml.dump(config, f, default_flow_style=True)
    write(DNA('ATGC', {'id': 'seq1'}), into=self.i, format='fasta')
    annotate(self.i, 'fasta', 1, self.tmpd, 'gff3', 11, 'bacteria',
             'metagenome', (), 1, True, False, True, config_fp)
    output = join(self.tmpd, splitext(self.i)[0])
    self.assertTrue(exists(output + '.fna'))
    self.assertTrue(exists(output + '.gff3'))
def scan_seq(seq, db, cpu=1, params=None):
    if params is None:
        params = {}
    params['--cpu'] = cpu
    app = CMScan(InputHandler='_input_as_paths', params=params)
    with NamedTemporaryFile(mode='w+') as i:
        write(seq, into=i.name, format='fasta')
        return app([db, i.name])
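# A hypothetical call to scan_seq above, assuming Infernal's cmscan is on
# the PATH and 'Rfam.cm' is a pre-pressed covariance-model database path.
from skbio import DNA

seq = DNA('ATGCATGCATGC', metadata={'id': 'contig1'})
result = scan_seq(seq, 'Rfam.cm', cpu=4)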
def test_filter_partial_genes(self):
    in_fp = join(self.tmpd, 'in.gff')
    out_fp = join(self.tmpd, 'out.gff')
    imd1 = IntervalMetadata(None)
    imd1.add([(0, 100)],
             metadata={'partial': '01', 'phase': 0,
                       'source': 'Prodigal_v2.6.3', 'strand': '.',
                       'type': '.', 'score': '.'})
    imd2 = IntervalMetadata(None)
    imd2.add([(200, 300)],
             metadata={'partial': '10', 'phase': 1,
                       'source': 'Prodigal_v2.6.3', 'strand': '-',
                       'type': 'CDS', 'score': '1'})
    imd2.add([(2000, 3000)],
             metadata={'partial': '00', 'phase': 1,
                       'source': 'Prodigal_v2.6.3', 'strand': '.',
                       'type': '.', 'score': '.'})
    imd3 = IntervalMetadata(None)
    imd3.add([(2000, 3000)],
             metadata={'partial': '00', 'phase': 1,
                       'source': 'Prodigal_v2.6.3', 'strand': '.',
                       'type': '.', 'score': '.'})
    data = (('seq1', imd1), ('seq2', imd2))
    write(((sid, imd) for sid, imd in data), into=in_fp, format='gff3')
    filter_partial_genes(in_fp, out_fp)
    obs = read(out_fp, format='gff3')
    for i, j in zip(obs, [('seq2', imd3)]):
        self.assertEqual(i, j)
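# The test above implies the contract of filter_partial_genes: drop
# intervals whose Prodigal 'partial' flag is not '00' (i.e. genes
# truncated at either end) and drop sequences left with no intervals.
# A minimal sketch consistent with that contract -- an inference, not
# the actual implementation:
from skbio import read, write


def filter_partial_genes(in_fp, out_fp):
    def gen():
        for seq_id, imd in read(in_fp, format='gff3'):
            # collect and drop the intervals flagged as partial
            partial = [itv for itv in imd.query()
                       if itv.metadata.get('partial', '00') != '00']
            imd.drop(partial)
            # keep the sequence only if any interval survived
            if list(imd.query()):
                yield seq_id, imd
    write(gen(), into=out_fp, format='gff3')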
def filter_alignment_positions(aligned_sequences_file: AlignedDNAFASTAFormat,
                               maximum_gap_frequency: str,
                               maximum_position_entropy: str
                               ) -> AlignedDNAFASTAFormat:
    aligned_sequences_fh = aligned_sequences_file.open()
    fasta_file = AlignedDNAFASTAFormat()
    skbio.write(filter_positions(aligned_sequences_fh,
                                 maximum_gap_frequency,
                                 maximum_position_entropy),
                into=str(fasta_file), format='fasta')
    return fasta_file
def convert(in_f, in_fmt, out_f, out_fmt):
    '''Convert between file formats.

    Parameters
    ----------
    in_f : str
        input file path
    in_fmt : str
        input file format
    out_f : str
        output file path
    out_fmt : str
        output file format
    '''
    for obj in read(in_f, format=in_fmt):
        write(obj, format=out_fmt, into=out_f)
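# A usage sketch for convert with hypothetical paths; any format pair the
# scikit-bio I/O registry supports for the object type should work.
convert('genome.gbk', 'genbank', 'genome.fna', 'fasta')
# Note: formats whose readers need extra arguments (e.g. fastq's
# 'variant') would require extending convert to forward kwargs.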
def extract_fungi(
        aligned_silva_file: AlignedDNAFASTAFormat,
        accession_file: SilvaAccessionFormat,
        taxonomy_file: SilvaTaxonomyFormat) -> AlignedDNAFASTAFormat:
    aligned_silva_fh = aligned_silva_file.open()
    accession_fh = accession_file.open()
    taxonomy_fh = taxonomy_file.open()
    fasta_file = AlignedDNAFASTAFormat()
    skbio.write(fungi_from_fasta(aligned_silva_fh, accession_fh,
                                 taxonomy_fh),
                into=str(fasta_file), format='fasta')
    # TODO: this code is a good example of a pithy return for plugins.
    # TODO: redo other functions in the same way, by instantiating a
    # format object and returning it.
    return fasta_file
def test_sequence_to_fastq_kwargs_passed(self):
    for constructor in [BiologicalSequence, NucleotideSequence,
                        DNASequence, RNASequence, ProteinSequence]:
        for components, kwargs_expected_fp in self.valid_files:
            for kwargs, expected_fp in kwargs_expected_fp:
                fh = StringIO()
                for c in components:
                    obj = constructor(c[2], id=c[0], description=c[1],
                                      quality=c[3])
                    write(obj, into=fh, format='fastq', **kwargs)
                observed = fh.getvalue()
                fh.close()

                with open(expected_fp, 'U') as f:
                    expected = f.read()

                self.assertEqual(observed, expected)
def setUp(self):
    self.test_dir = abspath(
        join('micronota', 'db', 'tests', 'data', 'uniref', 'uniref100'))
    files = ['Swiss-Prot_Archaea.fna',
             'Swiss-Prot_Bacteria.fna',
             'Swiss-Prot_Eukaryota.fna',
             'Swiss-Prot_Viruses.fna',
             'TrEMBL_Archaea.fna',
             'TrEMBL_Bacteria.fna',
             'TrEMBL_Eukaryota.fna',
             'TrEMBL_Viruses.fna']
    files = [join(self.test_dir, f) for f in files]
    self.tmp = mkdtemp()
    self.test1 = join(self.tmp, 'test1.fna')
    self.test1_exp = 'test1.genbank'
    with open(self.test1, 'w') as f:
        for seq in read(files[1], format='fasta'):
            write(seq, format='fasta', into=f)
    self.obs_tmp = mkdtemp()
def test_sequence_to_fastq_kwargs_passed(self):
    for constructor in [Sequence,
                        partial(DNA, validate=False),
                        partial(RNA, validate=False),
                        partial(Protein, validate=False)]:
        for components, kwargs_expected_fp in self.valid_files:
            for kwargs, expected_fp in kwargs_expected_fp:
                fh = StringIO()
                for c in components:
                    obj = constructor(
                        c[2],
                        metadata={'id': c[0], 'description': c[1]},
                        positional_metadata={'quality': c[3]})
                    write(obj, into=fh, format='fastq', **kwargs)
                observed = fh.getvalue()
                fh.close()

                with open(expected_fp, 'U') as f:
                    expected = f.read()

                self.assertEqual(observed, expected)
def test_filter_sequence_ids(self):
    seqs = [Sequence('A', {'id': 'seq1', 'description': ''}),
            Sequence('T', {'id': 'seq2', 'description': ''})]
    ifile = join(self.tmpd, 'in.fna')
    write((i for i in seqs), into=ifile, format='fasta')
    ofile = join(self.tmpd, 'out.fna')
    idss = [{'foo'}, {'seq1'}, {'seq2'}, {'seq1', 'seq2'}]
    exps = [seqs, seqs[1:], seqs[:-1], []]
    for ids, exp in zip(idss, exps):
        _filter_sequence_ids(ifile, ofile, ids)
        obs = list(read(ofile, constructor=Sequence, format='fasta'))
        self.assertEqual(obs, exp)
def pick_otus(file_path):
    outdir = os.path.join(os.path.dirname(file_path), 'uclust')
    if False:
        # Making fasta format compatible with qiime (for some reason not
        # working; assume the user provides it).
        import skbio  # using scikit-bio for fasta I/O (comes with qiime)
        from skbio.sequence import BiologicalSequence
        print "Preprocessing FASTA " + file_path
        file_path1 = '%s_1%s' % tuple(os.path.splitext(file_path))
        outfile = open(file_path1, "w")  # the reformatted fasta
        fastafile = skbio.read(file_path, format='fasta')
        print "Reading " + file_path
        print "File handle: " + str(fastafile)
        for seqcount, rec in enumerate(fastafile):
            print seqcount, repr(rec)
            try:
                # if the sequence adheres to qiime's expected format
                # <sample_id>_<seq_counter>, write the record as is
                int(rec.id.split('_')[1])
                skbio.write(rec, 'fasta', outfile)
            except (ValueError, IndexError):
                # else: enforce an id format compatible with qiime's
                # otu picker
                rec1 = BiologicalSequence(rec.sequence,
                                          "User_%05d" % seqcount)
                skbio.write(rec1, 'fasta', outfile)
        outfile.close()
        file_path = file_path1
def convert(in_fmt, out_fmt, in_f, out_f):
    '''Convert between file formats.'''
    for obj in read(in_f, format=in_fmt):
        write(obj, format=out_fmt, into=out_f)
def integrate(seq_fp, annot_dir, protein_xref, out_fp,
              quality=False, out_fmt='gff3'):
    '''Integrate all the annotations and write to disk.

    Parameters
    ----------
    seq_fp : str
        input seq file path.
    annot_dir : str
        annotation output directory.
    protein_xref : str
        protein cross-reference (metadata) file path.
    out_fp : str
        output file path.
    out_fmt : str
        output format.

    Returns
    -------
    dict
        keys are seq IDs (str) and values are ``Sequence`` objects.
    '''
    logger.info('Integrate annotation for output')
    seqs = {}
    for seq in read(seq_fp, format='fasta'):
        seqs[seq.metadata['id']] = seq

    rules = {splitext(f)[0] for f in os.listdir(annot_dir)
             if f.endswith('.ok')}
    if 'diamond' in rules:
        rules.discard('diamond')
        mod = import_module('.diamond', module.__name__)
        diamond = mod.Module(directory=annot_dir)
        diamond.parse(metadata=protein_xref)
        protein = diamond.result
    else:
        protein = {}

    for rule in rules:
        logger.debug('parse the result from %s output' % rule)
        mod = import_module('.%s' % rule, module.__name__)
        obj = mod.Module(directory=annot_dir)
        obj.parse()
        for seq_id, imd in obj.result.items():
            seq = seqs[seq_id]
            imd._upper_bound = len(seq)
            if rule == 'prodigal':
                cds_metadata = protein.get(seq_id, {})
                _add_cds_metadata(seq_id, imd, cds_metadata)
            seq.interval_metadata.merge(imd)

    # write out the annotation
    if out_fmt == 'genbank':
        with open(out_fp, 'w') as out:
            for sid, seq in seqs.items():
                seq.metadata['LOCUS'] = {
                    'locus_name': sid,
                    'size': len(seq),
                    'unit': 'bp',
                    'mol_type': 'DNA',
                    'shape': 'linear',
                    'division': None,
                    'date': strftime("%d-%b-%Y", gmtime())}
                seq.metadata['ACCESSION'] = ''
                seq.metadata['VERSION'] = ''
                seq.metadata['KEYWORDS'] = '.'
                seq.metadata['SOURCE'] = {
                    'ORGANISM': 'genus species', 'taxonomy': 'unknown'}
                seq.metadata['COMMENT'] = 'Annotated with %s %s' % (
                    __package__, __version__)
                write(seq, into=out, format=out_fmt)
    elif out_fmt == 'gff3':
        write(((sid, seq.interval_metadata) for sid, seq in seqs.items()),
              into=out_fp, format=out_fmt)
    else:
        raise ValueError('Unknown specified output format: %r' % out_fmt)
    return seqs
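# A hypothetical invocation of integrate, assuming the annotation tools
# have already written their outputs plus '<tool>.ok' sentinel files
# (e.g. 'prodigal.ok', 'aragorn.ok', optionally 'diamond.ok') into
# annot_dir; all paths below are made up.
seqs = integrate('contigs.fna', 'annot_dir', 'uniprot.sqlite',
                 'contigs.gff3', out_fmt='gff3')
print('%d sequences annotated' % len(seqs))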
def extract_reads(sequences: DNASequencesDirectoryFormat, f_primer: str,
                  r_primer: str, trunc_len: int = 0, trim_left: int = 0,
                  identity: float = 0.8, min_length: int = 50,
                  max_length: int = 0, n_jobs: int = 1,
                  batch_size: int = 'auto') -> DNAFASTAFormat:
    """Extract the reads selected by a primer or primer pair.

    Only sequences which match the primers at greater than the specified
    identity are returned.

    Parameters
    ----------
    sequences : DNASequencesDirectoryFormat
        An aligned list of skbio.sequence.DNA query sequences
    f_primer : skbio.sequence.DNA
        Forward primer sequence
    r_primer : skbio.sequence.DNA
        Reverse primer sequence
    trunc_len : int, optional
        Read is cut to trunc_len if trunc_len is positive.
        Applied before trim_left.
    trim_left : int, optional
        `trim_left` nucleotides are removed from the 5' end if trim_left
        is positive. Applied after trunc_len.
    identity : float, optional
        Minimum combined primer match identity threshold. Default: 0.8
    min_length : int, optional
        Minimum amplicon length. Shorter amplicons are discarded.
        Default: 50
    max_length : int, optional
        Maximum amplicon length. Longer amplicons are discarded.
    n_jobs : int, optional
        Number of separate processes to break the task into.
    batch_size : int, optional
        Number of samples to be processed in one batch.

    Returns
    -------
    q2_types.DNAFASTAFormat
        containing the reads
    """
    if min_length > trunc_len - trim_left and trunc_len > 0:
        raise ValueError('The minimum length setting is greater than the '
                         'length of the truncated sequences. This will '
                         'cause all sequences to be removed from the '
                         'dataset. To proceed, set a '
                         'min_length ≤ trunc_len - trim_left.')
    n_jobs = effective_n_jobs(n_jobs)
    if batch_size == 'auto':
        batch_size = _autotune_reads_per_batch(
            sequences.file.view(DNAFASTAFormat), n_jobs)
    sequences = sequences.file.view(DNAIterator)
    ff = DNAFASTAFormat()
    with open(str(ff), 'a') as fh:
        with Parallel(n_jobs) as parallel:
            for chunk in _chunks(sequences, batch_size):
                amplicons = parallel(
                    delayed(_gen_reads)(sequence, f_primer, r_primer,
                                        trunc_len, trim_left, identity,
                                        min_length, max_length)
                    for sequence in chunk)
                for amplicon in amplicons:
                    if amplicon is not None:
                        skbio.write(amplicon, format='fasta', into=fh)
    if os.stat(str(ff)).st_size == 0:
        raise RuntimeError("No matches found")
    return ff
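# _chunks is not shown in this snippet; a minimal batching generator
# consistent with how it is used above (an assumption, not the actual
# q2-feature-classifier helper):
from itertools import islice


def _chunks(iterable, size):
    '''Yield successive lists of at most `size` items from `iterable`.'''
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk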
def scan_seq(seq, db, cpu=1, **kwargs):
    cmscan = Dumpling('cmscan', params=Parameters(*_params))
    with NamedTemporaryFile(mode='w+') as i:
        write(seq, into=i.name, format='fasta')
        cmscan.update(query=i.name, db=db, **kwargs)
        return cmscan()
def annotate(in_fp, in_fmt, min_len, out_dir, out_fmt, gcode, kingdom,
             mode, task, cpus, force, dry_run, quality, config):
    '''Annotate the sequences in the input file.

    Parameters
    ----------
    in_fp : str
        Input seq file name
    in_fmt : str
        Input file format
    min_len : int
        The threshold of seq length to be filtered away
    out_dir : str
        Output file directory.
    out_fmt : str
        Output file format
    gcode : int
        The translation table to use for protein-coding genes
    kingdom : str
        The kingdom where the sequences are from
    mode : str
        Run mode ('finished', 'draft' or 'metagenome')
    task : tuple
        The annotation tasks to run
    cpus : int
        Number of cpus to use.
    force : bool
        Force to overwrite.
    dry_run : bool
    quality : bool
        Compute quality scores of the annotation?
    config : str
        config file for snakemake
    '''
    logger.debug('working dir: %s' % out_dir)
    if force:
        logger.debug('run in force mode - will overwrite existing files.')
    if dry_run:
        logger.debug('run in dry mode - will not produce output.')

    # prepare the file paths
    os.makedirs(out_dir, exist_ok=True)
    prefix, suffix = splitext(basename(in_fp))
    if suffix in {'.gz', '.bz2'}:
        prefix = splitext(prefix)[0]
    out_prefix = join(out_dir, prefix)
    seq_fp = abspath(out_prefix + '.fna')

    # validate and filter the input seq file
    if exists(seq_fp):
        # do not overwrite because all the snakemake steps will be rerun
        # when this file is updated.
        logger.debug('the filtered sequence file already exists. '
                     'skip validating step.')
    else:
        with open(seq_fp, 'w') as out:
            for seq in check_seq(in_fp, in_fmt, lambda s: len(s) < min_len):
                write(seq, format='fasta', into=out)

    # prepare snakemake workflow
    snakefile = resource_filename(__package__, 'Snakefile')
    if config is None:
        config = resource_filename(__package__, kingdom + '.yaml')
    logger.debug('set annotation in %s mode.' % mode)
    logger.debug('set annotation as %s.' % kingdom)
    logger.debug('use config file: %s.' % config)
    with open(config) as fh:
        cfg = yaml.safe_load(fh)
    general = cfg.pop('general', {})
    rules = {}
    if not task:
        task = [i for i in cfg]
    for k, v in cfg.items():
        # specify the annotation task
        if k in task:
            if v is not None:
                for vk, vv in v.items():
                    if vk in rules:
                        raise ValueError(
                            'You have multiple configs for rule %s' % vk)
                    rules[vk] = vv

    # update the parameters of relevant tools with options from cmd line
    if 'prodigal' in rules:
        param = '%s -g %d' % (rules['prodigal']['params'], gcode)
        if mode == 'finished':
            param = '-p single -c ' + param
        elif mode == 'draft':
            param = '-p single ' + param
        elif mode == 'metagenome':
            param = '-p meta ' + param
        rules['prodigal']['params'] = param
    if 'aragorn' in rules:
        rules['aragorn']['params'] = '%s -gc%d' % (
            rules['aragorn']['params'], gcode)
    if 'rnammer' in rules:
        rules['rnammer']['params'] = '-S %s %s' % (
            kingdom[:3], rules['rnammer']['params'])

    # only run the targets specified in the yaml file
    targets = list(rules.keys())
    if not targets:
        logger.warning('No annotation task to run')
        return
    rules['seq'] = seq_fp

    cfg_file = join(out_dir, 'snakemake.yaml')
    with open(cfg_file, 'w') as out:
        yaml.dump(rules, out, default_flow_style=False)

    logger.debug('run snakemake workflow')
    success = snakemake(
        snakefile,
        cores=cpus,
        targets=targets,
        # set work dir to output dir so simultaneous runs
        # don't interfere with each other.
        workdir=out_prefix,
        printshellcmds=True,
        dryrun=dry_run,
        forceall=force,
        # config=cfg,
        configfile=cfg_file,
        keep_target_files=True,
        # provide this dummy to suppress unnecessary log
        log_handler=lambda s: None,
        quiet=True,  # do not print job info
        keep_logger=False)

    if success:
        # snakemake finished successfully
        out_fp = '%s.%s' % (out_prefix, out_fmt)
        protein_xref = general.get('protein_xref')
        if protein_xref is not None:
            protein_xref = expanduser(protein_xref)
        seqs = integrate(seq_fp, out_prefix, protein_xref, out_fp,
                         out_fmt=out_fmt)
        logger.info('Write summary of the annotation')
        with open(out_prefix + '.summary.txt', 'w') as out:
            summarize(seqs.values(), out)
        if mode != 'metagenome' and quality is True:
            with open(out_prefix + '.quality.txt', 'w') as out:
                # only a finished genome is scored as complete sequences
                contigs = mode != 'finished'
                seq_score = compute_seq_score(seqs.values(), contigs)
                trna_score = rrna_score = gene_score = np.nan
                if 'tRNA' in task:
                    trna_score = compute_trna_score(
                        (i.interval_metadata for i in seqs.values()))
                if 'rRNA' in task:
                    rrna_score = compute_rrna_score(
                        (i.interval_metadata for i in seqs.values()))
                if 'CDS' in task:
                    # faa_fp is not defined in this snippet; assume the
                    # protein file sits next to the other outputs
                    faa_fp = out_prefix + '.faa'
                    gene_score = compute_gene_score(faa_fp)
                out.write('# seq_score: %.2f tRNA_score: %.2f '
                          'rRNA_score: %.2f gene_score: %.2f\n' % (
                              seq_score, trna_score, rrna_score,
                              gene_score))
    else:
        logger.error('The snakemake run failed.')
    logger.info('Done with annotation')
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution, but high
    sequence variability causes poor multiple sequence alignments (these
    are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence they are used as the "foundation"),
    but poor taxonomic resolution. This script combines two genetic
    databases into one phylogenetic tree in .nwk format, taking advantage
    of the benefits of both databases, but allowing sequencing to be
    performed using the "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows
        containing accession numbers only. The format can be 1) the
        accession number is in the first and only column, or 2) each
        tab-delimited row contains further accession numbers that are
        part of that OTU cluster (as in the output of "ghost-tree
        group-extensions"). This file refers to the "extension trees".
        File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to the "extension trees", with
        the 1st column being an accession number (the same accession
        numbers as in otu_file_fh) and the 2nd column being the taxonomy
        ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker
        database in .fasta format. This file refers to the "foundation"
        of the ghost-tree. Contains accession numbers and taxonomy
        labels.
    ghost_tree_fp : filehandle
        The Newick formatted ghost-tree is the final output of the
        ghost-tree tool. This is a phylogenetic tree designed for
        downstream diversity analyses.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}
    ghost_tree_output = str(ghost_tree_fp)
    ghost_tree_output = ghost_tree_output[16:-4]

    process = subprocess.Popen("muscle", shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print "muscle, multiple sequence aligner, is not found. Is it" \
              " installed? Is it in your path?"
    process = subprocess.Popen("fasttree", shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print "fasttree, phylogenetic tree builder, is not found. Is it" \
              " installed? Is it in your path?"

    os.mkdir("tmp")
    logfile = open("ghost-tree_log_" + ghost_tree_output + ".txt", "w")
    extension_genus_accession_list_dic = \
        _extension_genus_accession_dic(otu_file_fh, extension_taxonomy_fh)
    skbio.write(_make_nr_foundation_alignment(
        foundation_alignment_fh, extension_genus_accession_list_dic),
        into="nr_foundation_alignment_gt.fasta", format="fasta")
    foundation_tree = _make_foundation_tree(
        "nr_foundation_alignment_gt.fasta", logfile)
    seqs = SequenceCollection.read(extension_seq_fh)
    for node in foundation_tree.tips():
        key_node, _ = str(node).split(":")
        key_node = foundation_accession_genus_dic[key_node]
        try:
            _make_mini_otu_files(key_node,
                                 extension_genus_accession_list_dic, seqs)
            process = subprocess.Popen(
                "muscle -in tmp/mini_seq_gt.fasta"
                " -out tmp/mini_alignment_gt.fasta -quiet"
                " -maxiters 2 -diags1",
                shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            output, error = process.communicate()
            process = subprocess.Popen(
                "fasttree -nt -quiet tmp/mini_alignment_gt.fasta >"
                " tmp/mini_tree_gt.nwk",
                shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            output, error = process.communicate()
            logfile.write("FastTree warnings for genus " + key_node +
                          " are:\n" + error + "\n")
            mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                             into=TreeNode)
            node.extend(mini_tree.root_at_midpoint().children[:])
        except Exception:
            continue
    shutil.rmtree("tmp")
    ghost_tree_fp.write(str(foundation_tree))
    logfile.close()
    return str(foundation_tree).strip()
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution, but high
    sequence variability causes poor multiple sequence alignments (these
    are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence they are used as the "foundation"),
    but poor taxonomic resolution. This script combines two genetic
    databases into one phylogenetic tree in .nwk format, taking advantage
    of the benefits of both databases, but allowing sequencing to be
    performed using the "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows
        containing accession numbers only. The format can be 1) the
        accession number is in the first and only column, or 2) each
        tab-delimited row contains further accession numbers that are
        part of that OTU cluster (as in the output of "ghost-tree
        group-extensions"). This file refers to the "extension trees".
        File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to the "extension trees", with
        the 1st column being an accession number (the same accession
        numbers as in otu_file_fh) and the 2nd column being the taxonomy
        ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker
        database in .fasta format. This file refers to the "foundation"
        of the ghost-tree. Contains accession numbers and taxonomy
        labels.
    ghost_tree_fp : filehandle
        The Newick formatted ghost-tree is the final output of the
        ghost-tree tool. This is a phylogenetic tree designed for
        downstream diversity analyses.
    """
    os.system("mkdir tmp")
    global foundation_accession_genus_dic
    foundation_accession_genus_dic = {}
    global seqs
    extension_genus_accession_list_dic = \
        _extension_genus_accession_dic(otu_file_fh, extension_taxonomy_fh)
    skbio.write(_make_nr_foundation_alignment(
        foundation_alignment_fh, extension_genus_accession_list_dic),
        into="nr_foundation_alignment_gt.fasta", format="fasta")
    foundation_tree = _make_foundation_tree(
        "nr_foundation_alignment_gt.fasta")
    seqs = SequenceCollection.read(extension_seq_fh)
    for node in foundation_tree.tips():
        key_node, _ = str(node).split(":")
        key_node = foundation_accession_genus_dic[key_node]
        try:
            _make_mini_otu_files(key_node,
                                 extension_genus_accession_list_dic, seqs)
            os.system("muscle -in tmp/mini_seq_gt.fasta -out"
                      " tmp/mini_alignment_gt.fasta -quiet"
                      " -maxiters 2 -diags1")
            os.system("fasttree -nt -quiet tmp/mini_alignment_gt.fasta >"
                      " tmp/mini_tree_gt.nwk")
            mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                             into=TreeNode)
            node.extend(mini_tree.children[:])
        except Exception:
            continue
    os.system("rm -r tmp")
    ghost_tree_fp.write(str(foundation_tree))
    return str(foundation_tree).strip()
def extensions_onto_foundation(otu_file_fh, extension_taxonomy_fh,
                               extension_seq_fh, foundation_alignment_fh,
                               ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution, but high
    sequence variability causes poor multiple sequence alignments (these
    are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence they are used as the "foundation"),
    but poor taxonomic resolution. This script combines two genetic
    databases into one phylogenetic tree in .nwk format, taking advantage
    of the benefits of both databases, but allowing sequencing to be
    performed using the "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows
        containing accession numbers only. The format can be 1) the
        accession number is in the first and only column, or 2) each
        tab-delimited row contains further accession numbers that are
        part of that OTU cluster (as in the output of "ghost-tree
        group-extensions"). This file refers to the "extension trees".
        File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to the "extension trees", with
        the 1st column being an accession number (the same accession
        numbers as in otu_file_fh) and the 2nd column being the taxonomy
        ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker
        database in .fasta format. This file refers to the "foundation"
        of the ghost-tree. Contains accession numbers and taxonomy
        labels.
    ghost_tree_fp : folder
        Output folder containing files including:
        a) The Newick formatted ghost-tree, which is the final output of
           the ghost-tree tool. This is a phylogenetic tree designed for
           downstream diversity analyses.
        b) Accession IDs from the ghost-tree.nwk file that you can use
           for downstream analysis tools.
        c) A log error file (optional; produced if you pass '--stderr').
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}
    std_output, std_error = "", ""

    process = subprocess.Popen("muscle", shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    std_output, std_error = process.communicate()
    if re.search("command not found", std_error):
        print "muscle, multiple sequence aligner, is not found. Is it" \
              " installed? Is it in your path?"
    process = subprocess.Popen("fasttree", shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    std_output, std_error = process.communicate()
    if re.search("command not found", std_error):
        print "fasttree, phylogenetic tree builder, is not found. Is it" \
              " installed? Is it in your path?"
    # reset so later FastTree warnings accumulate from a clean slate
    std_output, std_error = "", ""

    os.mkdir("tmp")
    os.mkdir(ghost_tree_fp)
    extension_genus_accession_list_dic = \
        _extension_genus_accession_dic(otu_file_fh, extension_taxonomy_fh)
    skbio.write(_make_nr_foundation_alignment(
        foundation_alignment_fh, extension_genus_accession_list_dic),
        into=ghost_tree_fp + "/nr_foundation_alignment_gt.fasta",
        format="fasta")
    foundation_tree, all_std_error = _make_foundation_tree(
        ghost_tree_fp + "/nr_foundation_alignment_gt.fasta", std_error,
        ghost_tree_fp)
    seqs = SequenceCollection.read(extension_seq_fh)
    for node in foundation_tree.tips():
        key_node, _ = str(node).split(":")
        key_node = foundation_accession_genus_dic[key_node]
        try:
            _make_mini_otu_files(key_node,
                                 extension_genus_accession_list_dic, seqs)
            process = subprocess.Popen(
                "muscle -in tmp/mini_seq_gt.fasta"
                " -out tmp/mini_alignment_gt.fasta -quiet"
                " -maxiters 2 -diags1",
                shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            std_output, std_error = process.communicate()
            process = subprocess.Popen(
                "fasttree -nt -quiet tmp/mini_alignment_gt.fasta >"
                " tmp/mini_tree_gt.nwk",
                shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            std_output, std_error = process.communicate()
            all_std_error += ("FastTree warnings for genus " + key_node +
                              " are:\n" + std_error + "\n")
            mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                             into=TreeNode)
            node.extend(mini_tree.root_at_midpoint().children[:])
        except Exception:
            continue
    shutil.rmtree("tmp")
    ghost_tree_nwk = open(ghost_tree_fp + "/ghost_tree.nwk", "w")
    ghost_tree_nwk.write(str(foundation_tree))
    ghost_tree_nwk.close()
    _make_accession_id_file(ghost_tree_fp)
    return str(foundation_tree).strip(), all_std_error
os.mkdir('out/')
for label in os.listdir('../iqtree_GTR/out/'):
    trees = []
    for file in filter(lambda x: x.endswith('.treefile'),
                       os.listdir(f'../iqtree_GTR/out/{label}/')):
        tree = skbio.read(f'../iqtree_GTR/out/{label}/{file}',
                          'newick', skbio.TreeNode)
        outgroup = tree.find('sleb').ancestors()[0]
        tree = tree.root_at(outgroup)
        trees.append(tree)
    ctree = majority_consensus(trees)
    for node in ctree.traverse():
        node.children = sorted(node.children,
                               key=lambda x: len(list(x.tips())))
    skbio.write(ctree, 'newick', f'out/{label}.txt')

    # Save image as PNG
    fig, ax = plot_tree(ctree, tip_fontsize=8.5)
    ax.yaxis.set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.set_xlabel('')
    plt.savefig(f'out/{label}.png')
    plt.close()

    # Save image with supports
    for node in ctree.traverse():
        if node.support == 1:
            node.support = None
    fig, ax = plot_tree(ctree, tip_fontsize=8.5,