def test_input_file(self):
    with tempfile.NamedTemporaryFile(prefix='graftm_greengenes_tax_testing') as tf:
        tf.write('seq1\tbacteria;cyanobacteria'.encode())
        tf.flush()
        self.assertEqual(
            {'seq1': ['bacteria', 'cyanobacteria']},
            GreenGenesTaxonomy.read_file(tf.name).taxonomy)
def __init__(self, tree, taxonomy, seqinfo=None):
    '''
    Parameters
    ----------
    tree : dendropy.Tree
        dendropy.Tree object
    taxonomy : string
        Path to a file containing taxonomy information about the tree,
        either in Greengenes or taxtastic format (a seqinfo file must also
        be provided if the taxonomy is in taxtastic format).
    seqinfo : string
        Path to a seqinfo file. This is a .csv file with the first column
        denoting the sequence name, and the second column its most
        resolved taxonomic rank.
    '''
    self.encountered_nodes = {}
    self.encountered_taxonomies = set()
    self.tree = tree

    # Read in taxonomy
    logging.info("Reading in taxonomy")
    if seqinfo:
        logging.info("Importing taxtastic taxonomy from files: %s and %s"
                     % (taxonomy, seqinfo))
        gtns = Getaxnseq()
        self.taxonomy = gtns.read_taxtastic_taxonomy_and_seqinfo(
            open(taxonomy), open(seqinfo))
    else:
        try:
            logging.info("Reading Greengenes style taxonomy")
            self.taxonomy = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
        except MalformedGreenGenesTaxonomyException:
            raise Exception("Failed to read taxonomy as a Greengenes "
                            "formatted file. Was a taxtastic style "
                            "taxonomy provided with no seqinfo file?")
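# A minimal usage sketch (assumed, not from this source): constructing the
# decorator with a Greengenes-format taxonomy. The file paths here are
# hypothetical; decorate() is called with this argument pattern elsewhere in
# this code base.
tree = Tree.get(path='rerooted_tree.nwk', schema='newick')  # dendropy Tree
decorator = TreeDecorator(tree, 'greengenes_taxonomy.tsv')  # no seqinfo needed
# For a taxtastic-format taxonomy, a seqinfo .csv must also be given:
# decorator = TreeDecorator(tree, 'taxonomy.csv', seqinfo='seqinfo.csv')
decorator.decorate('decorated.tree', 'decorated_taxonomy.tsv', True)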
def test_read_semicolon_no_space(self):
    self.assertEqual(
        {'seq1': ['bacteria', 'cyanobacteria']},
        GreenGenesTaxonomy.read(StringIO('seq1\tbacteria;cyanobacteria')).taxonomy)
def test_strip_identifier(self):
    self.assertEqual(
        {'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria']},
        GreenGenesTaxonomy.read(StringIO(
            'seq1 \tbacteria;cyanobacteria;\n'
            'seq2\tbacteria;bluebacteria;;\n')).taxonomy)
def test_ignores_empty_lines(self):
    self.assertEqual(
        {'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria']},
        GreenGenesTaxonomy.read(StringIO(
            'seq1\tbacteria;cyanobacteria;\n'
            'seq2\tbacteria;bluebacteria;;\n'
            '\n')).taxonomy)
def test_removes_empties_at_end(self):
    self.assertEqual(
        {'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria']},
        GreenGenesTaxonomy.read(StringIO(
            'seq1\tbacteria;cyanobacteria;\n'
            'seq2\tbacteria;bluebacteria;;\n')).taxonomy)
def test_raises_when_missing_middle(self):
    with self.assertRaises(MalformedGreenGenesTaxonomyException):
        GreenGenesTaxonomy.read(StringIO(
            'seq1\tbacteria;cyanobacteria\n'
            'seq2\tbacteria;;cyanobacteria\n'))
def test_raises_when_duplicate_names(self):
    with self.assertRaises(DuplicateTaxonomyException):
        GreenGenesTaxonomy.read(StringIO(
            'seq1\tbacteria;cyanobacteria\n'
            'seq1\tbacteria;cyanobacteria\n'))
def test_ok_when_taxonomy_empty(self):
    self.assertEqual(
        {'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': []},
        GreenGenesTaxonomy.read(StringIO(
            'seq1\tbacteria;cyanobacteria\n'
            'seq2\t\n')).taxonomy)
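# A minimal sketch (assumed, not the library's actual implementation) of a
# parser satisfying the behaviours exercised by the tests above: tab-separated
# name and semicolon-separated lineage, identifiers stripped of whitespace,
# empty lines ignored, trailing empty ranks removed, an empty middle rank or a
# duplicate name rejected, and an entirely empty taxonomy accepted.
def read_greengenes_taxonomy(io):
    taxonomy = {}
    for line in io:
        line = line.rstrip('\n')
        if line.strip() == '':
            continue  # ignore empty lines
        fields = line.split('\t')
        if len(fields) != 2:
            raise MalformedGreenGenesTaxonomyException(
                "Unexpected number of fields in line: %s" % line)
        name = fields[0].strip()
        ranks = [r.strip() for r in fields[1].split(';')]
        while ranks and ranks[-1] == '':
            ranks.pop()  # remove empties at the end
        if any(r == '' for r in ranks):
            raise MalformedGreenGenesTaxonomyException(
                "Taxonomy rank missing in the middle of: %s" % line)
        if name in taxonomy:
            raise DuplicateTaxonomyException("Duplicate name: %s" % name)
        taxonomy[name] = ranks
    return taxonomy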
parser.add_argument('--greengenes_taxonomy',
                    help='tab then semi-colon separated "GreenGenes"-style '
                         'format definition of taxonomies',
                    required=True)
parser.add_argument('--sequences',
                    help='FASTA file of sequences to be compared',
                    required=True)
args = parser.parse_args()

if args.debug:
    loglevel = logging.DEBUG
elif args.quiet:
    loglevel = logging.ERROR
else:
    loglevel = logging.INFO
logging.basicConfig(level=loglevel,
                    format='%(asctime)s %(levelname)s: %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')

# Read in taxonomy
logging.info("Reading taxonomy..")
gg = GreenGenesTaxonomy.read(open(args.greengenes_taxonomy)).taxonomy
logging.info("Read in %i taxonomies" % len(gg))

# Read in sequences
logging.info("Reading sequences..")
duplicates = set()
sequences = {}
for name, seq, _ in SequenceIO()._readfq(open(args.sequences)):
    if name in sequences:
        logging.error("Duplicate sequence name %s" % name)
        duplicates.add(name)
    else:
        sequences[name] = seq
logging.warn("Found %i duplicated IDs" % len(duplicates))
for dup in duplicates:
    del sequences[dup]
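# Hypothetical illustration (not from this source) of the expected inputs.
# The --greengenes_taxonomy file is tab-separated: sequence name, a tab, then
# a semicolon-separated lineage, e.g.
#
#   seq1<TAB>k__Bacteria;p__Cyanobacteria
#   seq2<TAB>k__Bacteria;p__Proteobacteria
#
# A possible invocation of this script (script name assumed):
#
#   compare_taxonomy.py --greengenes_taxonomy taxonomy.tsv --sequences seqs.fasta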
def test_read_hello_world(self):
    self.assertEqual(
        {'seq1': ['bacteria', 'cyanobacteria']},
        GreenGenesTaxonomy.read(StringIO('seq1\tbacteria; cyanobacteria')).taxonomy)
def update(self, **kwargs):
    '''
    Update an existing GraftM package with new sequences and taxonomy. If
    no taxonomy is provided, attempt to decorate the new sequences with
    pre-existing taxonomy.

    Parameters
    ----------
    input_sequence_path: str
        Path to FASTA file containing sequences to add to the GraftM
        package being updated
    input_taxonomy_path: str
        Taxonomy corresponding to the sequences in input_sequence_path. If
        None, then attempt to assign taxonomy by decorating the tree made
        out of all sequences.
    input_graftm_package_path: str
        Path to the directory of the GraftM package that is to be updated
    output_graftm_package_path: str
        Path to the directory to which the new GraftM package will be
        written
    '''
    input_sequence_path = kwargs.pop('input_sequence_path')
    input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
    input_graftm_package_path = kwargs.pop('input_graftm_package_path')
    output_graftm_package_path = kwargs.pop('output_graftm_package_path')
    threads = kwargs.pop('threads', UpdateDefaultOptions.threads)  #TODO: add to user options
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    logging.info("Reading previous GraftM package")
    old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
    min_input_version = 3
    if old_gpkg.version < min_input_version:
        raise InsufficientGraftMPackageVersion(
            "GraftM below version %s cannot be updated using the update function." % min_input_version +
            " Unaligned sequences are not included in these packages, therefore no new"
            " alignment/HMM/Tree can be created")

    new_gpkg = UpdatedGraftMPackage()
    new_gpkg.output = output_graftm_package_path
    new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

    #######################################
    ### Collect all unaligned sequences ###
    logging.info("Concatenating unaligned sequence files")
    new_gpkg.unaligned_sequences = "%s_sequences.fa" % new_gpkg.name  #TODO: replace hard-coded paths like this with tempfiles
    self._concatenate_file(
        [old_gpkg.unaligned_sequence_database_path(), input_sequence_path],
        new_gpkg.unaligned_sequences)

    #########################################################
    ### Parse taxonomy info up front so errors come early ###
    if input_taxonomy_path:
        logging.info("Reading new taxonomy information")
        input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
        original_taxonomy_hash = old_gpkg.taxonomy_hash()
        total_taxonomy_hash = original_taxonomy_hash.copy()
        total_taxonomy_hash.update(input_taxonomy.taxonomy)
        num_duplicate_taxonomies = \
            len(original_taxonomy_hash) + \
            len(input_taxonomy.taxonomy) - \
            len(total_taxonomy_hash)
        logging.debug("Found %i taxonomic definitions in common between the previous and updated taxonomies"
                      % num_duplicate_taxonomies)
        if num_duplicate_taxonomies > 0:
            logging.warn("Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case."
                         % num_duplicate_taxonomies)

    ###############################
    ### Re-construct alignments ###
    logging.info("Multiple sequence aligning all sequences")
    new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % new_gpkg.name
    self._align_sequences(new_gpkg.unaligned_sequences,
                          new_gpkg.aligned_sequences, threads)

    ########################
    ### Re-construct HMM ###
    logging.info("Creating HMM from alignment")
    new_gpkg.hmm = "%s.hmm" % new_gpkg.name
    new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % new_gpkg.name
    self._get_hmm_from_alignment(new_gpkg.aligned_sequences,
                                 new_gpkg.hmm,
                                 new_gpkg.hmm_alignment)

    #########################
    ### Re-construct tree ###
    logging.info("Generating phylogenetic tree")
    new_gpkg.unrooted_tree = "%s.tre" % new_gpkg.name
    new_gpkg.unrooted_tree_log = "%s.tre.log" % new_gpkg.name
    new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(
        old_gpkg.alignment_hmm_path())
    new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
        self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                         new_gpkg.package_type, self.fasttree)

    ##############################################
    ### Re-root and decorate tree if necessary ###
    if input_taxonomy_path:
        new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
        new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
    else:
        logging.info("Finding taxonomy for new sequences")
        rerooter = Rerooter()
        old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                            schema='newick')
        new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                            schema='newick')
        old_tree = rerooter.reroot(old_tree)
        new_tree = rerooter.reroot(new_tree)
        # TODO: Shouldn't call an underscore method, eventually use
        # Rerooter instead.
        rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
        new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
        td = TreeDecorator(rerooted_tree,
                           old_gpkg.taxtastic_taxonomy_path(),
                           old_gpkg.taxtastic_seqinfo_path())
        with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
            td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
            total_taxonomy_hash = GreenGenesTaxonomy.read_file(taxonomy.name).taxonomy

    ################################
    ### Generating tree log file ###
    logging.info("Generating phylogenetic tree log file")
    new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
    new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
    self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                 new_gpkg.hmm_alignment,
                                 new_gpkg.gpkg_tree,
                                 new_gpkg.gpkg_tree_log,
                                 new_gpkg.package_type,
                                 self.fasttree)

    ################################
    ### Creating taxtastic files ###
    logging.info("Writing new taxonomy files")
    new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
    new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
    gtns = Getaxnseq()
    gtns.write_taxonomy_and_seqinfo_files(total_taxonomy_hash,
                                          new_gpkg.tt_taxonomy,
                                          new_gpkg.tt_seqinfo)

    ######################
    ### Compile refpkg ###
    logging.info("Compiling pplacer refpkg")
    new_gpkg.refpkg = "%s.refpkg" % new_gpkg.name
    refpkg = self._taxit_create(new_gpkg.name,
                                new_gpkg.hmm_alignment,
                                new_gpkg.gpkg_tree,
                                new_gpkg.gpkg_tree_log,
                                new_gpkg.tt_taxonomy,
                                new_gpkg.tt_seqinfo,
                                new_gpkg.refpkg,
                                True)

    #####################################
    ### Re-construct diamond database ###
    logging.info("Recreating DIAMOND DB")
    new_gpkg.diamond_database = "%s.dmnd" % new_gpkg.name
    self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

    ####################
    ### Compile gpkg ###
    logging.info("Compiling GraftM package")
    new_gpkg.name = "%s.gpkg" % new_gpkg.name
    GraftMPackageVersion3.compile(new_gpkg.name,
                                  new_gpkg.refpkg,
                                  new_gpkg.hmm,
                                  new_gpkg.diamond_database,
                                  self._define_range(new_gpkg.unaligned_sequences),
                                  new_gpkg.unaligned_sequences,
                                  search_hmm_files=old_gpkg.search_hmm_paths())

    ###################
    ### Test it out ###
    logging.info("Testing newly updated GraftM package works")
    self._test_package(new_gpkg.name)
    logging.info("Finished")
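# A hedged usage sketch of update() above. The name of the enclosing class
# (`Update` here) and all file paths are assumptions for illustration; the
# keyword arguments are the ones documented in the docstring.
Update().update(
    input_sequence_path='new_seqs.fasta',        # hypothetical path
    input_taxonomy_path='new_taxonomy.tsv',      # or None to decorate from the old tree
    input_graftm_package_path='old.gpkg',        # hypothetical path
    output_graftm_package_path='updated.gpkg')   # hypothetical path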
def run(self, **kwargs):
    forward_read_files = kwargs.pop('sequences')
    output_otu_table = kwargs.pop('otu_table', None)
    archive_otu_table = kwargs.pop('archive_otu_table', None)
    num_threads = kwargs.pop('threads')
    known_otu_tables = kwargs.pop('known_otu_tables')
    singlem_assignment_method = kwargs.pop('assignment_method')
    output_jplace = kwargs.pop('output_jplace')
    output_extras = kwargs.pop('output_extras')
    evalue = kwargs.pop('evalue')
    min_orf_length = kwargs.pop('min_orf_length')
    restrict_read_length = kwargs.pop('restrict_read_length')
    filter_minimum_protein = kwargs.pop('filter_minimum_protein')
    filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
    include_inserts = kwargs.pop('include_inserts')
    singlem_packages = kwargs.pop('singlem_packages')
    window_size = kwargs.pop('window_size')
    assign_taxonomy = kwargs.pop('assign_taxonomy')
    known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')
    working_directory = kwargs.pop('working_directory')
    force = kwargs.pop('force')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    self._num_threads = num_threads
    self._evalue = evalue
    self._min_orf_length = min_orf_length
    self._restrict_read_length = restrict_read_length
    self._filter_minimum_protein = filter_minimum_protein
    self._filter_minimum_nucleotide = filter_minimum_nucleotide

    hmms = HmmDatabase(singlem_packages)
    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
        graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
    else:
        graftm_assignment_method = singlem_assignment_method

    if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
        self._graftm_verbosity = '5'
    else:
        self._graftm_verbosity = '2'

    using_temporary_working_directory = working_directory is None
    if using_temporary_working_directory:
        shared_mem_directory = '/dev/shm'
        if os.path.exists(shared_mem_directory):
            logging.debug("Using shared memory as a base directory")
            tmp = tempdir.TempDir(basedir=shared_mem_directory)
            tempfiles_path = os.path.join(tmp.name, 'tempfiles')
            os.mkdir(tempfiles_path)
            os.environ['TEMP'] = tempfiles_path
        else:
            logging.debug("Shared memory directory not detected, using default temporary directory instead")
            tmp = tempdir.TempDir()
        working_directory = tmp.name
    else:
        if os.path.exists(working_directory):
            if force:
                logging.info("Overwriting directory %s" % working_directory)
                shutil.rmtree(working_directory)
                os.mkdir(working_directory)
            else:
                raise Exception("Working directory '%s' already exists, not continuing" % working_directory)
        else:
            os.mkdir(working_directory)
    logging.debug("Using working directory %s" % working_directory)
    self._working_directory = working_directory
    extracted_reads = None

    def return_cleanly():
        if extracted_reads:
            extracted_reads.cleanup()
        if using_temporary_working_directory:
            tmp.dissolve()
        logging.info("Finished")

    #### Search
    self._singlem_package_database = hmms
    search_result = self._search(hmms, forward_read_files)
    sample_names = search_result.samples_with_hits()
    if len(sample_names) == 0:
        logging.info("No reads identified in any samples, stopping")
        return_cleanly()
        return
    logging.debug("Recovered %i samples with at least one hit e.g. '%s'"
                  % (len(sample_names), sample_names[0]))

    #### Alignment
    align_result = self._align(search_result)

    ### Extract reads that have already known taxonomy
    if known_otu_tables:
        logging.info("Parsing known taxonomy OTU tables")
        known_taxes = KnownOtuTable()
        known_taxes.parse_otu_tables(known_otu_tables)
        logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
    else:
        known_taxes = []
    if known_sequence_taxonomy:
        logging.debug("Parsing sequence-wise taxonomy..")
        tax1 = GreenGenesTaxonomy.read(open(known_sequence_taxonomy)).taxonomy
        known_sequence_tax = {}
        for seq_id, tax in tax1.items():
            known_sequence_tax[seq_id] = '; '.join(tax)
        logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file"
                     % len(known_sequence_tax))

    ### Extract other reads which do not have known taxonomy
    extracted_reads = self._extract_relevant_reads(
        align_result, include_inserts, known_taxes)
    logging.info("Finished extracting aligned sequences")

    #### Taxonomic assignment
    if assign_taxonomy:
        logging.info("Running taxonomic assignment with graftm..")
        assignment_result = self._assign_taxonomy(
            extracted_reads, graftm_assignment_method)

    #### Process taxonomically assigned reads
    # get the sequences out for each of them
    otu_table_object = OtuTable()
    regular_output_fields = split('gene sample sequence num_hits coverage taxonomy')
    otu_table_object.fields = regular_output_fields + \
        split('read_names nucleotides_aligned taxonomy_by_known?')

    for sample_name, singlem_package, tmp_graft, known_sequences, unknown_sequences in extracted_reads:
        def add_info(infos, otu_table_object, known_tax):
            for info in infos:
                to_print = [
                    singlem_package.graftm_package_basename(),
                    sample_name,
                    info.seq,
                    info.count,
                    info.coverage,
                    info.taxonomy,
                    info.names,
                    info.aligned_lengths,
                    known_tax]
                otu_table_object.data.append(to_print)

        known_infos = self._seqs_to_counts_and_taxonomy(
            known_sequences, known_taxes, False, True)
        add_info(known_infos, otu_table_object, True)

        if tmp_graft:  # if any sequences were aligned (not just already known)
            tmpbase = os.path.basename(tmp_graft.name[:-6])  # remove .fasta
            if assign_taxonomy:
                is_known_taxonomy = False
                aligned_seqs = self._get_windowed_sequences(
                    assignment_result.prealigned_sequence_file(
                        sample_name, singlem_package, tmpbase),
                    assignment_result.nucleotide_hits_file(
                        sample_name, singlem_package, tmpbase),
                    singlem_package,
                    include_inserts)
                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    tax_file = assignment_result.diamond_assignment_file(
                        sample_name, singlem_package, tmpbase)
                else:
                    tax_file = assignment_result.read_tax_file(
                        sample_name, singlem_package, tmpbase)
                logging.debug("Reading taxonomy from %s" % tax_file)
                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    taxonomies = DiamondResultParser(tax_file)
                    use_first = True
                else:
                    if not os.path.isfile(tax_file):
                        logging.warn("Unable to find tax file for gene %s from sample %s "
                                     "(likely due to min length filtering), skipping"
                                     % (os.path.basename(singlem_package.base_directory()),
                                        sample_name))
                        taxonomies = {}
                    else:
                        taxonomies = TaxonomyFile(tax_file)
                    use_first = False
            else:  # Taxonomy has not been assigned.
                aligned_seqs = unknown_sequences
                if known_sequence_taxonomy:
                    taxonomies = known_sequence_tax
                else:
                    taxonomies = {}
                use_first = False  # irrelevant
                is_known_taxonomy = True

            new_infos = list(self._seqs_to_counts_and_taxonomy(
                aligned_seqs, taxonomies, use_first, False))
            add_info(new_infos, otu_table_object, is_known_taxonomy)

            if output_jplace:
                base_dir = assignment_result._base_dir(
                    sample_name, singlem_package, tmpbase)
                input_jplace_file = os.path.join(base_dir, "placements.jplace")
                output_jplace_file = os.path.join(base_dir, "%s_%s_%s.jplace" % (
                    output_jplace, sample_name,
                    singlem_package.graftm_package_basename()))
                logging.debug("Converting jplace file %s to singlem jplace file %s"
                              % (input_jplace_file, output_jplace_file))
                with open(output_jplace_file, 'w') as output_jplace_io:
                    self._write_jplace_from_infos(
                        open(input_jplace_file), new_infos, output_jplace_io)

    if output_otu_table:
        with open(output_otu_table, 'w') as f:
            if output_extras:
                otu_table_object.write_to(f, otu_table_object.fields)
            else:
                otu_table_object.write_to(f, regular_output_fields)
    if archive_otu_table:
        with open(archive_otu_table, 'w') as f:
            otu_table_object.archive(hmms.singlem_packages).write_to(f)

    return_cleanly()
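# Hypothetical illustration (all values invented) of one OTU table row as
# appended by add_info() above, in the field order set on
# otu_table_object.fields:
#   gene, sample, sequence, num_hits, coverage, taxonomy,
#   read_names, nucleotides_aligned, taxonomy_by_known?
example_row = [
    'ribosomal_protein_S2_rpsB',  # gene (graftm package basename; invented)
    'sample1',                    # sample
    'ATG...',                     # aligned window sequence (elided)
    3,                            # num_hits
    4.9,                          # coverage
    'Root; d__Bacteria',          # taxonomy
    ['read1', 'read2', 'read3'],  # read_names
    [60, 60, 57],                 # nucleotides_aligned
    False]                        # taxonomy_by_known?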
def main(self, **kwargs):
    alignment = kwargs.pop('alignment', None)
    sequences = kwargs.pop('sequences', None)
    taxonomy = kwargs.pop('taxonomy', None)
    rerooted_tree = kwargs.pop('rerooted_tree', None)
    unrooted_tree = kwargs.pop('unrooted_tree', None)
    tree_log = kwargs.pop('tree_log', None)
    prefix = kwargs.pop('prefix', None)
    rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
    user_hmm = kwargs.pop('hmm', None)
    search_hmm_files = kwargs.pop('search_hmm_files', None)
    min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
    taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
    taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
    force_overwrite = kwargs.pop('force', False)
    graftm_package = kwargs.pop('graftm_package', False)
    dereplication_level = kwargs.pop('dereplication_level', False)
    threads = kwargs.pop('threads', 5)
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    seqio = SequenceIO()
    locus_name = (os.path.basename(sequences).split('.')[0]
                  if sequences
                  else os.path.basename(alignment).split('.')[0])
    tmp = tempdir.TempDir()
    base = os.path.join(tmp.name, locus_name)
    insufficiently_aligned_sequences = [None]
    removed_sequence_names = []
    tempfiles_to_close = []

    if prefix:
        output_gpkg_path = prefix
    else:
        output_gpkg_path = "%s.gpkg" % locus_name

    if os.path.exists(output_gpkg_path):
        if force_overwrite:
            logging.warn("Deleting previous directory %s" % output_gpkg_path)
            shutil.rmtree(output_gpkg_path)
        else:
            raise Exception("Cowardly refusing to overwrite gpkg to already existing %s" % output_gpkg_path)
    logging.info("Building gpkg for %s" % output_gpkg_path)

    # Read in taxonomy somehow
    gtns = Getaxnseq()
    if rerooted_annotated_tree:
        logging.info("Building seqinfo and taxonomy file from input annotated tree")
        taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
            Tree.get(path=rerooted_annotated_tree, schema='newick'))
    elif taxonomy:
        logging.info("Building seqinfo and taxonomy file from input taxonomy")
        taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
    elif taxtastic_seqinfo and taxtastic_taxonomy:
        logging.info("Reading taxonomy from taxtastic taxonomy and seqinfo files")
        taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo(
            open(taxtastic_taxonomy), open(taxtastic_seqinfo))
    else:
        raise Exception("Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

    # Check for duplicates
    logging.info("Checking for duplicate sequences")
    dup = self._check_for_duplicate_sequence_names(sequences)
    if dup:
        raise Exception("Found duplicate sequence name '%s' in sequences input file" % dup)

    output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
    tempfiles_to_close.append(output_alignment_fh)
    output_alignment = output_alignment_fh.name
    if user_hmm:
        align_hmm = user_hmm
    else:
        align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_align.hmm')
        tempfiles_to_close.append(align_hmm_fh)
        align_hmm = align_hmm_fh.name

    if alignment:
        dup = self._check_for_duplicate_sequence_names(alignment)
        if dup:
            raise Exception("Found duplicate sequence name '%s' in alignment input file" % dup)
        ptype = self._get_hmm_from_alignment(alignment, align_hmm, output_alignment)
    else:
        logging.info("Aligning sequences to create aligned FASTA file")
        ptype, output_alignment = self._align_and_create_hmm(
            sequences, alignment, user_hmm, align_hmm, output_alignment, threads)

    logging.info("Checking for incorrect or fragmented reads")
    insufficiently_aligned_sequences = self._check_reads_hit(
        open(output_alignment), min_aligned_percent)
    while len(insufficiently_aligned_sequences) > 0:
        logging.warn("One or more alignments do not span > %.2f %% of HMM"
                     % (min_aligned_percent * 100))
        for s in insufficiently_aligned_sequences:
            logging.warn("Insufficient alignment of %s, not including this sequence" % s)

        sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
        tempfiles_to_close.append(sequences2_fh)
        sequences2 = sequences2_fh.name
        num_sequences = self._remove_sequences_from_alignment(
            insufficiently_aligned_sequences, sequences, sequences2)
        sequences = sequences2

        if alignment:
            alignment2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
            tempfiles_to_close.append(alignment2_fh)
            alignment2 = alignment2_fh.name
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, alignment, alignment2)
            alignment = alignment2

        for name in insufficiently_aligned_sequences:
            if rerooted_tree or rerooted_annotated_tree:
                logging.warning(
                    "Sequence %s in the provided alignment does not meet the "
                    "--min_aligned_percent cutoff. This sequence will be removed from "
                    "the tree in the final GraftM package. If you are sure these "
                    "sequences are correct, turn off the --min_aligned_percent cutoff "
                    "by providing it with a 0 (e.g. --min_aligned_percent 0)" % name)
                removed_sequence_names.append(name)

        logging.info("After removing %i insufficiently aligned sequences, left with %i sequences"
                     % (len(insufficiently_aligned_sequences), num_sequences))
        if num_sequences < 4:
            raise Exception("Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i" % num_sequences)
        else:
            logging.info("Reconstructing the alignment and HMM from remaining sequences")
            output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
            tempfiles_to_close.append(output_alignment_fh)
            output_alignment = output_alignment_fh.name
            if not user_hmm:
                align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.hmm')
                tempfiles_to_close.append(align_hmm_fh)
                align_hmm = align_hmm_fh.name
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment, threads)
            logging.info("Checking for incorrect or fragmented reads")
            insufficiently_aligned_sequences = self._check_reads_hit(
                open(output_alignment), min_aligned_percent)

    if not search_hmm_files:
        search_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_search.hmm')
        tempfiles_to_close.append(search_hmm_fh)
        search_hmm = search_hmm_fh.name
        self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                dereplication_level, threads)
        search_hmm_files = [search_hmm]

    # Make sure each sequence has been assigned a taxonomy:
    aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
    unannotated = []
    for s in aligned_sequence_objects:
        if s.name not in taxonomy_definition:
            unannotated.append(s.name)
    if len(unannotated) > 0:
        for s in unannotated:
            logging.error("Unable to find sequence '%s' in the taxonomy definition" % s)
        raise Exception("All sequences must be assigned a taxonomy, cannot continue")

    logging.debug("Looking for non-standard characters in aligned sequences")
    self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

    # Deduplicate sequences - pplacer cannot handle these
    logging.info("Deduplicating sequences")
    dedup = Deduplicator()
    deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
    deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays, taxonomy_definition)
    deduplicated_taxonomy_hash = {}
    for i, tax in enumerate(deduplicated_taxonomy):
        deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
    deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
    seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                           deduplicated_alignment_file)
    logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"
                 % (len(aligned_sequence_objects) - len(deduplicated_arrays),
                    len(deduplicated_arrays)))

    # Get corresponding unaligned sequences
    filtered_names = []
    for duplicate_group in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
        for seq in duplicate_group:
            filtered_names.append(seq.name)
    sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
    tempfiles_to_close.append(sequences2_fh)
    sequences2 = sequences2_fh.name

    # Create tree unless one was provided
    if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
        logging.debug("No tree provided")
        logging.info("Building tree")
        log_file, tre_file = self._build_tree(
            deduplicated_alignment_file, base, ptype, self.fasttree)
        no_reroot = False
    else:
        if rerooted_tree:
            logging.debug("Found unannotated pre-rerooted tree file %s" % rerooted_tree)
            tre_file = rerooted_tree
            no_reroot = True
        elif rerooted_annotated_tree:
            logging.debug("Found annotated pre-rerooted tree file %s" % rerooted_annotated_tree)
            tre_file = rerooted_annotated_tree
            no_reroot = True
        elif unrooted_tree:
            logging.info("Using input unrooted tree")
            tre_file = unrooted_tree
            no_reroot = False
        else:
            raise

        # Remove any sequences from the tree that are duplicates
        cleaner = DendropyTreeCleaner()
        tree = Tree.get(path=tre_file, schema='newick')
        for group in deduplicated_arrays:
            for s in group[1:]:
                removed_sequence_names.append(s.name)
        cleaner.remove_sequences(tree, removed_sequence_names)

        # Ensure there is nothing amiss now as a user-interface thing
        cleaner.match_alignment_and_tree_sequence_ids(
            [g[0].name for g in deduplicated_arrays], tree)

        if tree_log:
            # User specified a log file, go with that
            logging.debug("Using user-specified log file %s" % tree_log)
            log_file = tree_log
        else:
            logging.info("Generating log file")
            log_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree_log', prefix='graftm')
            tempfiles_to_close.append(log_file_tempfile)
            log_file = log_file_tempfile.name
            tre_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm')
            tempfiles_to_close.append(tre_file_tempfile)
            tre_file = tre_file_tempfile.name
            with tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm') as f:
                # Make the newick file simple (ie. un-arb it) for fasttree.
                cleaner.write_fasttree_newick(tree, f)
                f.flush()
                self._generate_tree_log_file(f.name, deduplicated_alignment_file,
                                             tre_file, log_file, ptype,
                                             self.fasttree)

    # Create tax and seqinfo .csv files
    taxonomy_to_keep = [
        seq.name for seq in
        [x for x in [x[0] for x in deduplicated_arrays] if x]
    ]
    refpkg = "%s.refpkg" % output_gpkg_path
    self.the_trash.append(refpkg)
    if taxtastic_taxonomy and taxtastic_seqinfo:
        logging.info("Creating reference package")
        refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                    tre_file, log_file, taxtastic_taxonomy,
                                    taxtastic_seqinfo, refpkg, no_reroot)
    else:
        gtns = Getaxnseq()
        seq = base + "_seqinfo.csv"
        tax = base + "_taxonomy.csv"
        self.the_trash += [seq, tax]
        if rerooted_annotated_tree:
            logging.info("Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
                Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info("Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
        else:
            raise Exception("Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

        taxonomy_definition = {
            x: taxonomy_definition[x]
            for x in taxonomy_definition
            if x in taxonomy_to_keep}
        gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax, seq)

        # Create the reference package
        logging.info("Creating reference package")
        refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                    tre_file, log_file, tax, seq, refpkg,
                                    no_reroot)

    if sequences:
        # Run diamond makedb
        logging.info("Creating diamond database")
        if ptype == Create._PROTEIN_PACKAGE_TYPE:
            cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
            extern.run(cmd)
            diamondb = '%s.dmnd' % base
        elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
            diamondb = None
        else:
            raise Exception("Programming error")
    else:
        diamondb = None

    if sequences:
        # Get range
        max_range = self._define_range(sequences)
    else:
        max_range = self._define_range(alignment)

    # Compile the gpkg
    logging.info("Compiling gpkg")
    GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm,
                                  diamondb, max_range, sequences,
                                  search_hmm_files=search_hmm_files)

    logging.info("Cleaning up")
    self._cleanup(self.the_trash)
    for tf in tempfiles_to_close:
        tf.close()

    # Test out the gpkg just to be sure.
    #
    # TODO: Use graftM through internal means rather than via extern. This
    # requires some refactoring so that graft() can be called easily with
    # sane defaults.
    logging.info("Testing gpkg package works")
    self._test_package(output_gpkg_path)

    logging.info("Finished\n")
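# A hedged usage sketch of the package-creation pipeline above. `Create` is
# named in this source, but a no-argument constructor is assumed, and the file
# paths and keyword choices are invented for illustration.
Create().main(
    sequences='rpsB.faa',          # hypothetical unaligned FASTA
    taxonomy='rpsB_taxonomy.tsv',  # GreenGenes-style taxonomy file
    prefix='rpsB.gpkg',            # output gpkg directory
    threads=5)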
def test_raises_when_incorrect_num_fields(self):
    with self.assertRaises(MalformedGreenGenesTaxonomyException):
        GreenGenesTaxonomy.read(StringIO(
            'seq1\tbacteria;cyanobacteria\n'
            'seq2\n'))
def run_to_otu_table(self, **kwargs):
    '''Run the pipe, returning an OtuTable object, or None if no reads were
    identified in any sample.'''
    forward_read_files = kwargs.pop('sequences')
    num_threads = kwargs.pop('threads')
    known_otu_tables = kwargs.pop('known_otu_tables')
    singlem_assignment_method = kwargs.pop('assignment_method')
    output_jplace = kwargs.pop('output_jplace')
    evalue = kwargs.pop('evalue')
    min_orf_length = kwargs.pop('min_orf_length')
    restrict_read_length = kwargs.pop('restrict_read_length')
    filter_minimum_protein = kwargs.pop('filter_minimum_protein')
    filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
    include_inserts = kwargs.pop('include_inserts')
    singlem_packages = kwargs.pop('singlem_packages')
    assign_taxonomy = kwargs.pop('assign_taxonomy')
    known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')
    working_directory = kwargs.pop('working_directory')
    force = kwargs.pop('force')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    self._num_threads = num_threads
    self._evalue = evalue
    self._min_orf_length = min_orf_length
    self._restrict_read_length = restrict_read_length
    self._filter_minimum_protein = filter_minimum_protein
    self._filter_minimum_nucleotide = filter_minimum_nucleotide

    hmms = HmmDatabase(singlem_packages)
    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
        graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
    else:
        graftm_assignment_method = singlem_assignment_method

    if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
        self._graftm_verbosity = '5'
    else:
        self._graftm_verbosity = '2'

    if not assign_taxonomy:
        singlem_assignment_method = NO_ASSIGNMENT_METHOD

    using_temporary_working_directory = working_directory is None
    if using_temporary_working_directory:
        shared_mem_directory = '/dev/shm'
        if os.path.exists(shared_mem_directory):
            logging.debug("Using shared memory as a base directory")
            tmp = tempdir.TempDir(basedir=shared_mem_directory)
            tempfiles_path = os.path.join(tmp.name, 'tempfiles')
            os.mkdir(tempfiles_path)
            os.environ['TEMP'] = tempfiles_path
        else:
            logging.debug("Shared memory directory not detected, using default temporary directory instead")
            tmp = tempdir.TempDir()
        working_directory = tmp.name
    else:
        if os.path.exists(working_directory):
            if force:
                logging.info("Overwriting directory %s" % working_directory)
                shutil.rmtree(working_directory)
                os.mkdir(working_directory)
            else:
                raise Exception("Working directory '%s' already exists, not continuing" % working_directory)
        else:
            os.mkdir(working_directory)
    logging.debug("Using working directory %s" % working_directory)
    self._working_directory = working_directory
    extracted_reads = None

    def return_cleanly():
        if using_temporary_working_directory:
            tmp.dissolve()
        logging.info("Finished")

    #### Search
    self._singlem_package_database = hmms
    search_result = self._search(hmms, forward_read_files)
    sample_names = search_result.samples_with_hits()
    if len(sample_names) == 0:
        logging.info("No reads identified in any samples, stopping")
        return_cleanly()
        return None
    logging.debug("Recovered %i samples with at least one hit e.g. '%s'"
                  % (len(sample_names), sample_names[0]))

    #### Alignment
    align_result = self._align(search_result)

    ### Extract reads that have already known taxonomy
    if known_otu_tables:
        logging.info("Parsing known taxonomy OTU tables")
        known_taxes = KnownOtuTable()
        known_taxes.parse_otu_tables(known_otu_tables)
        logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
    else:
        known_taxes = []
    if known_sequence_taxonomy:
        logging.debug("Parsing sequence-wise taxonomy..")
        tax1 = GreenGenesTaxonomy.read(open(known_sequence_taxonomy)).taxonomy
        known_sequence_tax = {}
        for seq_id, tax in tax1.items():
            known_sequence_tax[seq_id] = '; '.join(tax)
        logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file"
                     % len(known_sequence_tax))

    ### Extract other reads which do not have known taxonomy
    extracted_reads = self._extract_relevant_reads(
        align_result, include_inserts, known_taxes)
    logging.info("Finished extracting aligned sequences")

    #### Taxonomic assignment
    if assign_taxonomy:
        logging.info("Running taxonomic assignment with GraftM..")
        assignment_result = self._assign_taxonomy(
            extracted_reads, graftm_assignment_method)

    #### Process taxonomically assigned reads
    # get the sequences out for each of them
    otu_table_object = OtuTable()
    if singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
        package_to_taxonomy_bihash = {}

    for readset in extracted_reads:
        sample_name = readset.sample_name
        singlem_package = readset.singlem_package
        known_sequences = readset.known_sequences

        def add_info(infos, otu_table_object, known_tax):
            for info in infos:
                to_print = [
                    singlem_package.graftm_package_basename(),
                    sample_name,
                    info.seq,
                    info.count,
                    info.coverage,
                    info.taxonomy,
                    info.names,
                    info.aligned_lengths,
                    known_tax]
                otu_table_object.data.append(to_print)

        known_infos = self._seqs_to_counts_and_taxonomy(
            known_sequences, NO_ASSIGNMENT_METHOD, known_taxes,
            known_sequence_taxonomy, None)
        add_info(known_infos, otu_table_object, True)

        if len(readset.unknown_sequences) > 0:  # if any sequences were aligned (not just already known)
            tmpbase = readset.tmpfile_basename
            if assign_taxonomy:
                is_known_taxonomy = False
                aligned_seqs = list(itertools.chain(
                    readset.unknown_sequences, readset.known_sequences))

                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    tax_file = assignment_result.diamond_assignment_file(
                        sample_name, singlem_package, tmpbase)
                    taxonomies = DiamondResultParser(tax_file)
                elif singlem_assignment_method == DIAMOND_ASSIGNMENT_METHOD:
                    tax_file = assignment_result.read_tax_file(
                        sample_name, singlem_package, tmpbase)
                    if not os.path.isfile(tax_file):
                        logging.warn("Unable to find tax file for gene %s from sample %s "
                                     "(likely due to min length filtering), skipping"
                                     % (os.path.basename(singlem_package.base_directory()),
                                        sample_name))
                        taxonomies = {}
                    else:
                        taxonomies = TaxonomyFile(tax_file)
                elif singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
                    bihash_key = singlem_package.base_directory()
                    if bihash_key in package_to_taxonomy_bihash:
                        taxonomy_bihash = package_to_taxonomy_bihash[bihash_key]
                    else:
                        taxtastic_taxonomy = singlem_package.graftm_package().taxtastic_taxonomy_path()
                        logging.debug("Reading taxtastic taxonomy from %s" % taxtastic_taxonomy)
                        with open(taxtastic_taxonomy) as f:
                            taxonomy_bihash = TaxonomyBihash.parse_taxtastic_taxonomy(f)
                        package_to_taxonomy_bihash[bihash_key] = taxonomy_bihash
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    jplace_file = os.path.join(base_dir, "placements.jplace")
                    logging.debug("Attempting to read jplace output from %s" % jplace_file)
                    if os.path.exists(jplace_file):
                        with open(jplace_file) as f:
                            jplace_json = json.loads(f.read())
                        placement_parser = PlacementParser(
                            jplace_json, taxonomy_bihash, 0.5)
                    else:
                        # Sometimes alignments are filtered out.
                        placement_parser = None
                    taxonomies = {}
                elif singlem_assignment_method == NO_ASSIGNMENT_METHOD:
                    taxonomies = {}
                else:
                    raise Exception("Programming error")
            else:  # Taxonomy has not been assigned.
                aligned_seqs = readset.unknown_sequences
                if known_sequence_taxonomy:
                    taxonomies = known_sequence_tax
                else:
                    taxonomies = {}
                is_known_taxonomy = True

            new_infos = list(self._seqs_to_counts_and_taxonomy(
                aligned_seqs, singlem_assignment_method,
                known_sequence_tax if known_sequence_taxonomy else {},
                taxonomies,
                placement_parser if singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD else None))
            add_info(new_infos, otu_table_object, is_known_taxonomy)

            if output_jplace:
                base_dir = assignment_result._base_dir(
                    sample_name, singlem_package, tmpbase)
                input_jplace_file = os.path.join(base_dir, "placements.jplace")
                output_jplace_file = "%s_%s_%s.jplace" % (
                    output_jplace, sample_name,
                    singlem_package.graftm_package_basename())
                logging.info("Writing jplace file '%s'" % output_jplace_file)
                logging.debug("Converting jplace file %s to singlem jplace file %s"
                              % (input_jplace_file, output_jplace_file))
                with open(output_jplace_file, 'w') as output_jplace_io:
                    self._write_jplace_from_infos(
                        open(input_jplace_file), new_infos, output_jplace_io)

    return_cleanly()
    return otu_table_object
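# A hedged usage sketch: run_to_otu_table() returns an OtuTable (or None if
# nothing hit). The pipe class name (`SearchPipe`) and all keyword values here
# are assumptions for illustration; every keyword shown is required, since the
# method pops each one without a default.
otu_table = SearchPipe().run_to_otu_table(
    sequences=['sample1.fastq.gz'],  # hypothetical reads file
    threads=4,
    known_otu_tables=None,
    assignment_method=DIAMOND_ASSIGNMENT_METHOD,
    output_jplace=None,
    evalue='1e-5',
    min_orf_length=96,
    restrict_read_length=None,
    filter_minimum_protein=28,
    filter_minimum_nucleotide=95,
    include_inserts=False,
    singlem_packages=['S1.5.ribosomal_protein_S2_rpsB.spkg'],
    assign_taxonomy=True,
    known_sequence_taxonomy=None,
    working_directory=None,
    force=False)
if otu_table is not None:
    for row in otu_table.data:  # rows as appended by add_info() above
        print(row)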