Exemplo n.º 1
0
 def test_strip_identifier(self):
     # A space between the identifier and the tab separator must not end up
     # in the dictionary key ('seq1 ' is expected to be read as 'seq1').
     taxonomy_text = ('seq1 \tbacteria;cyanobacteria;\n'
                      'seq2\tbacteria;bluebacteria;;\n')
     expected = {
         'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria'],
     }
     observed = GreenGenesTaxonomy.read(StringIO(taxonomy_text)).taxonomy
     self.assertEqual(expected, observed)
    parser.add_argument('--greengenes_taxonomy', help='tab then semi-colon separated "GreenGenes"-skyle format definition of taxonomies', required=True)
    parser.add_argument('--sequences', help='FASTA file of sequences to be compared', required=True)

    args = parser.parse_args()
    if args.debug:
        loglevel = logging.DEBUG
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.INFO
    logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    
    
    # Read in taxonomy
    logging.info("Reading taxonomy..")
    gg = GreenGenesTaxonomy.read(open(args.greengenes_taxonomy)).taxonomy
    logging.info("Read in %i taxonomies" % len(gg))
    
    # Read in sequence
    logging.info("Reading sequences..")
    duplicates = set()
    sequences = {}
    for name, seq, _  in SequenceIO()._readfq(open(args.sequences)):
        if name in sequences:
            logging.error("Duplicate sequence name %s" % name)
            duplicates.add(name)
        else:
            sequences[name] = seq
    logging.warn("Found %i duplicated IDs" % len(duplicates))
    for dup in duplicates:
        del sequences[dup]
Exemplo n.º 3
0
 def test_removes_empties_at_end(self):
     # Lines ending in one or two ';' are expected to yield no empty
     # trailing taxonomy levels.
     taxonomy_text = ('seq1\tbacteria;cyanobacteria;\n'
                      'seq2\tbacteria;bluebacteria;;\n')
     expected = {
         'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria'],
     }
     observed = GreenGenesTaxonomy.read(StringIO(taxonomy_text)).taxonomy
     self.assertEqual(expected, observed)
Exemplo n.º 4
0
 def test_ignores_empty_lines(self):
     # The final blank line in the input is expected to contribute no entry.
     taxonomy_text = ('seq1\tbacteria;cyanobacteria;\n'
                      'seq2\tbacteria;bluebacteria;;\n'
                      '\n')
     expected = {
         'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria'],
     }
     observed = GreenGenesTaxonomy.read(StringIO(taxonomy_text)).taxonomy
     self.assertEqual(expected, observed)
Exemplo n.º 5
0
 def test_raises_when_duplicate_names(self):
     # The same identifier on two lines must be rejected.
     duplicated_ids = StringIO('seq1\tbacteria;cyanobacteria\n'
                               'seq1\tbacteria;cyanobacteria\n')
     self.assertRaises(DuplicateTaxonomyException,
                       GreenGenesTaxonomy.read, duplicated_ids)
Exemplo n.º 6
0
 def test_raises_when_missing_middle(self):
     # An empty level between two non-empty levels ('bacteria;;...') must
     # be rejected as malformed.
     gap_in_lineage = StringIO('seq1\tbacteria;cyanobacteria\n'
                               'seq2\tbacteria;;cyanobacteria\n')
     self.assertRaises(MalformedGreenGenesTaxonomyException,
                       GreenGenesTaxonomy.read, gap_in_lineage)
Exemplo n.º 7
0
 def test_raises_when_incorrect_num_fields(self):
     # A line holding only an identifier (no tab, no taxonomy field) must
     # be rejected as malformed.
     missing_field = StringIO('seq1\tbacteria;cyanobacteria\n'
                              'seq2\n')
     self.assertRaises(MalformedGreenGenesTaxonomyException,
                       GreenGenesTaxonomy.read, missing_field)
Exemplo n.º 8
0
 def test_ok_when_taxonomy_empty(self):
     # An identifier followed by a tab and nothing else is accepted and
     # maps to an empty list of levels.
     taxonomy_text = ('seq1\tbacteria;cyanobacteria\n'
                      'seq2\t\n')
     expected = {
         'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': [],
     }
     observed = GreenGenesTaxonomy.read(StringIO(taxonomy_text)).taxonomy
     self.assertEqual(expected, observed)
Exemplo n.º 9
0
 def test_read_semicolon_no_space(self):
     # Levels separated by a bare ';' (no following space) are split.
     handle = StringIO('seq1\tbacteria;cyanobacteria')
     expected = {'seq1': ['bacteria', 'cyanobacteria']}
     observed = GreenGenesTaxonomy.read(handle).taxonomy
     self.assertEqual(expected, observed)
Exemplo n.º 10
0
 def test_read_hello_world(self):
     # Basic case: levels separated by '; ' (semicolon then space) are
     # split and the space is not kept in the level name.
     handle = StringIO('seq1\tbacteria; cyanobacteria')
     expected = {'seq1': ['bacteria', 'cyanobacteria']}
     observed = GreenGenesTaxonomy.read(handle).taxonomy
     self.assertEqual(expected, observed)
Exemplo n.º 11
0
    def run(self, **kwargs):
        '''Search, align and (optionally) taxonomically assign reads, then
        write the resulting OTU table(s).

        All options are passed as keyword arguments. Every key listed below
        is required unless marked optional; any other key raises an
        Exception.

        sequences: passed to self._search() as the read files
        otu_table: (optional) path to write the OTU table to
        archive_otu_table: (optional) path to write the archive OTU table to
        threads, evalue, min_orf_length, restrict_read_length,
        filter_minimum_protein, filter_minimum_nucleotide: stored on self
            for use by the other pipeline steps
        known_otu_tables: if truthy, parsed with KnownOtuTable
        assignment_method: the singlem assignment method
        output_jplace: if truthy, prefix used to name converted jplace files
        output_extras: if truthy, the extra columns are written to otu_table
        include_inserts: passed through to read extraction / windowing
        singlem_packages: passed to HmmDatabase
        window_size: accepted but not used by this method
        assign_taxonomy: whether to run taxonomic assignment
        known_sequence_taxonomy: if truthy, path of a GreenGenes format
            taxonomy file
        working_directory: directory to work in, or None to use a temporary
            directory
        force: whether an existing working_directory may be overwritten

        Returns None. Raises Exception on unexpected arguments, when
        working_directory exists and force is false, or when jplace output
        is requested for a readset but taxonomy was not assigned.
        '''
        forward_read_files = kwargs.pop('sequences')
        output_otu_table = kwargs.pop('otu_table', None)
        archive_otu_table = kwargs.pop('archive_otu_table', None)
        num_threads = kwargs.pop('threads')
        known_otu_tables = kwargs.pop('known_otu_tables')
        singlem_assignment_method = kwargs.pop('assignment_method')
        output_jplace = kwargs.pop('output_jplace')
        output_extras = kwargs.pop('output_extras')
        evalue = kwargs.pop('evalue')
        min_orf_length = kwargs.pop('min_orf_length')
        restrict_read_length = kwargs.pop('restrict_read_length')
        filter_minimum_protein = kwargs.pop('filter_minimum_protein')
        filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
        include_inserts = kwargs.pop('include_inserts')
        singlem_packages = kwargs.pop('singlem_packages')
        # Still a required key, but its value is not used here.
        kwargs.pop('window_size')
        assign_taxonomy = kwargs.pop('assign_taxonomy')
        known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')

        working_directory = kwargs.pop('working_directory')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        self._num_threads = num_threads
        self._evalue = evalue
        self._min_orf_length = min_orf_length
        self._restrict_read_length = restrict_read_length
        self._filter_minimum_protein = filter_minimum_protein
        self._filter_minimum_nucleotide = filter_minimum_nucleotide

        hmms = HmmDatabase(singlem_packages)
        # The "example best hit" method is run through the plain DIAMOND
        # method; the difference is in which result file is parsed below.
        if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
            graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
        else:
            graftm_assignment_method = singlem_assignment_method

        if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
            self._graftm_verbosity = '5'
        else:
            self._graftm_verbosity = '2'

        using_temporary_working_directory = working_directory is None
        if using_temporary_working_directory:
            shared_mem_directory = '/dev/shm'
            if os.path.exists(shared_mem_directory):
                logging.debug("Using shared memory as a base directory")
                tmp = tempdir.TempDir(basedir=shared_mem_directory)
                tempfiles_path = os.path.join(tmp.name, 'tempfiles')
                os.mkdir(tempfiles_path)
                os.environ['TEMP'] = tempfiles_path
            else:
                logging.debug("Shared memory directory not detected, using default temporary directory instead")
                tmp = tempdir.TempDir()
            working_directory = tmp.name
        else:
            if os.path.exists(working_directory):
                if force:
                    logging.info("Overwriting directory %s" % working_directory)
                    shutil.rmtree(working_directory)
                    os.mkdir(working_directory)
                else:
                    raise Exception("Working directory '%s' already exists, not continuing" % working_directory)
            else:
                os.mkdir(working_directory)
        logging.debug("Using working directory %s" % working_directory)
        self._working_directory = working_directory

        extracted_reads = None
        def return_cleanly():
            # Reads extracted_reads from the enclosing scope at call time, so
            # it sees the value assigned further down.
            if extracted_reads: extracted_reads.cleanup()
            if using_temporary_working_directory: tmp.dissolve()
            logging.info("Finished")

        #### Search
        self._singlem_package_database = hmms
        search_result = self._search(hmms, forward_read_files)
        sample_names = search_result.samples_with_hits()
        if len(sample_names) == 0:
            logging.info("No reads identified in any samples, stopping")
            return_cleanly()
            return
        logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                     % (len(sample_names), sample_names[0]))

        #### Alignment
        align_result = self._align(search_result)

        ### Extract reads that have already known taxonomy
        if known_otu_tables:
            logging.info("Parsing known taxonomy OTU tables")
            known_taxes = KnownOtuTable()
            known_taxes.parse_otu_tables(known_otu_tables)
            logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
        else:
            known_taxes = []
        if known_sequence_taxonomy:
            logging.debug("Parsing sequence-wise taxonomy..")
            # Close the taxonomy file as soon as it has been parsed.
            with open(known_sequence_taxonomy) as taxonomy_io:
                tax1 = GreenGenesTaxonomy.read(taxonomy_io).taxonomy
            known_sequence_tax = {}
            for seq_id, tax in tax1.items():
                known_sequence_tax[seq_id] = '; '.join(tax)
            logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file" % len(known_sequence_tax))

        ### Extract other reads which do not have known taxonomy
        extracted_reads = self._extract_relevant_reads(
            align_result, include_inserts, known_taxes)
        logging.info("Finished extracting aligned sequences")

        #### Taxonomic assignment
        # Stays None when assignment is skipped, so that later use can be
        # detected instead of failing on an unbound local.
        assignment_result = None
        if assign_taxonomy:
            logging.info("Running taxonomic assignment with graftm..")
            assignment_result = self._assign_taxonomy(
                extracted_reads, graftm_assignment_method)

        #### Process taxonomically assigned reads
        # get the sequences out for each of them
        otu_table_object = OtuTable()
        regular_output_fields = split('gene sample sequence num_hits coverage taxonomy')
        otu_table_object.fields = regular_output_fields + \
                                  split('read_names nucleotides_aligned taxonomy_by_known?')

        for sample_name, singlem_package, tmp_graft, known_sequences, unknown_sequences in extracted_reads:
            def add_info(infos, otu_table_object, known_tax):
                # Append one OTU table row per info, in the column order of
                # otu_table_object.fields.
                for info in infos:
                    to_print = [
                        singlem_package.graftm_package_basename(),
                        sample_name,
                        info.seq,
                        info.count,
                        info.coverage,
                        info.taxonomy,
                        info.names,
                        info.aligned_lengths,
                        known_tax]
                    otu_table_object.data.append(to_print)
            known_infos = self._seqs_to_counts_and_taxonomy(
                known_sequences,
                known_taxes,
                False,
                True)
            add_info(known_infos, otu_table_object, True)

            if tmp_graft: # if any sequences were aligned (not just already known)
                tmpbase = os.path.basename(tmp_graft.name[:-6])#remove .fasta

                if assign_taxonomy:
                    is_known_taxonomy = False
                    aligned_seqs = self._get_windowed_sequences(
                        assignment_result.prealigned_sequence_file(
                            sample_name, singlem_package, tmpbase),
                        assignment_result.nucleotide_hits_file(
                            sample_name, singlem_package, tmpbase),
                        singlem_package,
                        include_inserts)
                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.diamond_assignment_file(
                            sample_name, singlem_package, tmpbase)
                    else:
                        tax_file = assignment_result.read_tax_file(
                            sample_name, singlem_package, tmpbase)
                    logging.debug("Reading taxonomy from %s" % tax_file)

                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        taxonomies = DiamondResultParser(tax_file)
                        use_first = True
                    else:
                        if not os.path.isfile(tax_file):
                            logging.warning("Unable to find tax file for gene %s from sample %s "
                                            "(likely do to min length filtering), skipping" % (
                                                os.path.basename(singlem_package.base_directory()),
                                                sample_name))
                            taxonomies = {}
                        else:
                            taxonomies = TaxonomyFile(tax_file)
                        use_first = False

                else: # Taxonomy has not been assigned.
                    aligned_seqs = unknown_sequences
                    if known_sequence_taxonomy:
                        taxonomies = known_sequence_tax
                    else:
                        taxonomies = {}
                    use_first = False # irrelevant
                    is_known_taxonomy = True

                new_infos = list(self._seqs_to_counts_and_taxonomy(
                    aligned_seqs, taxonomies, use_first, False))
                add_info(new_infos, otu_table_object, is_known_taxonomy)

                if output_jplace:
                    if assignment_result is None:
                        # Previously this surfaced as an UnboundLocalError.
                        raise Exception(
                            "Cannot write jplace output because taxonomic "
                            "assignment was not run")
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    input_jplace_file = os.path.join(base_dir, "placements.jplace")
                    output_jplace_file = os.path.join(base_dir, "%s_%s_%s.jplace" % (
                        output_jplace, sample_name, singlem_package.graftm_package_basename()))
                    logging.debug("Converting jplace file %s to singlem jplace file %s" % (
                        input_jplace_file, output_jplace_file))
                    # Output is opened first, as before; both handles are
                    # now closed when the conversion finishes or fails.
                    with open(output_jplace_file, 'w') as output_jplace_io:
                        with open(input_jplace_file) as input_jplace_io:
                            self._write_jplace_from_infos(
                                input_jplace_io, new_infos, output_jplace_io)


        if output_otu_table:
            with open(output_otu_table, 'w') as f:
                if output_extras:
                    otu_table_object.write_to(f, otu_table_object.fields)
                else:
                    otu_table_object.write_to(f, regular_output_fields)
        if archive_otu_table:
            with open(archive_otu_table, 'w') as f:
                otu_table_object.archive(hmms.singlem_packages).write_to(f)
        return_cleanly()
                        required=True)

    args = parser.parse_args()
    if args.debug:
        loglevel = logging.DEBUG
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.INFO
    logging.basicConfig(level=loglevel,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    # Read in taxonomy
    logging.info("Reading taxonomy..")
    gg = GreenGenesTaxonomy.read(open(args.greengenes_taxonomy)).taxonomy
    logging.info("Read in %i taxonomies" % len(gg))

    # Read in sequence
    logging.info("Reading sequences..")
    duplicates = set()
    sequences = {}
    for name, seq, _ in SequenceIO()._readfq(open(args.sequences)):
        if name in sequences:
            logging.error("Duplicate sequence name %s" % name)
            duplicates.add(name)
        else:
            sequences[name] = seq
    logging.warn("Found %i duplicated IDs" % len(duplicates))
    for dup in duplicates:
        del sequences[dup]
Exemplo n.º 13
0
    def run_to_otu_table(self, **kwargs):
        '''Run the pipe: search, align and (optionally) taxonomically assign
        reads, returning the result as an OtuTable.

        All options are passed as keyword arguments. Every key listed below
        is required; any other key raises an Exception.

        sequences: passed to self._search() as the read files
        threads, evalue, min_orf_length, restrict_read_length,
        filter_minimum_protein, filter_minimum_nucleotide: stored on self
            for use by the other pipeline steps
        known_otu_tables: if truthy, parsed with KnownOtuTable
        assignment_method: the singlem assignment method; replaced by
            NO_ASSIGNMENT_METHOD when assign_taxonomy is false
        output_jplace: if truthy, prefix of the jplace file(s) to write
        include_inserts: passed through to read extraction
        singlem_packages: passed to HmmDatabase
        assign_taxonomy: whether to run taxonomic assignment
        known_sequence_taxonomy: if truthy, path of a GreenGenes format
            taxonomy file
        working_directory: directory to work in, or None to use a temporary
            directory
        force: whether an existing working_directory may be overwritten

        Returns the populated OtuTable, or None when no sample had any hits.
        Raises Exception on unexpected arguments, when working_directory
        exists and force is false, on an unrecognised assignment method, or
        when jplace output is requested for a readset but taxonomy was not
        assigned.
        '''
        forward_read_files = kwargs.pop('sequences')
        num_threads = kwargs.pop('threads')
        known_otu_tables = kwargs.pop('known_otu_tables')
        singlem_assignment_method = kwargs.pop('assignment_method')
        output_jplace = kwargs.pop('output_jplace')
        evalue = kwargs.pop('evalue')
        min_orf_length = kwargs.pop('min_orf_length')
        restrict_read_length = kwargs.pop('restrict_read_length')
        filter_minimum_protein = kwargs.pop('filter_minimum_protein')
        filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
        include_inserts = kwargs.pop('include_inserts')
        singlem_packages = kwargs.pop('singlem_packages')
        assign_taxonomy = kwargs.pop('assign_taxonomy')
        known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')

        working_directory = kwargs.pop('working_directory')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        self._num_threads = num_threads
        self._evalue = evalue
        self._min_orf_length = min_orf_length
        self._restrict_read_length = restrict_read_length
        self._filter_minimum_protein = filter_minimum_protein
        self._filter_minimum_nucleotide = filter_minimum_nucleotide

        hmms = HmmDatabase(singlem_packages)
        # The "example best hit" method is run through the plain DIAMOND
        # method; the difference is in which result file is parsed below.
        if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
            graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
        else:
            graftm_assignment_method = singlem_assignment_method

        if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
            self._graftm_verbosity = '5'
        else:
            self._graftm_verbosity = '2'

        # Must come after graftm_assignment_method has been derived above.
        if not assign_taxonomy:
            singlem_assignment_method = NO_ASSIGNMENT_METHOD

        using_temporary_working_directory = working_directory is None
        if using_temporary_working_directory:
            shared_mem_directory = '/dev/shm'
            if os.path.exists(shared_mem_directory):
                logging.debug("Using shared memory as a base directory")
                tmp = tempdir.TempDir(basedir=shared_mem_directory)
                tempfiles_path = os.path.join(tmp.name, 'tempfiles')
                os.mkdir(tempfiles_path)
                os.environ['TEMP'] = tempfiles_path
            else:
                logging.debug(
                    "Shared memory directory not detected, using default temporary directory instead"
                )
                tmp = tempdir.TempDir()
            working_directory = tmp.name
        else:
            if os.path.exists(working_directory):
                if force:
                    logging.info("Overwriting directory %s" %
                                 working_directory)
                    shutil.rmtree(working_directory)
                    os.mkdir(working_directory)
                else:
                    raise Exception(
                        "Working directory '%s' already exists, not continuing"
                        % working_directory)
            else:
                os.mkdir(working_directory)
        logging.debug("Using working directory %s" % working_directory)
        self._working_directory = working_directory

        def return_cleanly():
            if using_temporary_working_directory: tmp.dissolve()
            logging.info("Finished")

        #### Search
        self._singlem_package_database = hmms
        search_result = self._search(hmms, forward_read_files)
        sample_names = search_result.samples_with_hits()
        if len(sample_names) == 0:
            logging.info("No reads identified in any samples, stopping")
            return_cleanly()
            return None
        logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                     % (len(sample_names), sample_names[0]))

        #### Alignment
        align_result = self._align(search_result)

        ### Extract reads that have already known taxonomy
        if known_otu_tables:
            logging.info("Parsing known taxonomy OTU tables")
            known_taxes = KnownOtuTable()
            known_taxes.parse_otu_tables(known_otu_tables)
            logging.debug("Read in %i sequences with known taxonomy" %
                          len(known_taxes))
        else:
            known_taxes = []
        if known_sequence_taxonomy:
            logging.debug("Parsing sequence-wise taxonomy..")
            # Close the taxonomy file as soon as it has been parsed.
            with open(known_sequence_taxonomy) as taxonomy_io:
                tax1 = GreenGenesTaxonomy.read(taxonomy_io).taxonomy
            known_sequence_tax = {}
            for seq_id, tax in tax1.items():
                known_sequence_tax[seq_id] = '; '.join(tax)
            logging.info(
                "Read in %i taxonomies from the GreenGenes format taxonomy file"
                % len(known_sequence_tax))

        ### Extract other reads which do not have known taxonomy
        extracted_reads = self._extract_relevant_reads(align_result,
                                                       include_inserts,
                                                       known_taxes)
        logging.info("Finished extracting aligned sequences")

        #### Taxonomic assignment
        # Stays None when assignment is skipped, so that later use can be
        # detected instead of failing on an unbound local.
        assignment_result = None
        if assign_taxonomy:
            logging.info("Running taxonomic assignment with GraftM..")
            assignment_result = self._assign_taxonomy(
                extracted_reads, graftm_assignment_method)

        #### Process taxonomically assigned reads
        # get the sequences out for each of them
        otu_table_object = OtuTable()
        if singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
            # Cache of parsed taxtastic taxonomies, keyed by package base
            # directory, so each package's taxonomy is read only once.
            package_to_taxonomy_bihash = {}

        for readset in extracted_reads:
            sample_name = readset.sample_name
            singlem_package = readset.singlem_package
            known_sequences = readset.known_sequences

            def add_info(infos, otu_table_object, known_tax):
                # Append one OTU table row per info.
                for info in infos:
                    to_print = [
                        singlem_package.graftm_package_basename(), sample_name,
                        info.seq, info.count, info.coverage, info.taxonomy,
                        info.names, info.aligned_lengths, known_tax
                    ]
                    otu_table_object.data.append(to_print)

            # NOTE(review): this passes the taxonomy file path
            # (known_sequence_taxonomy) in the position where the call below
            # passes the parsed `taxonomies` -- confirm this is intended.
            known_infos = self._seqs_to_counts_and_taxonomy(
                known_sequences, NO_ASSIGNMENT_METHOD, known_taxes,
                known_sequence_taxonomy, None)
            add_info(known_infos, otu_table_object, True)

            if len(
                    readset.unknown_sequences
            ) > 0:  # if any sequences were aligned (not just already known)
                tmpbase = readset.tmpfile_basename

                if assign_taxonomy:
                    is_known_taxonomy = False
                    aligned_seqs = list(
                        itertools.chain(readset.unknown_sequences,
                                        readset.known_sequences))

                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.diamond_assignment_file(
                            sample_name, singlem_package, tmpbase)
                        taxonomies = DiamondResultParser(tax_file)
                    elif singlem_assignment_method == DIAMOND_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.read_tax_file(
                            sample_name, singlem_package, tmpbase)
                        if not os.path.isfile(tax_file):
                            logging.warning(
                                "Unable to find tax file for gene %s from sample %s "
                                "(likely do to min length filtering), skipping"
                                % (os.path.basename(
                                    singlem_package.base_directory()),
                                   sample_name))
                            taxonomies = {}
                        else:
                            taxonomies = TaxonomyFile(tax_file)

                    elif singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
                        bihash_key = singlem_package.base_directory()
                        if bihash_key in package_to_taxonomy_bihash:
                            taxonomy_bihash = package_to_taxonomy_bihash[
                                bihash_key]
                        else:
                            taxtastic_taxonomy = singlem_package.graftm_package(
                            ).taxtastic_taxonomy_path()
                            logging.debug(
                                "Reading taxtastic taxonomy from %s" %
                                taxtastic_taxonomy)
                            with open(taxtastic_taxonomy) as f:
                                taxonomy_bihash = TaxonomyBihash.parse_taxtastic_taxonomy(
                                    f)
                            package_to_taxonomy_bihash[
                                bihash_key] = taxonomy_bihash
                        base_dir = assignment_result._base_dir(
                            sample_name, singlem_package, tmpbase)
                        jplace_file = os.path.join(base_dir,
                                                   "placements.jplace")
                        logging.debug(
                            "Attempting to read jplace output from %s" %
                            jplace_file)
                        if os.path.exists(jplace_file):
                            with open(jplace_file) as f:
                                jplace_json = json.loads(f.read())
                            placement_parser = PlacementParser(
                                jplace_json, taxonomy_bihash, 0.5)
                        else:
                            # Sometimes alignments are filtered out.
                            placement_parser = None
                        taxonomies = {}
                    elif singlem_assignment_method == NO_ASSIGNMENT_METHOD:
                        taxonomies = {}
                    else:
                        raise Exception("Programming error")

                else:  # Taxonomy has not been assigned.
                    aligned_seqs = readset.unknown_sequences
                    if known_sequence_taxonomy:
                        taxonomies = known_sequence_tax
                    else:
                        taxonomies = {}
                    is_known_taxonomy = True

                # placement_parser is only bound in the pplacer branch above,
                # hence the conditional on the assignment method.
                new_infos = list(
                    self._seqs_to_counts_and_taxonomy(
                        aligned_seqs, singlem_assignment_method,
                        known_sequence_tax if known_sequence_taxonomy else {},
                        taxonomies,
                        placement_parser if singlem_assignment_method
                        == PPLACER_ASSIGNMENT_METHOD else None))
                add_info(new_infos, otu_table_object, is_known_taxonomy)

                if output_jplace:
                    if assignment_result is None:
                        # Previously this surfaced as an UnboundLocalError.
                        raise Exception(
                            "Cannot write jplace output because taxonomic "
                            "assignment was not run")
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    input_jplace_file = os.path.join(base_dir,
                                                     "placements.jplace")
                    output_jplace_file = "%s_%s_%s.jplace" % (
                        output_jplace, sample_name,
                        singlem_package.graftm_package_basename())
                    logging.info("Writing jplace file '%s'" %
                                 output_jplace_file)
                    logging.debug(
                        "Converting jplace file %s to singlem jplace file %s" %
                        (input_jplace_file, output_jplace_file))
                    # Output is opened first, as before; both handles are
                    # now closed when the conversion finishes or fails.
                    with open(output_jplace_file, 'w') as output_jplace_io:
                        with open(input_jplace_file) as input_jplace_io:
                            self._write_jplace_from_infos(input_jplace_io,
                                                          new_infos,
                                                          output_jplace_io)
        return_cleanly()
        return otu_table_object