示例#1
0
def scaffold_hybrid_tree_foundation_tree(
    otu_map: OtuMapFormat,
    extension_taxonomy: TSVTaxonomyFormat,
    extension_sequences: DNAFASTAFormat,
    foundation_tree: NewickFormat,
    foundation_taxonomy: TSVTaxonomyFormat,
    graft_level: str = _ghost_tree_defaults['graft_level'],
) -> NewickFormat:
    """Build a hybrid tree by grafting extensions onto a foundation tree.

    Opens every input format, hands the open handles to
    ``extensions_onto_foundation`` and writes the first element of its
    result to a Newick file that is returned wrapped in ``NewickFormat``.

    Parameters
    ----------
    otu_map, extension_taxonomy, extension_sequences, foundation_tree
        Input formats; each is opened and its handle is passed on.
    foundation_taxonomy
        Optional input. When falsy, ``None`` is passed on instead of a
        handle.
    graft_level
        Forwarded unchanged to ``extensions_onto_foundation``.

    Returns
    -------
    NewickFormat
        Read-mode format object pointing at the written tree file.
    """
    # Function-scope import: the file's top-level import block is not
    # visible from here, so the new dependency is brought in locally.
    import contextlib

    # The ExitStack closes every input handle on the way out, including
    # when the scaffolding call raises; previously they were never closed.
    with contextlib.ExitStack() as handles:
        otu_map_fh = handles.enter_context(otu_map.open())
        extension_taxonomy_fh = handles.enter_context(
            extension_taxonomy.open())
        extension_sequences_fh = handles.enter_context(
            extension_sequences.open())
        foundation_tree_fh = handles.enter_context(foundation_tree.open())
        if foundation_taxonomy:
            foundation_taxonomy_fh = handles.enter_context(
                foundation_taxonomy.open())
        else:
            foundation_taxonomy_fh = None

        with tempfile.TemporaryDirectory() as tmp:
            # need ghost_tree.nwk here otherwise file exists
            gt_path = os.path.join(tmp, 'ghost_tree')
            thetree = extensions_onto_foundation(
                otu_map_fh, extension_taxonomy_fh, extension_sequences_fh,
                foundation_tree_fh, gt_path, graft_level,
                foundation_taxonomy_fh)[0]

            # NOTE(review): `tmp + 'ghost_tree'` has no path separator, so
            # the file is created *next to* the temporary directory rather
            # than inside it. That is why it still exists after the
            # directory is removed on return, but it also means nothing in
            # this function deletes it. Kept as-is to preserve behaviour;
            # confirm who is expected to clean it up.
            with open(tmp + 'ghost_tree', 'w') as gt_temp_file:
                gt_temp_file.write(thetree)

            return NewickFormat(tmp + 'ghost_tree', 'r')
示例#2
0
    def test_tsv_taxonomy_format_validate_positive(self):
        """Each listed taxonomy fixture must validate without raising."""
        valid_names = ('2-column.tsv', '3-column.tsv', 'valid-but-messy.tsv',
                       'many-rows.tsv')

        # Resolve all fixture paths before validating any of them.
        resolved = [
            self.get_data_path(os.path.join('taxonomy', name))
            for name in valid_names
        ]

        for path in resolved:
            TSVTaxonomyFormat(path, mode='r').validate()
示例#3
0
    def test_tsv_taxonomy_format_validate_positive(self):
        """Validate each well-formed taxonomy fixture; none may raise."""
        fixture_names = ('2-column.tsv', '3-column.tsv',
                         'valid-but-messy.tsv', 'many-rows.tsv')

        # Paths are looked up for every fixture first, then validated.
        fixture_paths = [
            self.get_data_path(os.path.join('taxonomy', fixture))
            for fixture in fixture_names
        ]

        for fixture_path in fixture_paths:
            taxonomy_format = TSVTaxonomyFormat(fixture_path, mode='r')
            taxonomy_format.validate()
示例#4
0
    def test_tsv_taxonomy_format_validate_negative(self):
        """Each malformed fixture must fail validation.

        The raised ``ValidationError`` message has to match
        ``'TSVTaxonomy'``.
        """
        invalid_names = ('empty', 'blanks', '1-column.tsv',
                         'headerless.tsv', 'header-only.tsv', 'jagged.tsv')

        # Resolve all fixture paths before validating any of them.
        resolved = [
            self.get_data_path(os.path.join('taxonomy', name))
            for name in invalid_names
        ]

        for path in resolved:
            # Construct outside the assertion so only validate() is checked.
            candidate = TSVTaxonomyFormat(path, mode='r')
            with self.assertRaisesRegex(ValidationError, 'TSVTaxonomy'):
                candidate.validate()
示例#5
0
    def test_tsv_taxonomy_format_validate_negative(self):
        """Each malformed fixture must fail validation.

        The raised ``ValidationError`` message has to match
        ``'TSVTaxonomy'``.
        """
        broken_names = ('empty', 'blanks-and-comments', '1-column.tsv',
                        'headerless.tsv', 'header-only.tsv', 'jagged.tsv')

        # Paths are looked up for every fixture first, then validated.
        broken_paths = [
            self.get_data_path(os.path.join('taxonomy', broken))
            for broken in broken_names
        ]

        for broken_path in broken_paths:
            # Construct outside the assertion so only validate() is checked.
            taxonomy_format = TSVTaxonomyFormat(broken_path, mode='r')
            with self.assertRaisesRegex(ValidationError, 'TSVTaxonomy'):
                taxonomy_format.validate()
示例#6
0
    def test_tsv_taxonomy_format_column_header_lengths(self):
        """Both length-mismatch fixtures must fail validation.

        The error message has to match the pattern
        ``'line 2.*3 values.*expected 2'``.
        """
        mismatch_names = ('greater-column-length.tsv',
                          'greater-header-length.tsv')
        expected_message = 'line 2.*3 values.*expected 2'

        # Resolve all fixture paths before validating any of them.
        resolved = [
            self.get_data_path(os.path.join('taxonomy', name))
            for name in mismatch_names
        ]

        for path in resolved:
            # Construct outside the assertion so only validate() is checked.
            candidate = TSVTaxonomyFormat(path, mode='r')
            with self.assertRaisesRegex(ValidationError, expected_message):
                candidate.validate()
示例#7
0
def _0(ff: TSVTaxonomyFormat) -> skbio.TreeNode:
    """Turn a TSV taxonomy file into a tree of taxa with ids as tips.

    The first row is skipped as a header. For every following row the
    first column is the id and the second is a ``;``-separated lineage;
    further columns are ignored. Each lineage element becomes a child of
    the previous one (reusing an existing child with the same name), and
    the id is appended beneath the last element. Lineage elements are
    compared exactly as split, with no whitespace stripping.
    """
    tree = skbio.TreeNode('root', length=0)

    with ff.open() as handle:
        rows = csv.reader(handle, delimiter='\t')
        next(rows)  # discard the header row

        for record in rows:
            feature_id, lineage = record[:2]

            # Walk down from the root, creating missing levels on the way.
            cursor = tree
            for rank_name in lineage.split(';'):
                existing = next(
                    (kid for kid in cursor.children
                     if kid.name == rank_name),
                    None,
                )
                if existing is None:
                    existing = skbio.TreeNode(rank_name, length=1)
                    cursor.append(existing)
                cursor = existing

            cursor.append(skbio.TreeNode(feature_id, length=1))

    return tree
    def setUp(self):
        """Load the taxonomy and replacement fixtures shared by the tests.

        Stores on ``self``: the taxonomy fixture viewed as a ``pd.Series``,
        the ``'replacements'`` column of three metadata fixtures, and four
        dictionaries of taxonomy strings keyed by feature id (presumably
        the expected results of the edits under test -- confirm against
        the test methods).
        """
        super().setUp()

        # setup taxonomy to be edited
        tax_fp = self.get_data_path('escherichia_shigella_taxonomy.txt')
        self.taxonomy = TSVTaxonomyFormat(tax_fp, mode='r').view(pd.Series)

        # setup full string replacement
        replc = self.get_data_path('taxonomy-replacement-full-strings.txt')
        md_replc = Metadata.load(replc)
        self.md_replc_col = md_replc.get_column('replacements')

        # setup substring replacement
        ss_replc = self.get_data_path('taxonomy-replacement-pass.txt')
        md_ss_replc = Metadata.load(ss_replc)
        self.md_ss_replc_col = md_ss_replc.get_column('replacements')

        # setup substring regex replacement
        ssr_replc = self.get_data_path('taxonomy-replacement-regex.txt')
        md_ssr_replc = Metadata.load(ssr_replc)
        self.md_ssr_replc_col = md_ssr_replc.get_column('replacements')

        # setup reusable dicts
        # exp_dict_00: every entry carries the domain
        # 'd__SUPER_DUPER_BACTERIA'.
        self.exp_dict_00 = {
            'Sal01': ('d__SUPER_DUPER_BACTERIA; '
                      'p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__SUPER_DUPER_BACTERIA; '
                      'p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__SUPER_DUPER_BACTERIA; '
                          'p__Proteobacteria; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__uncultured_Salmonella'),
            'Esch01': ('d__SUPER_DUPER_BACTERIA; '
                       'p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__'),
            'Shig01': ('d__SUPER_DUPER_BACTERIA; '
                       'p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__')
        }

        # exp_dict_01: every entry carries the phylum 'p__LAME-PYHLA', and
        # 'UncultSal' ends in 's__UNCIVILIZED_Salmonella'.
        self.exp_dict_01 = {
            'Sal01': ('d__Bacteria; p__LAME-PYHLA; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__Bacteria; p__LAME-PYHLA; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__Bacteria; '
                          'p__LAME-PYHLA; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__UNCIVILIZED_Salmonella'),
            'Esch01': ('d__Bacteria; p__LAME-PYHLA; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__'),
            'Shig01': ('d__Bacteria; p__LAME-PYHLA; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; s__')
        }
        # exp_dict_02: 'Esch01' and 'Shig01' end in
        # 's__unknown_Escherichia-Shigella' instead of a bare 's__'.
        self.exp_dict_02 = {
            'Sal01': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__Bacteria; '
                          'p__Proteobacteria; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__uncultured_Salmonella'),
            'Esch01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella'),
            'Shig01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella')
        }
        # exp_dict_03: same as exp_dict_02 except that 'UncultSal' ends in
        # 's__UNCIVIL_Salmonella'.
        self.exp_dict_03 = {
            'Sal01': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'Sal02': ('d__Bacteria; p__Proteobacteria; '
                      'c__Gammaproteobacteria; '
                      'o__Enterobacterales; '
                      'f__Enterobacteriaceae; '
                      'g__Escherichia-Shigella; '
                      's__Salmonella_enterica'),
            'UncultSal': ('d__Bacteria; '
                          'p__Proteobacteria; '
                          'c__Gammaproteobacteria; '
                          'o__Enterobacterales; '
                          'f__Enterobacteriaceae; '
                          'g__Escherichia-Shigella; '
                          's__UNCIVIL_Salmonella'),
            'Esch01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella'),
            'Shig01': ('d__Bacteria; p__Proteobacteria; '
                       'c__Gammaproteobacteria; '
                       'o__Enterobacterales; '
                       'f__Enterobacteriaceae; '
                       'g__Escherichia-Shigella; '
                       's__unknown_Escherichia-Shigella')
        }