示例#1
0
    def test_sequence_to_fastq_kwargs_passed(self):
        """Check that writer keyword arguments reach the FASTQ writer.

        Every fixture in ``self.valid_files`` is built with each sequence
        class, written as FASTQ into an in-memory buffer, and the buffer
        content is compared with the content of the expected file.
        """
        for seq_class in (Sequence, DNA, RNA, Protein):
            for records, expectations in self.valid_files:
                for writer_kwargs, expected_path in expectations:
                    # TODO: some of the test files contain characters that
                    # are invalid for RNA, so RNA objects are built without
                    # validation for now. The fixtures need fixing.
                    ctor_kwargs = (
                        {'validate': False} if seq_class is RNA else {})

                    # Same ``lowercase`` value on the writer side and on the
                    # constructor side; note that the key is stored on the
                    # fixture's own kwargs dict.
                    writer_kwargs['lowercase'] = 'introns'
                    ctor_kwargs['lowercase'] = 'introns'

                    with io.StringIO() as buffer:
                        for record in records:
                            seq = seq_class(
                                record[2],
                                metadata={'id': record[0],
                                          'description': record[1]},
                                positional_metadata={'quality': record[3]},
                                **ctor_kwargs)
                            write(seq, into=buffer, format='fastq',
                                  **writer_kwargs)
                        observed = buffer.getvalue()

                    with io.open(expected_path) as expected_file:
                        expected = expected_file.read()

                    self.assertEqual(observed, expected)
示例#2
0
def _filter_sequence_ids(in_fp, out_fp, ids, negate=False):
    '''Filter the sequences of a FASTA file by their IDs.

    Parameters
    ----------
    in_fp : str
        Path of the input FASTA file.
    out_fp : str
        Path of the output FASTA file. It is opened for writing, so any
        existing content is replaced.
    ids : container of str
        Each sequence ID is tested against this container with ``in``.
    negate : bool
        If ``False`` (the default), sequences whose ID is in ``ids`` are
        filtered away. If ``True``, the selection is inverted and only the
        sequences whose ID is in ``ids`` are kept.
    '''
    keep_listed = bool(negate)
    with open(out_fp, 'w') as out:
        for seq in read(in_fp, format='fasta', constructor=Sequence):
            # A record is written when its membership in ``ids`` matches
            # the requested polarity.
            if (seq.metadata['id'] in ids) == keep_listed:
                write(seq, format='fasta', into=out)
示例#3
0
def create_faa(seqs, out, genetic_code=11):
    '''Write the translation of every "CDS" interval feature as FASTA.

    For each sequence, the interval features whose ``type`` metadata is
    "CDS" are queried. The bounds of a feature are sliced out of the
    sequence and concatenated, reverse-complemented when the feature's
    strand is '-', translated, and written to ``out``.

    Parameters
    ----------
    seqs : iterable of ``Sequence``
        The DNA/RNA sequences carrying the interval metadata.
    out : file object
        File object the protein records are written into.
    genetic_code : int
        Translation table used when a feature has no ``transl_table``
        metadata of its own.
    '''
    for seq in seqs:
        cds_features = seq.interval_metadata.query(metadata={'type': 'CDS'})
        for feature in cds_features:
            info = feature.metadata
            segments = [seq[lo:hi] for lo, hi in feature.bounds]
            nucleotides = DNA.concat(segments)
            on_minus_strand = info.get('strand', '.') == '-'
            if on_minus_strand:
                nucleotides = nucleotides.reverse_complement()
            try:
                # the feature's own translation table wins; the function
                # argument is only the fallback
                table = info.get('transl_table', genetic_code)
                protein = nucleotides.translate(table)
                protein.metadata['description'] = info.get('product', '')
                # a CDS feature is required to carry an 'ID' key
                protein.metadata['id'] = info['ID']
                write(protein, into=out, format='fasta')
            except NotImplementedError:
                logger.warning(
                    'This gene has degenerate nucleotide and will not be translated.'
                )
示例#4
0
 def test_annotate(self):
     """Run ``annotate`` on a one-record FASTA file with a custom config.

     The config is dumped to ``config.yaml`` inside ``self.tmpd`` and the
     test asserts that the ``.fna`` and ``.gff3`` outputs exist afterwards.
     """
     minced_rule = {
         'params': '',
         'priority': 50,
         'output': 'minced',
         'threads': 1,
     }
     prodigal_rule = {
         'params': '-p meta -f gff',
         'priority': 90,
         'output': 'prodigal',
         'threads': 1,
     }
     config = {
         'structural_annotation': {
             'minced': minced_rule,
             'prodigal': prodigal_rule,
         },
         'protein': {},
         'bacteria': {},
         'general': {'metadata': 'foo.sqlite'},
     }

     config_fp = join(self.tmpd, 'config.yaml')
     with open(config_fp, 'w') as handle:
         yaml.dump(config, handle, default_flow_style=True)

     record = DNA('ATGC', {'id': 'seq1'})
     write(record, into=self.i, format='fasta')

     annotate(self.i, 'fasta', 1, self.tmpd, 'gff3', 11, 'bacteria',
              'metagenome', (), 1, True, False, True, config_fp)

     # outputs share the input file name without its extension
     output = join(self.tmpd, splitext(self.i)[0])
     for extension in ('.fna', '.gff3'):
         self.assertTrue(exists(output + extension))
示例#5
0
    def test_sequence_to_fastq_kwargs_passed(self):
        """Check that writer keyword arguments reach the FASTQ writer.

        Every fixture in ``self.valid_files`` is built with each sequence
        class, written as FASTQ into an in-memory buffer, and the buffer
        content is compared with the content of the expected file.
        """
        for seq_class in (Sequence, DNA, RNA, Protein):
            for records, expectations in self.valid_files:
                for writer_kwargs, expected_path in expectations:
                    # TODO: some of the test files contain characters that
                    # are invalid for RNA, so RNA objects are built without
                    # validation for now. The fixtures need fixing.
                    ctor_kwargs = (
                        {'validate': False} if seq_class is RNA else {})

                    # Same ``lowercase`` value on the writer side and on the
                    # constructor side; note that the key is stored on the
                    # fixture's own kwargs dict.
                    writer_kwargs['lowercase'] = 'introns'
                    ctor_kwargs['lowercase'] = 'introns'

                    with io.StringIO() as buffer:
                        for record in records:
                            seq = seq_class(
                                record[2],
                                metadata={'id': record[0],
                                          'description': record[1]},
                                positional_metadata={'quality': record[3]},
                                **ctor_kwargs)
                            write(seq, into=buffer, format='fastq',
                                  **writer_kwargs)
                        observed = buffer.getvalue()

                    with io.open(expected_path) as expected_file:
                        expected = expected_file.read()

                    self.assertEqual(observed, expected)
示例#6
0
文件: cmscan2.py 项目: RNAer/dumpling
def scan_seq(seq, db, cpu=1, params=None):
    '''Run ``CMScan`` on one sequence against a database.

    The sequence is written as FASTA into a named temporary file, and the
    application is called with the database and that file as input paths.

    Parameters
    ----------
    seq : object writable as FASTA
        The query sequence.
    db : str
        The database passed as first input path.
    cpu : int
        Value stored under the ``--cpu`` parameter.
    params : dict or None
        Extra application parameters. The dict is copied, so the caller's
        object is never modified.

    Returns
    -------
    Whatever the ``CMScan`` application call returns.
    '''
    # Copy before injecting ``--cpu`` so a dict reused by the caller does
    # not silently pick up (or keep) a cpu setting from this call.
    params = dict(params) if params is not None else {}
    params['--cpu'] = cpu
    app = CMScan(InputHandler='_input_as_paths', params=params)
    with NamedTemporaryFile(mode='w+') as i:
        # written by name; the temp file is removed when the block exits
        write(seq, into=i.name, format='fasta')
        return app([db, i.name])
示例#7
0
    def test_filter_partial_genes(self):
        """Only the complete gene of 'seq2' is expected to survive.

        'seq1' carries a single feature flagged partial ('01'); 'seq2'
        carries one partial ('10') and one complete ('00') feature. After
        filtering, the output is expected to hold exactly one record:
        'seq2' with the complete feature.
        """
        in_fp = join(self.tmpd, 'in.gff')
        out_fp = join(self.tmpd, 'out.gff')
        imd1 = IntervalMetadata(None)
        imd1.add(
            [(0, 100)],
            metadata={
                'partial': '01',
                'phase': 0,
                'source': 'Prodigal_v2.6.3',
                'strand': '.',
                'type': '.',
                'score': '.'
            })
        imd2 = IntervalMetadata(None)
        imd2.add(
            [(200, 300)],
            metadata={
                'partial': '10',
                'phase': 1,
                'source': 'Prodigal_v2.6.3',
                'strand': '-',
                'type': 'CDS',
                'score': '1'
            })
        imd2.add(
            [(2000, 3000)],
            metadata={
                'partial': '00',
                'phase': 1,
                'source': 'Prodigal_v2.6.3',
                'strand': '.',
                'type': '.',
                'score': '.'
            })

        # expected content for 'seq2' once its partial gene is removed
        imd3 = IntervalMetadata(None)
        imd3.add(
            [(2000, 3000)],
            metadata={
                'partial': '00',
                'phase': 1,
                'source': 'Prodigal_v2.6.3',
                'strand': '.',
                'type': '.',
                'score': '.'
            })

        data = (('seq1', imd1), ('seq2', imd2))
        write(((sid, imd) for sid, imd in data), into=in_fp, format='gff3')
        filter_partial_genes(in_fp, out_fp)
        # Compare the whole result: pairing with zip() stops at the shorter
        # side, so an empty or over-long output would pass unnoticed.
        obs = list(read(out_fp, format='gff3'))
        self.assertEqual(obs, [('seq2', imd3)])
示例#8
0
def filter_alignment_positions(aligned_sequences_file: AlignedDNAFASTAFormat,
                               maximum_gap_frequency: str,
                               maximum_position_entropy: str) -> \
        AlignedDNAFASTAFormat:
    """Filter alignment positions and write the result as FASTA.

    Parameters
    ----------
    aligned_sequences_file : AlignedDNAFASTAFormat
        The input alignment; it is opened for reading here.
    maximum_gap_frequency : str
        Forwarded unchanged to ``filter_positions``.
    maximum_position_entropy : str
        Forwarded unchanged to ``filter_positions``.

    Returns
    -------
    AlignedDNAFASTAFormat
        A new format object holding the output of ``filter_positions``.
    """
    fasta_file = AlignedDNAFASTAFormat()

    # The write call drains the filtered sequences before it returns, so
    # the input handle can be closed as soon as the block exits instead of
    # being left open for the garbage collector.
    with aligned_sequences_file.open() as aligned_sequences_fh:
        skbio.write(filter_positions(aligned_sequences_fh,
                                     maximum_gap_frequency,
                                     maximum_position_entropy),
                    into=str(fasta_file),
                    format='fasta')

    return fasta_file
示例#9
0
def convert(in_f, in_fmt, out_f, out_fmt):
    '''convert between file formats

    Every object read from ``in_f`` is written to ``out_f``, in order.

    Parameters
    ----------
    in_fmt : str
        input file format
    out_fmt : str
        output file format
    in_f : str
        input file path
    out_f: str
        output file path. An already opened, writable file object is
        accepted as well and is written into as is.
    '''
    records = read(in_f, format=in_fmt)
    if isinstance(out_f, str):
        # Open the path once. Handing the path itself to every write()
        # call reopens it for writing each time, so only the last object
        # of a multi-record input would remain in the file.
        with open(out_f, 'w') as out:
            for obj in records:
                write(obj, format=out_fmt, into=out)
    else:
        for obj in records:
            write(obj, format=out_fmt, into=out_f)
示例#10
0
def extract_fungi(
        aligned_silva_file: AlignedDNAFASTAFormat,
        accession_file: SilvaAccessionFormat,
        taxonomy_file: SilvaTaxonomyFormat,
        ) -> AlignedDNAFASTAFormat:
    """Write the output of ``fungi_from_fasta`` as an aligned FASTA file.

    Parameters
    ----------
    aligned_silva_file : AlignedDNAFASTAFormat
        Aligned input sequences.
    accession_file : SilvaAccessionFormat
        Accession input.
    taxonomy_file : SilvaTaxonomyFormat
        Taxonomy input.

    Returns
    -------
    AlignedDNAFASTAFormat
        A new format object holding the written sequences.
    """
    fasta_file = AlignedDNAFASTAFormat()

    # All three inputs are closed when the block exits; the write call
    # drains the sequences produced from them before it returns.
    with aligned_silva_file.open() as aligned_silva_fh, \
            accession_file.open() as accession_fh, \
            taxonomy_file.open() as taxonomy_fh:
        skbio.write(fungi_from_fasta(aligned_silva_fh, accession_fh,
                                     taxonomy_fh),
                    into=str(fasta_file), format='fasta')

    # TODO this code is a good example of pithy return for plugins
    # TODO redo other functions in the same way (the original note was
    # left unfinished at this point)

    return fasta_file
示例#11
0
    def test_sequence_to_fastq_kwargs_passed(self):
        """Check that writer keyword arguments reach the FASTQ writer.

        Each fixture of ``self.valid_files`` is built with every sequence
        class, written as FASTQ into an in-memory buffer, and compared
        with the content of the expected file.
        """
        sequence_classes = (BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence)
        for seq_class in sequence_classes:
            for records, expectations in self.valid_files:
                for writer_kwargs, expected_path in expectations:
                    buffer = StringIO()
                    for record in records:
                        # record layout used here: id, description,
                        # sequence, quality
                        seq = seq_class(record[2],
                                        id=record[0],
                                        description=record[1],
                                        quality=record[3])
                        write(seq, into=buffer, format='fastq',
                              **writer_kwargs)
                    observed = buffer.getvalue()
                    buffer.close()

                    with open(expected_path, 'U') as expected_file:
                        expected = expected_file.read()

                    self.assertEqual(observed, expected)
示例#12
0
    def setUp(self):
        """Create the temporary directories and the ``test1.fna`` input.

        ``test1.fna`` is a FASTA re-serialisation of the
        'Swiss-Prot_Bacteria.fna' fixture.
        """
        self.test_dir = abspath(
            join('micronota', 'db', 'tests', 'data', 'uniref', 'uniref100'))
        # fixture names are <database>_<kingdom>.fna, databases outermost
        databases = ('Swiss-Prot', 'TrEMBL')
        kingdoms = ('Archaea', 'Bacteria', 'Eukaryota', 'Viruses')
        files = [
            join(self.test_dir, '%s_%s.fna' % (database, kingdom))
            for database in databases
            for kingdom in kingdoms
        ]

        self.tmp = mkdtemp()
        self.test1 = join(self.tmp, 'test1.fna')
        self.test1_exp = 'test1.genbank'
        with open(self.test1, 'w') as handle:
            for record in read(files[1], format='fasta'):
                write(record, format='fasta', into=handle)

        self.obs_tmp = mkdtemp()
示例#13
0
    def setUp(self):
        """Create the temporary directories and the ``test1.fna`` input.

        ``test1.fna`` is a FASTA re-serialisation of the
        'Swiss-Prot_Bacteria.fna' fixture.
        """
        self.test_dir = abspath(
            join('micronota', 'db', 'tests', 'data', 'uniref', 'uniref100'))
        # fixture names are <database>_<kingdom>.fna, databases outermost
        databases = ('Swiss-Prot', 'TrEMBL')
        kingdoms = ('Archaea', 'Bacteria', 'Eukaryota', 'Viruses')
        files = [
            join(self.test_dir, '%s_%s.fna' % (database, kingdom))
            for database in databases
            for kingdom in kingdoms
        ]

        self.tmp = mkdtemp()
        self.test1 = join(self.tmp, 'test1.fna')
        self.test1_exp = 'test1.genbank'
        with open(self.test1, 'w') as handle:
            for record in read(files[1], format='fasta'):
                write(record, format='fasta', into=handle)

        self.obs_tmp = mkdtemp()
示例#14
0
    def test_sequence_to_fastq_kwargs_passed(self):
        """Check that writer keyword arguments reach the FASTQ writer.

        ``Sequence`` is used as is; ``DNA``, ``RNA`` and ``Protein`` are
        wrapped so that they are built with ``validate=False``.
        """
        constructors = [Sequence]
        constructors.extend(
            partial(grammared, validate=False)
            for grammared in (DNA, RNA, Protein))

        for build in constructors:
            for records, expectations in self.valid_files:
                for writer_kwargs, expected_path in expectations:
                    buffer = StringIO()
                    for record in records:
                        seq = build(
                            record[2],
                            metadata={'id': record[0],
                                      'description': record[1]},
                            positional_metadata={'quality': record[3]})
                        write(seq, into=buffer, format='fastq',
                              **writer_kwargs)
                    observed = buffer.getvalue()
                    buffer.close()

                    with open(expected_path, 'U') as expected_file:
                        expected = expected_file.read()

                    self.assertEqual(observed, expected)
示例#15
0
    def test_sequence_to_fastq_kwargs_passed(self):
        """Check that writer keyword arguments reach the FASTQ writer.

        Each fixture of ``self.valid_files`` is built with every sequence
        class, written as FASTQ into an in-memory buffer, and compared
        with the content of the expected file.
        """
        sequence_classes = (BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence)
        for seq_class in sequence_classes:
            for records, expectations in self.valid_files:
                for writer_kwargs, expected_path in expectations:
                    buffer = StringIO()
                    for record in records:
                        # record layout used here: id, description,
                        # sequence, quality
                        seq = seq_class(record[2], id=record[0],
                                        description=record[1],
                                        quality=record[3])
                        write(seq, into=buffer, format='fastq',
                              **writer_kwargs)
                    observed = buffer.getvalue()
                    buffer.close()

                    with open(expected_path, 'U') as expected_file:
                        expected = expected_file.read()

                    self.assertEqual(observed, expected)
示例#16
0
    def test_filter_sequence_ids(self):
        """Filter a two-record FASTA file with several ID collections."""
        seqs = [
            Sequence('A', {
                'id': 'seq1',
                'description': ''
            }),
            Sequence('T', {
                'id': 'seq2',
                'description': ''
            })
        ]

        ifile = join(self.tmpd, 'in.fna')
        # a generator expression is kept on purpose: it is the object
        # type the original call hands to write()
        write((i for i in seqs), into=ifile, format='fasta')
        ofile = join(self.tmpd, 'out.fna')

        # Real collections of IDs. The former ('foo') and ('seq2') were
        # plain strings (parentheses alone do not make a tuple), which
        # turned the membership test into a substring test.
        idss = [{'foo'}, {'seq1'}, {'seq2'}, {'seq1', 'seq2'}]
        exps = [seqs, seqs[1:], seqs[:-1], []]

        for ids, exp in zip(idss, exps):
            _filter_sequence_ids(ifile, ofile, ids)
            obs = list(read(ofile, constructor=Sequence, format='fasta'))
            self.assertEqual(obs, exp)
示例#17
0
def pick_otus(file_path):
    """Prepare a FASTA file for OTU picking.

    Only the preprocessing step is visible here, and it is switched off
    with ``if False``; with the switch off the function computes the
    output directory name and returns ``None``.

    Parameters
    ----------
    file_path : str
        Path of the input FASTA file.
    """
    # NOTE(review): ``outdir`` is not used by the code visible here.
    outdir = os.path.join(os.path.dirname(file_path), 'uclust')
    if False:  ## Making fasta format compatible with qiime (for some reason not working, assume user provides it)
        import skbio  ## Im using scikit-bio for fasta I/O (comes with qiime)
        from skbio.sequence import BiologicalSequence
        print("Preprocessing FASTA " + file_path)
        file_path1 = '%s_1%s' % tuple(os.path.splitext(file_path))
        # the reformatted copy is closed even if a record fails
        with open(file_path1, "w") as outfile:
            fastafile = skbio.read(file_path, format='fasta')
            print("Reading " + file_path)
            print("File handle: " + str(fastafile))
            for seqcount, rec in enumerate(fastafile):
                # the counter is an int and must be converted before it
                # can be joined to the record's repr
                print(str(seqcount) + rec.__repr__())
                try:
                    ## does the id adhere to qiime's expected format
                    ## <sample_id>_<seq_counter>?
                    int(rec.id.split('_')[1])
                except (ValueError, IndexError):
                    ## no: enforce an id format compatible with qiime's
                    ## otu picker. Both exceptions are listed in a tuple;
                    ## ``except ValueError, IndexError`` would catch
                    ## ValueError only.
                    rec1 = BiologicalSequence(rec.sequence,
                                              "User_%05d" % seqcount)
                    skbio.write(rec1, 'fasta', outfile)
                else:
                    ## yes: write down the record as is
                    skbio.write(rec, 'fasta', outfile)
        file_path = file_path1
示例#18
0
def convert(in_fmt, out_fmt, in_f, out_f):
    '''convert between file formats

    Every object read from ``in_f`` is written to ``out_f``, in order.

    Parameters
    ----------
    in_fmt : str
        input file format
    out_fmt : str
        output file format
    in_f : str
        input file path
    out_f : str
        output file path. An already opened, writable file object is
        accepted as well and is written into as is.
    '''
    records = read(in_f, format=in_fmt)
    if isinstance(out_f, str):
        # Open the path once. Handing the path itself to every write()
        # call reopens it for writing each time, so only the last object
        # of a multi-record input would remain in the file.
        with open(out_f, 'w') as out:
            for obj in records:
                write(obj, format=out_fmt, into=out)
    else:
        for obj in records:
            write(obj, format=out_fmt, into=out_f)
示例#19
0
def integrate(seq_fp,
              annot_dir,
              protein_xref,
              out_fp,
              quality=False,
              out_fmt='gff3'):
    '''Merge all parsed annotations into the sequences and write them out.

    Parameters
    ----------
    seq_fp : str
        Path of the FASTA file holding the sequences.
    annot_dir : str
        Annotation directory. Every ``*.ok`` file found in it names (by
        its base name) a sibling module whose ``Module`` class is asked to
        parse the results stored in this directory.
    protein_xref
        Handed to the diamond parser as its ``metadata`` argument.
    out_fp : str
        Path of the output file.
    quality : bool
        Accepted, but not referenced in the body of this function.
    out_fmt : str
        Output format, either 'genbank' or 'gff3'.

    Returns
    -------
    dict
        Maps each sequence ID to its sequence object, with the parsed
        interval metadata merged in.

    Raises
    ------
    ValueError
        If ``out_fmt`` is neither 'genbank' nor 'gff3'.
    '''
    logger.info('Integrate annotation for output')
    seqs = {
        record.metadata['id']: record
        for record in read(seq_fp, format='fasta')
    }

    finished = {
        splitext(name)[0]
        for name in os.listdir(annot_dir) if name.endswith('.ok')
    }

    # diamond is parsed apart from the other rules: its result is only
    # used to decorate the CDS features found by prodigal
    protein = {}
    if 'diamond' in finished:
        finished.discard('diamond')
        diamond_mod = import_module('.diamond', module.__name__)
        diamond = diamond_mod.Module(directory=annot_dir)
        diamond.parse(metadata=protein_xref)
        protein = diamond.result

    for rule in finished:
        logger.debug('parse the result from %s output' % rule)
        parser_mod = import_module('.%s' % rule, module.__name__)
        parser = parser_mod.Module(directory=annot_dir)
        parser.parse()
        for seq_id, imd in parser.result.items():
            target = seqs[seq_id]
            # the parsed interval metadata gets the length of the
            # sequence it belongs to as its upper bound before merging
            imd._upper_bound = len(target)
            if rule == 'prodigal':
                _add_cds_metadata(seq_id, imd, protein.get(seq_id, {}))
            target.interval_metadata.merge(imd)

    # write out the annotation
    if out_fmt == 'genbank':
        with open(out_fp, 'w') as out:
            for sid, seq in seqs.items():
                header = seq.metadata
                header['LOCUS'] = {
                    'locus_name': sid,
                    'size': len(seq),
                    'unit': 'bp',
                    'mol_type': 'DNA',
                    'shape': 'linear',
                    'division': None,
                    'date': strftime("%d-%b-%Y", gmtime())
                }
                header['ACCESSION'] = ''
                header['VERSION'] = ''
                header['KEYWORDS'] = '.'
                header['SOURCE'] = {
                    'ORGANISM': 'genus species',
                    'taxonomy': 'unknown'
                }
                header['COMMENT'] = 'Annotated with %s %s' % (
                    __package__, __version__)
                write(seq, into=out, format=out_fmt)
    elif out_fmt == 'gff3':
        features = (
            (sid, seq.interval_metadata) for sid, seq in seqs.items())
        write(features, into=out_fp, format=out_fmt)
    else:
        raise ValueError('Unknown specified output format: %r' % out_fmt)

    return seqs
示例#20
0
def extract_reads(sequences: DNASequencesDirectoryFormat, f_primer: str,
                  r_primer: str, trunc_len: int = 0, trim_left: int = 0,
                  identity: float = 0.8, min_length: int = 50,
                  max_length: int = 0, n_jobs: int = 1,
                  batch_size: int = 'auto') -> DNAFASTAFormat:
    """Extract the reads selected by a primer or primer pair.

    The input sequences are processed in batches; each sequence of a batch
    is handed to ``_gen_reads`` together with the primer and length
    settings, and every result that is not ``None`` is appended to the
    output FASTA file.

    Parameters
    ----------
    sequences : DNASequencesDirectoryFormat
        The query sequences.
    f_primer : str
        Forward primer sequence.
    r_primer : str
        Reverse primer sequence.
    trunc_len : int, optional
        Truncation length, forwarded to ``_gen_reads``. Also used for the
        up-front sanity check when positive.
    trim_left : int, optional
        Number of leading nucleotides to trim, forwarded to ``_gen_reads``.
    identity : float, optional
        Minimum primer match identity threshold, forwarded to
        ``_gen_reads``. Default: 0.8
    min_length : int, optional
        Minimum amplicon length, forwarded to ``_gen_reads``. Default: 50
    max_length : int, optional
        Maximum amplicon length, forwarded to ``_gen_reads``.
    n_jobs : int, optional
        Number of separate processes to break the task into.
    batch_size : int or 'auto', optional
        Number of sequences handled in one batch; 'auto' lets
        ``_autotune_reads_per_batch`` pick the value.

    Returns
    -------
    DNAFASTAFormat
        The file holding the extracted reads.

    Raises
    ------
    ValueError
        If truncation is requested and ``min_length`` exceeds
        ``trunc_len - trim_left``.
    RuntimeError
        If the output file is empty after all batches were processed.
    """
    truncating = trunc_len > 0
    if truncating and min_length > trunc_len - trim_left:
        raise ValueError('The minimum length setting is greater than the '
                         'length of the truncated sequences. This will cause '
                         'all sequences to be removed from the dataset. To '
                         'proceed, set a min_length ≤ trunc_len - trim_left.')

    n_jobs = effective_n_jobs(n_jobs)
    if batch_size == 'auto':
        fasta_view = sequences.file.view(DNAFASTAFormat)
        batch_size = _autotune_reads_per_batch(fasta_view, n_jobs)

    reads = sequences.file.view(DNAIterator)
    result = DNAFASTAFormat()
    result_path = str(result)
    make_job = delayed(_gen_reads)

    with open(result_path, 'a') as fh, Parallel(n_jobs) as parallel:
        for chunk in _chunks(reads, batch_size):
            jobs = (make_job(sequence, f_primer, r_primer, trunc_len,
                             trim_left, identity, min_length, max_length)
                    for sequence in chunk)
            for amplicon in parallel(jobs):
                # ``None`` marks a sequence that produced no amplicon
                if amplicon is None:
                    continue
                skbio.write(amplicon, format='fasta', into=fh)

    if os.stat(result_path).st_size == 0:
        raise RuntimeError("No matches found")
    return result
示例#21
0
文件: cmscan.py 项目: RNAer/dumpling
def scan_seq(seq, db, cpu=1, **kwargs):
    '''Run cmscan on one sequence against a database.

    The sequence is written as FASTA into a named temporary file, which is
    then used as the query of the ``cmscan`` call.

    Parameters
    ----------
    seq : object writable as FASTA
        The query sequence.
    db : str
        The database to scan against.
    cpu : int
        Accepted, but not referenced in the body of this function.
    **kwargs
        Further parameter updates, forwarded to ``cmscan.update``.

    Returns
    -------
    Whatever calling the ``Dumpling`` object returns.
    '''
    cmscan = Dumpling('cmscan', params=Parameters(*_params))
    with NamedTemporaryFile(mode='w+') as query_file:
        query_path = query_file.name
        write(seq, into=query_path, format='fasta')
        cmscan.update(query=query_path, db=db, **kwargs)
        # run while the temporary query file still exists
        outcome = cmscan()
    return outcome
示例#22
0
def annotate(in_fp, in_fmt, min_len, out_dir, out_fmt, gcode, kingdom, mode,
             task, cpus, force, dry_run, quality, config):
    '''Annotate the sequences in the input file.

    The input is filtered into ``<out_dir>/<prefix>.fna``, a snakemake
    config is assembled from the YAML config file and the command line
    options, the workflow is run, and on success the annotations are
    integrated and summarized.

    Parameters
    ----------
    in_fp : str
        Input seq file name
    in_fmt : str
        Input file format
    min_len : int
        The threshold of seq length to be filtered away
    out_dir : str
        Output file directory.
    out_fmt : str
        Output file format
    gcode : int
        The translation table to use for protein-coding genes
    kingdom : str
        The kingdom where the sequences are from
    mode : str
        Annotation mode. The values this function branches on are
        'finished', 'draft' and 'metagenome'.
    task : collection of str
        Top-level sections of the config file to run. If empty, every
        section (except 'general') is run.
    cpus : int
        Number of cpus to use.
    force : bool
        Force to overwrite.
    dry_run : bool
        Passed to snakemake as its ``dryrun`` option.
    quality : bool
        Write the quality file. Only done when this is exactly ``True``
        and ``mode`` is not 'metagenome'.
    config : str or None
        Config file for snakemake. If ``None``, the packaged
        ``<kingdom>.yaml`` is used.
    '''
    logger.debug('working dir: %s', out_dir)
    if force:
        logger.debug('run in force mode - will overwrite existing files.')
    if dry_run:
        logger.debug('run in dry mode - will not produce output.')

    ## prepare the file paths
    os.makedirs(out_dir, exist_ok=True)
    prefix, suffix = splitext(basename(in_fp))
    if suffix in {'.gz', '.bz2'}:
        # drop the extension hidden under the compression suffix as well
        prefix = splitext(prefix)[0]
    out_prefix = join(out_dir, prefix)
    seq_fp = abspath(out_prefix + '.fna')

    ## validate and filter the input seq file
    if exists(seq_fp):
        # do not overwrite because all the snakemake steps will be rerun when
        # this file is updated.
        logger.debug(
            'the filtered sequence file already exists. skip validating step.')
    else:
        with open(seq_fp, 'w') as out:
            for seq in check_seq(in_fp, in_fmt, lambda s: len(s) < min_len):
                write(seq, format='fasta', into=out)

    ## prepare snakemake workflow
    snakefile = resource_filename(__package__, 'Snakefile')
    if config is None:
        config = resource_filename(__package__, kingdom + '.yaml')
    logger.debug('set annotation in %s mode.', mode)
    logger.debug('set annotation as %s.', kingdom)
    logger.debug('use config file: %s.', config)
    with open(config) as fh:
        # safe_load: the config only needs plain mappings and scalars, and
        # a bare yaml.load() can build arbitrary objects from the file
        # (recent PyYAML releases also refuse it without a Loader).
        cfg = yaml.safe_load(fh)

    general = cfg.pop('general', {})
    rules = {}
    if not task:
        task = list(cfg)
    for k, v in cfg.items():
        # specify the annotation task
        if k not in task or v is None:
            continue
        for vk, vv in v.items():
            if vk in rules:
                raise ValueError(
                    'You have multiple config for rule %s' % vk)
            rules[vk] = vv

    ## update the parameters of relevant tools with options from cmd line
    if 'prodigal' in rules:
        param = '%s -g %d' % (rules['prodigal']['params'], gcode)
        if mode == 'finished':
            param = '-p single -c ' + param
        elif mode == 'draft':
            param = '-p single ' + param
        elif mode == 'metagenome':
            param = '-p meta ' + param
        rules['prodigal']['params'] = param
    if 'aragorn' in rules:
        rules['aragorn']['params'] = '%s -gc%d' % (rules['aragorn']['params'],
                                                   gcode)
    if 'rnammer' in rules:
        rules['rnammer']['params'] = '-S %s %s' % (kingdom[:3],
                                                   rules['rnammer']['params'])

    # only run the targets specified in the yaml file
    targets = list(rules.keys())
    if not targets:
        logger.warning('No annotation task to run')
        return
    rules['seq'] = seq_fp

    cfg_file = join(out_dir, 'snakemake.yaml')
    with open(cfg_file, 'w') as out:
        yaml.dump(rules, out, default_flow_style=False)

    logger.debug('run snakemake workflow')
    success = snakemake(
        snakefile,
        cores=cpus,
        targets=targets,
        # the work dir is specific to this input so simultaneous runs
        # don't interfere with each other.
        workdir=out_prefix,
        printshellcmds=True,
        dryrun=dry_run,
        forceall=force,
        # config=cfg,
        configfile=cfg_file,
        keep_target_files=True,
        # provide this dummy to suppress unnecessary log
        log_handler=lambda s: None,
        quiet=True,  # do not print job info
        keep_logger=False)

    if success:
        # if snakemake finishes successfully
        out_fp = '%s.%s' % (out_prefix, out_fmt)
        protein_xref = general.get('protein_xref')
        if protein_xref is not None:
            protein_xref = expanduser(protein_xref)
        # out_fmt goes by keyword: as fifth positional argument it would
        # be bound to a different parameter of integrate() and the
        # requested output format would be ignored.
        seqs = integrate(seq_fp, out_prefix, protein_xref, out_fp,
                         out_fmt=out_fmt)

        logger.info('Write summary of the annotation')
        with open(out_prefix + '.summary.txt', 'w') as out:
            summarize(seqs.values(), out)
        if mode != 'metagenome' and quality is True:
            with open(out_prefix + '.quality.txt', 'w') as out:
                # same spelling as the prodigal branch above; a finished
                # genome is not scored as a set of contigs
                contigs = mode != 'finished'
                seq_score = compute_seq_score(seqs.values(), contigs)
                trna_score = rrna_score = gene_score = np.nan
                if 'tRNA' in task:
                    trna_score = compute_trna_score(
                        (i.interval_metadata for i in seqs.values()))
                if 'rRNA' in task:
                    rrna_score = compute_rrna_score(
                        (i.interval_metadata for i in seqs.values()))
                if 'CDS' in task:
                    # NOTE(review): ``faa_fp`` is not assigned anywhere in
                    # this function; unless it is a module-level name this
                    # branch raises NameError. Confirm where the protein
                    # FASTA is expected to come from.
                    gene_score = compute_gene_score(faa_fp)
                out.write(
                    '# seq_score: %.2f  tRNA_score: %.2f  rRNA_score: %.2f  gene_score: %.2f\n'
                    % (seq_score, trna_score, rrna_score, gene_score))
    else:
        logger.error('The snakemake run failed.')

    logger.info('Done with annotation')
0
文件: util.py 项目: biocore/micronota
def convert(in_fmt, out_fmt, in_f, out_f):
    '''convert between file formats

    Every object read from ``in_f`` is written to ``out_f``, in order.

    Parameters
    ----------
    in_fmt : str
        input file format
    out_fmt : str
        output file format
    in_f : str
        input file path
    out_f : str
        output file path. An already opened, writable file object is
        accepted as well and is written into as is.
    '''
    records = read(in_f, format=in_fmt)
    if isinstance(out_f, str):
        # Open the path once. Handing the path itself to every write()
        # call reopens it for writing each time, so only the last object
        # of a multi-record input would remain in the file.
        with open(out_f, 'w') as out:
            for obj in records:
                write(obj, format=out_fmt, into=out)
    else:
        for obj in records:
            write(obj, format=out_fmt, into=out_f)
示例#24
0
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Side effects visible in this function: a scratch directory "tmp" is
    created in (and later removed from) the current working directory, the
    file "nr_foundation_alignment_gt.fasta" and a "ghost-tree_log_*.txt"
    log file are written there as well, and the external programs "muscle"
    and "fasttree" are run through the shell.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an
        accession number (same accession numbers in otu_file_fh and
        extension_taxonomy_fh) and the 2nd column is the taxonomy ranking in
        the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : filehandle
        Writable handle that receives the Newick formatted ghost-tree, the
        final output of the ghost-tree tool. This is a phylogenetic tree
        designed for downstream diversity analyses.

    Returns
    -------
    str
        The string form of the final tree with surrounding whitespace
        stripped.

    """
    # Reset here and read back in the loop over the tree tips below.
    # NOTE(review): presumably filled in by _make_nr_foundation_alignment;
    # nothing in this function writes to it - confirm.
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}
    # Label used in the log file name, cut out of the textual form of the
    # output handle.
    # NOTE(review): the [16:-4] slice depends on the exact str() form of
    # ghost_tree_fp - confirm what it is expected to yield.
    ghost_tree_output = str(ghost_tree_fp)
    ghost_tree_output = ghost_tree_output[16:-4]
    # Probe for the external tools: run each one bare through the shell and
    # look for "command not found" on stderr. A missing tool only produces
    # a printed message; execution continues regardless.
    process = subprocess.Popen("muscle", shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print "muscle, multiple sequence aligner, is not found. Is it" \
              " installed? Is it in your path?"
    process = subprocess.Popen("fasttree", shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print "fasttree, phylogenetic tree builder, is not found. Is it" \
              " installed? Is it in your path?"
    # Scratch directory relative to the current working directory; removed
    # again near the end. os.mkdir raises if it already exists.
    os.mkdir("tmp")
    # Closed explicitly at the end of the function; an exception raised
    # before that point leaves it (and "tmp") behind.
    logfile = open("ghost-tree_log_"+ghost_tree_output+".txt", "w")
    extension_genus_accession_list_dic = \
        _extension_genus_accession_dic(otu_file_fh,
                                       extension_taxonomy_fh)
    # Written into the current working directory and read back by
    # _make_foundation_tree right below.
    skbio.write(_make_nr_foundation_alignment(foundation_alignment_fh,
                extension_genus_accession_list_dic),
                into="nr_foundation_alignment_gt.fasta",
                format="fasta")
    foundation_tree = _make_foundation_tree("nr_foundation_alignment_gt.fasta",
                                            logfile)
    seqs = SequenceCollection.read(extension_seq_fh)
    for node in foundation_tree.tips():
        # str(node) must split on ":" into exactly two parts; the first one
        # is the lookup key into foundation_accession_genus_dic. Both this
        # split and the lookup happen outside the try block, so a failure
        # here propagates.
        key_node, _ = str(node).split(":")
        key_node = foundation_accession_genus_dic[key_node]
        try:
            _make_mini_otu_files(key_node, extension_genus_accession_list_dic,
                                 seqs)
            # align the per-tip sequences, then build a small tree from the
            # alignment; both steps exchange files through "tmp"
            process = subprocess.Popen("muscle -in tmp/mini_seq_gt.fasta" +
                                       " -out" +
                                       " tmp/mini_alignment_gt.fasta -quiet" +
                                       " -maxiters 2 -diags1", shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            output, error = process.communicate()
            process = subprocess.Popen("fasttree -nt -quiet" +
                                       " tmp/mini_alignment_gt.fasta >" +
                                       " tmp/mini_tree_gt.nwk", shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            output, error = process.communicate()
            # only the stderr of the fasttree run is logged
            logfile.write("FastTree warnings for genus "+key_node+" are:\n" +
                          error + "\n")
            mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                             into=TreeNode)
            # graft the children of the midpoint-rooted mini tree onto the
            # foundation tip
            node.extend(mini_tree.root_at_midpoint().children[:])
        # NOTE(review): bare except - any failure while processing a tip
        # (including KeyboardInterrupt) is silently skipped and the tip is
        # left unextended.
        except:
            continue
    shutil.rmtree("tmp")
    ghost_tree_fp.write(str(foundation_tree))
    logfile.close()
    return str(foundation_tree).strip()
示例#25
0
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an
        accession number (same accession numbers in otu_file_fh and
        extension_taxonomy_fh) and the 2nd column is the taxonomy ranking in
        the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : filehandle
        Writable handle that receives the Newick formatted ghost-tree, the
        final output of the ghost-tree tool. This is a phylogenetic tree
        designed for downstream diversity analyses. Despite the ``fp``
        suffix, the object is only used through its ``write`` method.

    Returns
    -------
    str
        The string form of the final tree with surrounding whitespace
        stripped (the same text written to ``ghost_tree_fp``, which is
        written unstripped).

    Notes
    -----
    The function works in the current directory: it writes
    ``nr_foundation_alignment_gt.fasta`` there and uses a scratch ``tmp``
    directory that is removed before returning, even on error. The
    ``muscle`` and ``fasttree`` commands are run through the shell.

    """
    # Local import: only the scratch-directory cleanup below needs it.
    import shutil

    # Module-level state. NOTE(review): the dictionary is only reset and
    # read here; presumably one of the helpers called below fills it in.
    global foundation_accession_genus_dic
    global seqs
    foundation_accession_genus_dic = {}

    # Tolerate a leftover scratch directory from an earlier run, as the
    # former ``mkdir`` shell call did.
    if not os.path.isdir("tmp"):
        os.mkdir("tmp")
    try:
        extension_genus_accession_list_dic = \
            _extension_genus_accession_dic(otu_file_fh,
                                           extension_taxonomy_fh)
        nr_foundation_alignment = _make_nr_foundation_alignment(
            foundation_alignment_fh, extension_genus_accession_list_dic)
        skbio.write(nr_foundation_alignment,
                    into="nr_foundation_alignment_gt.fasta",
                    format="fasta")
        foundation_tree = _make_foundation_tree(
            "nr_foundation_alignment_gt.fasta")
        seqs = SequenceCollection.read(extension_seq_fh)
        for node in foundation_tree.tips():
            # str(node) is split into the tip label and the text after the
            # single ":"; the label is then mapped through the global
            # dictionary to the key used for the extension lookup.
            key_node, _ = str(node).split(":")
            key_node = foundation_accession_genus_dic[key_node]
            try:
                _make_mini_otu_files(key_node,
                                     extension_genus_accession_list_dic,
                                     seqs)
                os.system("muscle -in tmp/mini_seq_gt.fasta -out"
                          " tmp/mini_alignment_gt.fasta -quiet"
                          " -maxiters 2 -diags1")
                os.system("fasttree -nt -quiet"
                          " tmp/mini_alignment_gt.fasta >"
                          " tmp/mini_tree_gt.nwk")
                mini_tree = read("tmp/mini_tree_gt.nwk",
                                 format='newick',
                                 into=TreeNode)
                node.extend(mini_tree.children[:])
            except Exception:
                # Deliberate best effort: a tip whose extension tree cannot
                # be built is left as it is. Catching Exception (rather
                # than everything) lets Ctrl-C and SystemExit through.
                continue
    finally:
        # Never raises, matching the former ``rm -r`` shell call.
        shutil.rmtree("tmp", ignore_errors=True)
    ghost_tree_fp.write(str(foundation_tree))
    return str(foundation_tree).strip()
示例#26
0
def extensions_onto_foundation(otu_file_fh, extension_taxonomy_fh,
                               extension_seq_fh, foundation_alignment_fh,
                               ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an
        accession number (same accession numbers in otu_file_fh and
        extension_taxonomy_fh) and the 2nd column is the taxonomy ranking in
        the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : folder
        Output folder, created by this function (``os.mkdir`` raises if it
        already exists). It contains files including:
        a) The Newick formatted ghost-tree, which is the final output of the
           ghost-tree tool. This is a phylogenetic tree designed for
           downstream diversity analyses.
        b) Accession IDs from the ghost-tree.nwk file that you can use for
           downstream analyses tools
        c) log error file (this is an optional file that you can have if you
           type '--stderr')

    Returns
    -------
    tuple
        ``(tree, all_std_error)``: the string form of the final tree with
        surrounding whitespace stripped, and the error log returned by
        ``_make_foundation_tree`` with one FastTree warnings entry appended
        per tip that was processed.

    Notes
    -----
    A scratch ``tmp`` directory is created in the current directory
    (``os.mkdir`` raises if it already exists) and removed once the tips
    have been processed, even on error. ``muscle`` and ``fasttree`` are run
    through the shell; a message is printed when the shell reports either
    as "command not found", but execution continues regardless.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}

    # Probe each external tool once. The fasttree probe used to be tested
    # against an already-cleared stderr string, so it could never warn.
    required_tools = (("muscle", "multiple sequence aligner"),
                      ("fasttree", "phylogenetic tree builder"))
    for tool, description in required_tools:
        process = subprocess.Popen(tool,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)
        _, tool_stderr = process.communicate()
        if "command not found" in tool_stderr:
            # Parenthesised single-argument print works on Python 2 and 3.
            print(tool + ", " + description + ", is not found. Is it"
                  " installed? Is it in your path?")

    os.mkdir("tmp")
    try:
        os.mkdir(ghost_tree_fp)
        extension_genus_accession_list_dic = \
            _extension_genus_accession_dic(otu_file_fh,
                                           extension_taxonomy_fh)
        nr_alignment_fp = os.path.join(ghost_tree_fp,
                                       "nr_foundation_alignment_gt.fasta")
        skbio.write(_make_nr_foundation_alignment(
            foundation_alignment_fh, extension_genus_accession_list_dic),
                    into=nr_alignment_fp,
                    format="fasta")
        # The helper is handed an empty initial error log, exactly as the
        # previous code effectively did after clearing std_error.
        foundation_tree, all_std_error = _make_foundation_tree(
            nr_alignment_fp, "", ghost_tree_fp)
        seqs = SequenceCollection.read(extension_seq_fh)
        for node in foundation_tree.tips():
            # str(node) is split into the tip label and the text after the
            # single ":"; the label is then mapped through the global
            # dictionary to the key used for the extension lookup.
            key_node, _ = str(node).split(":")
            key_node = foundation_accession_genus_dic[key_node]
            try:
                _make_mini_otu_files(key_node,
                                     extension_genus_accession_list_dic,
                                     seqs)
                process = subprocess.Popen(
                    "muscle -in tmp/mini_seq_gt.fasta -out"
                    " tmp/mini_alignment_gt.fasta -quiet"
                    " -maxiters 2 -diags1",
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    universal_newlines=True)
                process.communicate()
                process = subprocess.Popen(
                    "fasttree -nt -quiet"
                    " tmp/mini_alignment_gt.fasta >"
                    " tmp/mini_tree_gt.nwk",
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    universal_newlines=True)
                _, std_error = process.communicate()
                all_std_error += ("FastTree warnings for genus " +
                                  key_node + " are:\n" + std_error + "\n")
                mini_tree = read("tmp/mini_tree_gt.nwk",
                                 format='newick',
                                 into=TreeNode)
                node.extend(mini_tree.root_at_midpoint().children[:])
            except Exception:
                # Deliberate best effort: a tip whose extension tree cannot
                # be built is left as it is. Catching Exception (rather
                # than everything) lets Ctrl-C and SystemExit through.
                continue
    finally:
        # Remove the scratch directory even when a step above fails.
        shutil.rmtree("tmp")
    ghost_tree_nwk_fp = os.path.join(ghost_tree_fp, "ghost_tree.nwk")
    with open(ghost_tree_nwk_fp, "w") as ghost_tree_nwk:
        ghost_tree_nwk.write(str(foundation_tree))
    _make_accession_id_file(ghost_tree_fp)
    return str(foundation_tree).strip(), all_std_error
示例#27
0
    os.mkdir('out/')

for label in os.listdir('../iqtree_GTR/out/'):
    trees = []
    for file in filter(lambda x: x.endswith('.treefile'),
                       os.listdir(f'../iqtree_GTR/out/{label}/')):
        tree = skbio.read(f'../iqtree_GTR/out/{label}/{file}', 'newick',
                          skbio.TreeNode)
        outgroup = tree.find('sleb').ancestors()[0]
        tree = tree.root_at(outgroup)
        trees.append(tree)
    ctree = majority_consensus(trees)
    for node in ctree.traverse():
        node.children = sorted(node.children,
                               key=lambda x: len(list(x.tips())))
    skbio.write(ctree, 'newick', f'out/{label}.txt')

    # Save image as PNG
    fig, ax = plot_tree(ctree, tip_fontsize=8.5)
    ax.yaxis.set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.set_xlabel('')
    plt.savefig(f'out/{label}.png')
    plt.close()

    # Save image as with supports
    for node in ctree.traverse():
        if node.support == 1:
            node.support = None
    fig, ax = plot_tree(ctree,
                        tip_fontsize=8.5,