def __init__(self, init: List[str] = None) -> None:
    """Set up the per-sample data holders.

    Individual options: 'counts', 'ranks', 'scores', 'accs',
    'shared_counts', 'shared_scores'. Group options: 'all'
    initializes every non-shared field, while 'shared' initializes
    both counts and scores with SharedCounter.
    """
    self.counts: Optional[UnionCounter] = None
    self.ranks: Optional[Ranks] = None
    self.scores: Optional[UnionScores] = None
    self.accs: Optional[Counter[Id]] = None
    if init is None:
        return  # nothing requested: leave every field unset
    chosen = set(init)
    if chosen & {'counts', 'all'}:
        self.counts = col.Counter()
    if chosen & {'ranks', 'all'}:
        self.ranks = Ranks({})
    if chosen & {'scores', 'all'}:
        self.scores = Scores({})
    if chosen & {'accs', 'all'}:
        self.accs = col.Counter()
    # Shared options take precedence over the plain ones if both appear
    if chosen & {'shared_counts', 'shared'}:
        self.counts = SharedCounter()
    if chosen & {'shared_scores', 'shared'}:
        self.scores = SharedCounter()
def __init__(self,
             nodes_file: Filename,
             names_file: Filename,
             plasmid_file: Filename = None,
             collapse: bool = True,
             excluding: Union[Tuple, Set[Id]] = (),
             including: Union[Tuple, Set[Id]] = (),
             debug: bool = False,
             ) -> None:
    """Load the taxonomy from NCBI dump files and build the tree."""

    def _list_taxa(header: str, taxa) -> None:
        # Pretty-print a collection of taxids with their scientific names
        print(header)
        print('\t\tId\tScientific Name')
        for taxid in taxa:
            print(f'\t\t{taxid}\t{self.names[taxid]}')

    # Type data declaration and initialization
    self.ROOT = ROOT
    self.parents: Parents = Parents({})
    self.ranks: Ranks = Ranks({})
    self.names: Names = Names({})
    self.children: Children = Children({})
    self.collapse: bool = collapse
    self.debug: bool = debug
    # Initialization methods
    self.read_nodes(nodes_file)
    self.read_names(names_file)
    if plasmid_file:
        self.read_plasmids(plasmid_file)
    self.build_children()
    # Show explicitly included and excluded taxa
    if including:
        _list_taxa('List of taxa (and below) to be explicitly included:',
                   including)
    else:
        # For excluding to operate not on single taxa but on subtrees
        including = {ROOT}
    self.including: Union[Tuple, Set[Id]] = including
    if excluding:
        _list_taxa('List of taxa (and below) to be excluded:', excluding)
    self.excluding: Union[Tuple, Set[Id]] = excluding
class SampleDataByTaxId(object):
    """Typical data in a sample ordered by taxonomical id"""

    def __init__(self, init: List[str] = None) -> None:
        """Initialize data structures

        Args:
            init: list of fields to initialize. Individual options:
                'counts', 'ranks', 'scores', 'accs', 'shared_counts',
                'shared_scores'. Group options: 'all' initializes all
                the non-shared ones, and 'shared' initializes both
                counts and scores with SharedCounter. None (the
                default) leaves every field as None.
        """
        self.counts: UnionCounter = None
        self.ranks: Ranks = None
        self.scores: UnionScores = None
        self.accs: Counter[TaxId] = None
        if init is None:  # FIX: 'x' in None raised TypeError before
            return
        if 'counts' in init or 'all' in init:
            self.counts = Counter()
        if 'ranks' in init or 'all' in init:
            self.ranks = Ranks({})
        if 'scores' in init or 'all' in init:
            self.scores = Scores({})
        if 'accs' in init or 'all' in init:
            self.accs = Counter()
        # Shared options override plain counts/scores when both are given
        if 'shared_counts' in init or 'shared' in init:
            self.counts = SharedCounter()
        if 'shared_scores' in init or 'shared' in init:
            self.scores = SharedCounter()

    def set(self, counts: UnionCounter = None, ranks: Ranks = None,
            scores: UnionScores = None,
            accs: Counter[TaxId] = None) -> None:
        """Set the data fields (only the ones explicitly passed)"""
        if counts is not None:
            self.counts = counts
        if ranks is not None:
            self.ranks = ranks
        if scores is not None:
            self.scores = scores
        if accs is not None:
            self.accs = accs

    def get_counts(self) -> Counter[TaxId]:
        """Get (non shared) counts"""
        if isinstance(self.counts, Counter):
            return self.counts
        raise TypeError

    def get_shared_counts(self) -> SharedCounter:
        """Get shared counts"""
        if isinstance(self.counts, SharedCounter):
            return self.counts
        raise TypeError

    def get_scores(self) -> Scores:
        """Get (non shared) scores"""
        if isinstance(self.scores, dict):
            return self.scores  # type: ignore
        raise TypeError

    def get_shared_scores(self) -> SharedCounter:
        """Get shared scores"""
        if isinstance(self.scores, SharedCounter):
            return self.scores
        raise TypeError

    def get_accs(self) -> Counter[TaxId]:
        """Get accumulated counter"""
        if isinstance(self.accs, Counter):
            return self.accs
        raise TypeError

    def clear(self, fields: List[str] = None) -> None:
        """Clear the selected data fields.

        Args:
            fields: list with any of 'counts', 'ranks', 'scores',
                'accs', or 'all'. None clears nothing.
        """
        if fields is None:  # FIX: 'x' in None raised TypeError before
            return
        # FIX: parenthesized guards — the old `a or b and c` parsed as
        # `a or (b and c)`, so e.g. clear(['counts']) crashed with
        # AttributeError whenever self.counts was still None.
        if ('counts' in fields or 'all' in fields) \
                and self.counts is not None:
            self.counts.clear()
        if ('ranks' in fields or 'all' in fields) \
                and self.ranks is not None:
            self.ranks.clear()
        if ('scores' in fields or 'all' in fields) \
                and self.scores is not None:
            self.scores.clear()
        if ('accs' in fields or 'all' in fields) \
                and self.accs is not None:
            self.accs.clear()

    def purge_counters(self) -> None:
        """Purge elements with zero counts in counters"""
        # Unary + on a Counter drops zero and negative entries
        if isinstance(self.counts, Counter):
            self.counts = +self.counts  # pylint: disable=E1130
        if isinstance(self.accs, Counter):
            self.accs = +self.accs  # pylint: disable=E1130
    def get_taxlevels(self) -> TaxLevels:
        """Get TaxLevels (taxids of ranks) from Ranks (rank of taxids)"""
        if self.ranks:
            return Rank.ranks_to_taxlevels(self.ranks)
        return NotImplemented
def main():
    """Main entry point to script.

    Parses CLI arguments, loads the NCBI taxonomy, selects the taxids
    matching the include/exclude filters, scans the Centrifuge output
    for classified reads in those taxa, and rewrites the matching
    reads from the FASTQ file(s) into new '_rxtr' FASTQ file(s).
    """
    # Argument Parser Configuration
    parser = argparse.ArgumentParser(
        description='Extract reads following Centrifuge/Kraken output',
        epilog=f'%(prog)s - {__author__} - {__date__}')
    parser.add_argument('-V', '--version', action='version',
                        version=f'%(prog)s release {__version__} ({__date__})')
    parser.add_argument('-f', '--file', action='store', metavar='FILE',
                        required=True, help='Centrifuge output file.')
    parser.add_argument('-l', '--limit', action='store', metavar='NUMBER',
                        type=int, default=None,
                        help=('Limit of FASTQ reads to extract. '
                              'Default: no limit'))
    parser.add_argument(
        '-m', '--maxreads', action='store', metavar='NUMBER',
        type=int, default=None,
        help=('Maximum number of FASTQ reads to search for the taxa. '
              'Default: no maximum'))
    parser.add_argument(
        '-n', '--nodespath', action='store', metavar='PATH',
        default=TAXDUMP_PATH,
        # FIX: help text was missing its closing parenthesis
        help=('path for the nodes information files (nodes.dmp and names.dmp'
              ' from NCBI)'))
    parser.add_argument(
        '-i', '--include', action='append', metavar='TAXID', type=TaxId,
        default=[],
        help=('NCBI taxid code to include a taxon and all underneath ' +
              '(multiple -i is available to include several taxid). ' +
              'By default all the taxa is considered for inclusion.'))
    parser.add_argument(
        '-x', '--exclude', action='append', metavar='TAXID', type=TaxId,
        default=[],
        help=('NCBI taxid code to exclude a taxon and all underneath ' +
              '(multiple -x is available to exclude several taxid)'))
    parser.add_argument(
        '-y', '--minscore', action='store', metavar='NUMBER',
        type=lambda txt: Score(float(txt)), default=None,
        help=('minimum score/confidence of the classification of a read '
              'to pass the quality filter; all pass by default'))
    filein = parser.add_mutually_exclusive_group(required=True)
    filein.add_argument('-q', '--fastq', action='store', metavar='FILE',
                        default=None,
                        help='Single FASTQ file (no paired-ends)')
    filein.add_argument('-1', '--mate1', action='store', metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 1s '
                             '(filename usually includes _1)')
    parser.add_argument('-2', '--mate2', action='store', metavar='FILE',
                        default=None,
                        help='Paired-ends FASTQ file for mate 2s '
                             '(filename usually includes _2)')

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    args = parser.parse_args()
    output_file = args.file
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    excluding: Set[TaxId] = set(args.exclude)
    including: Set[TaxId] = set(args.include)
    fastq_1: Filename
    fastq_2: Filename = args.mate2
    if not fastq_2:
        fastq_1 = args.fastq
    else:
        fastq_1 = args.mate1

    # Load NCBI nodes, names and build children
    plasmidfile: Filename = None
    ncbi: Taxonomy = Taxonomy(nodesfile, namesfile, plasmidfile, False,
                              excluding, including)

    # Build taxonomy tree
    print(gray('Building taxonomy tree...'), end='')
    sys.stdout.flush()
    tree = TaxTree()
    tree.grow(taxonomy=ncbi, look_ancestors=False)
    print(green(' OK!'))

    # Get the taxa
    print(gray('Filtering taxa...'), end='')
    sys.stdout.flush()
    ranks: Ranks = Ranks({})
    tree.get_taxa(ranks=ranks, include=including, exclude=excluding)
    print(green(' OK!'))
    taxids: Set[TaxId] = set(ranks)
    taxlevels: TaxLevels = Rank.ranks_to_taxlevels(ranks)
    num_taxlevels = Counter({rank: len(taxlevels[rank])
                             for rank in taxlevels})
    num_taxlevels = +num_taxlevels  # drop ranks with zero taxids

    # Statistics about including taxa
    print(f' {len(taxids)}\033[90m taxid selected in \033[0m', end='')
    print(f'{len(num_taxlevels)}\033[90m different taxonomical levels:\033[0m')
    for rank in num_taxlevels:
        print(f' Number of different {rank}: {num_taxlevels[rank]}')
    assert taxids, red('ERROR! No taxids to search for!')

    # Get the records
    records: List[SeqRecord] = []
    num_seqs: int = 0
    # timing initialization
    start_time_load: float = time.perf_counter()
    print(gray(f'Loading output file {output_file}...'), end='')
    sys.stdout.flush()
    try:
        # FIX: mode 'rU' was removed in Python 3.11; 'r' is equivalent
        with open(output_file, 'r') as file:
            file.readline()  # discard header
            # FIX: start=1 so num_seqs is the true count of parsed reads
            # (it previously was count-1, skewing the sample percentage)
            for num_seqs, record in enumerate(
                    SeqIO.parse(file, 'centrifuge'), start=1):
                tid: TaxId = record.annotations['taxID']
                if tid not in taxids:
                    continue
                # Ignore read if low confidence
                score: Score = Score(record.annotations['score'])
                if args.minscore is not None and score < args.minscore:
                    continue
                records.append(record)
    except FileNotFoundError as err:
        # FIX: chain the original error instead of discarding it
        raise Exception(red('ERROR!') + 'Cannot read "' +
                        output_file + '"') from err
    print(green(' OK!'))

    # Basic records statistics
    print(
        gray(' Load elapsed time: ') +
        f'{time.perf_counter() - start_time_load:.3g}' + gray(' sec'))
    # FIX: guard against an empty output file (num_seqs == 0)
    print(f' \033[90mMatching reads: \033[0m{len(records):_d} \033[90m\t'
          f'(\033[0m{len(records)/num_seqs if num_seqs else 0:.4%}'
          f'\033[90m of sample)')
    sys.stdout.flush()

    # FASTQ sequence dealing
    # records_ids: List[SeqRecord] = [record.id for record in records]
    records_ids: Set[SeqRecord] = {record.id for record in records}
    seqs1: List[SeqRecord] = []
    seqs2: List[SeqRecord] = []
    extracted: int = 0
    i: int = 0
    if fastq_2:
        print(
            f'\033[90mLoading FASTQ files {fastq_1} and {fastq_2}...\n'
            f'Mseqs: \033[0m', end='')
        sys.stdout.flush()
        try:
            # FIX: 'rU' open mode removed in Python 3.11
            with open(fastq_1, 'r') as file1, open(fastq_2, 'r') as file2:
                for i, (rec1, rec2) in enumerate(
                        zip(SeqIO.parse(file1, 'quickfastq'),
                            SeqIO.parse(file2, 'quickfastq'))):
                    # Stop when all matches are found or limits are hit
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit and
                                                 extracted >= args.limit):
                        break
                    elif not i % 1000000:  # coarse progress marker
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:  # fine progress marker
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        seqs2.append(rec2)
                        extracted += 1
        except FileNotFoundError as err:
            raise Exception(
                '\n\033[91mERROR!\033[0m Cannot read FASTQ files') from err
    else:
        print(f'\033[90mLoading FASTQ files {fastq_1}...\n'
              f'Mseqs: \033[0m', end='')
        sys.stdout.flush()
        try:
            # FIX: 'rU' open mode removed in Python 3.11
            with open(fastq_1, 'r') as file1:
                for i, rec1 in enumerate(SeqIO.parse(file1, 'quickfastq')):
                    # Stop when all matches are found or limits are hit
                    if not records_ids or (args.maxreads and i >= args.maxreads
                                           ) or (args.limit and
                                                 extracted >= args.limit):
                        break
                    elif not i % 1000000:  # coarse progress marker
                        print(f'{i//1000000:_d}', end='')
                        sys.stdout.flush()
                    elif not i % 100000:  # fine progress marker
                        print('.', end='')
                        sys.stdout.flush()
                    try:
                        records_ids.remove(rec1.id)
                    except KeyError:
                        pass
                    else:
                        seqs1.append(rec1)
                        extracted += 1
        except FileNotFoundError as err:
            raise Exception(
                '\n\033[91mERROR!\033[0m Cannot read FASTQ file') from err
    print(cyan(f' {i/1e+6:.3g} Mseqs'), green('OK! '))

    def format_filename(fastq: Filename) -> Filename:
        """Auxiliary function to properly format the output filenames.

        Args:
            fastq: Complete filename of the fastq input file

        Returns:
            Filename of the rextracted fastq output file
        """
        fastq_filename, _ = os.path.splitext(fastq)
        output_list: List[str] = [fastq_filename, '_rxtr']
        if including:
            output_list.append('_incl')
            # FIX: append the joined string (extend() added it char by
            # char, which only produced the right result by accident)
            output_list.append('_'.join(including))
        if excluding:
            output_list.append('_excl')
            output_list.append('_'.join(excluding))
        output_list.append('.fastq')
        return Filename(''.join(output_list))

    filename1: Filename = format_filename(fastq_1)
    SeqIO.write(seqs1, filename1, 'quickfastq')
    print(gray('Wrote'), magenta(f'{len(seqs1)}'),
          gray('reads in'), filename1)
    if fastq_2:
        filename2: Filename = format_filename(fastq_2)
        SeqIO.write(seqs2, filename2, 'quickfastq')
        # FIX: report len(seqs2) (was len(seqs1)) for the mate-2 file
        print(gray('Wrote'), magenta(f'{len(seqs2)}'),
              gray('reads in'), filename2)

    # Timing results
    print(gray('Total elapsed time:'),
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
def process_report(
        *args, **kwargs
) -> Tuple[Sample, TaxTree, SampleDataByTaxId, SampleStats, Err]:
    """
    Process Centrifuge/Kraken report files (to be usually called in parallel!).

    All progress text is buffered in a StringIO and printed in one go
    at the end, so interleaved parallel calls stay readable.

    Args:
        *args: args[0] is the report Filename to process.
        **kwargs: expects 'taxonomy' (Taxonomy), 'mintaxa' (int),
            'debug' (bool), and 'root' (truthy to zero out ROOT counts).

    Returns:
        Tuple of (Sample, TaxTree, SampleDataByTaxId, SampleStats, Err).
        NOTE(review): the SampleStats returned is freshly constructed
        here, not computed from the data — presumably pending the TODO
        below; confirm before relying on it.
    """
    # TODO: Full review to report support
    # Recover input and parameters
    filerep: Filename = args[0]
    taxonomy: Taxonomy = kwargs['taxonomy']
    mintaxa: int = kwargs['mintaxa']
    collapse: bool = taxonomy.collapse
    including: Set[TaxId] = taxonomy.including
    excluding: Set[TaxId] = taxonomy.excluding
    debug: bool = kwargs['debug']
    # Buffer for all the progress output of this call
    output: io.StringIO = io.StringIO(newline='')

    def vwrite(*args):
        """Print only if verbose/debug mode is enabled"""
        # NOTE: *args here shadows the outer args on purpose (varargs)
        if kwargs['debug']:
            output.write(' '.join(str(item) for item in args))

    sample: Sample = Sample(filerep)
    # Read Centrifuge/Kraken report file to get abundances
    log: str
    abundances: Counter[TaxId]
    log, abundances, _ = read_report(filerep)
    output.write(log)
    # Remove root counts, in case
    if kwargs['root']:
        vwrite(gray('Removing'), abundances[ROOT], gray('"ROOT" reads... '))
        abundances[ROOT] = 0
        vwrite(green('OK!'), '\n')
    # Build taxonomy tree
    output.write(' \033[90mBuilding taxonomy tree...\033[0m')
    tree = TaxTree()
    tree.grow(taxonomy=taxonomy,
              counts=abundances)  # Grow tax tree from root node
    output.write('\033[92m OK! \033[0m\n')
    # Prune the tree
    output.write(' \033[90mPruning taxonomy tree...\033[0m')
    tree.prune(mintaxa, None, collapse, debug)
    tree.shape()
    output.write('\033[92m OK! \033[0m\n')
    # Get the taxa with their abundances and taxonomical levels
    output.write(' \033[90mFiltering taxa...\033[0m')
    new_abund: Counter[TaxId] = col.Counter()
    new_accs: Counter[TaxId] = col.Counter()
    ranks: Ranks = Ranks({})
    tree.get_taxa(abundance=new_abund, accs=new_accs, ranks=ranks,
                  mindepth=0, maxdepth=0,
                  include=including, exclude=excluding)
    new_abund = +new_abund  # remove zero and negative counts
    if including or excluding:  # Recalculate accumulated counts
        new_tree = TaxTree()
        new_tree.grow(taxonomy, new_abund)  # Grow tree with new abund
        new_tree.shape()
        new_abund = col.Counter()  # Reset abundances
        new_accs = col.Counter()  # Reset accumulated
        new_tree.get_taxa(new_abund, new_accs)  # Get new accumulated counts
    # Package the filtered counts/ranks/accumulated counts for return
    out: SampleDataByTaxId = SampleDataByTaxId()
    out.set(counts=new_abund, ranks=ranks, accs=new_accs)
    output.write('\033[92m OK! \033[0m\n')
    # Flush the whole buffered progress text at once
    print(output.getvalue())
    sys.stdout.flush()
    return sample, tree, out, SampleStats(), Err.NO_ERROR