Пример #1
0
def main(args, parser):
    """
    Build the protein database.

    Every (filename, adder function, type) triple produced by
    C{filenamesAndAdders} is passed through its adder function while a
    C{SqliteIndexWriter} is open on C{args.databaseFile}. Warnings about
    files in which no genomes were examined, and (if C{args.progress} is
    true) per-file and overall progress summaries, are written to standard
    error.

    @param args: The namespace of command-line arguments returned by
        argparse.parse_args()
    @param parser: An C{argparse.ArgumentParser} instance.
    @raise ValueError: If both C{args.minGenomeLength} and
        C{args.maxGenomeLength} are given and the minimum is larger than
        the maximum.
    """

    if (args.minGenomeLength is not None and args.maxGenomeLength is not None
            and args.minGenomeLength > args.maxGenomeLength):
        raise ValueError(
            '--minGenomeLength cannot be larger than --maxGenomeLength')

    # The option value is an iterable of iterables (flattened here into a
    # single set); None means no host-based exclusion was requested.
    if args.excludeExclusiveHost:
        excludeExclusiveHosts = set(
            chain.from_iterable(args.excludeExclusiveHost))
    else:
        excludeExclusiveHosts = None

    taxonomyDatabase = parseTaxonomyDatabaseCommandLineOptions(args, parser)
    progress = args.progress

    # fileCount is otherwise only bound by the for loop below. Give it a
    # value up front so the final summary still works (reporting 0 files)
    # when there are no input files at all.
    fileCount = 0

    if progress:
        overallStart = time()
        totalGenomeCount = totalProteinCount = 0

    with SqliteIndexWriter(args.databaseFile) as db:
        for fileCount, (filename, addFunc,
                        type_) in enumerate(filenamesAndAdders(args, db),
                                            start=1):

            if args.logFile:
                print("\n>>> Indexing '%s'." % filename,
                      end='\n\n',
                      file=args.logFile)

            if progress:
                start = time()

            examinedGenomeCount, genomeCount, proteinCount = addFunc(
                filename,
                dnaOnly=args.dnaOnly,
                rnaOnly=args.rnaOnly,
                minGenomeLength=args.minGenomeLength,
                maxGenomeLength=args.maxGenomeLength,
                excludeExclusiveHosts=excludeExclusiveHosts,
                excludeFungusOnlyViruses=args.excludeFungusOnlyViruses,
                excludePlantOnlyViruses=args.excludePlantOnlyViruses,
                databaseName=args.databaseName,
                taxonomyDatabase=taxonomyDatabase,
                proteinSource=args.proteinSource,
                genomeSource=args.genomeSource,
                duplicationPolicy=args.duplicationPolicy,
                logfp=args.logFile)

            # A file in which nothing at all was examined is suspicious, so
            # warn about it (with a hint specific to the file type).
            if examinedGenomeCount == 0:
                if type_ == 'gb':
                    print('WARNING: No genomes found in %r. Did the GenBank '
                          'download fail on that file?' % filename,
                          file=sys.stderr)
                else:
                    assert type_ == 'json'
                    print('WARNING: no genomes found in JSON file %r.' %
                          filename,
                          file=sys.stderr)

            if progress:
                elapsed = time() - start
                totalGenomeCount += genomeCount
                totalProteinCount += proteinCount
                # The singular genome suffix is a space (not an empty
                # string), which keeps the following text in the same
                # column as in the plural case.
                print('Processed %r: added %3d of %3d genome%s (%5d '
                      'protein%s) in %.2f seconds.' %
                      (filename, genomeCount, examinedGenomeCount,
                       ' ' if examinedGenomeCount == 1 else 's', proteinCount,
                       '' if proteinCount == 1 else 's', elapsed),
                      file=sys.stderr)

    if progress:
        elapsed = time() - overallStart
        print('%d files (containing %d genomes and %d proteins) '
              'indexed in %.2f seconds (%.2f mins).' %
              (fileCount, totalGenomeCount, totalProteinCount, elapsed,
               elapsed / 60),
              file=sys.stderr)
Пример #2
0
        '--database',
        required=True,
        help=('The file holding the sqlite3 taxonomy database. See '
              'https://github.com/acorg/ncbi-taxonomy-database for how to '
              'build one.'))

    parser.add_argument('--printId',
                        default=False,
                        action='store_true',
                        help='If specified, also print the id.')

    addTaxonomyDatabaseCommandLineOptions(parser)

    args = parser.parse_args()

    db = parseTaxonomyDatabaseCommandLineOptions(args, parser)

    if args.ids:
        ids = args.ids
    else:
        ids = (line[:-1] for line in sys.stdin)

    for id_ in ids:
        if args.printId:
            print(id_ + ':')
        hosts = hosts(id_, db)
        if hosts:
            print(', '.join(sorted(hosts)))
        else:
            print(
                'No host information for %r found in the taxonomy database.' %