示例#1
0
def main():
    """Command-line entry point: index blast hits and fetch their sequences.

    The command line is parsed with optparse, then the following steps run:

    1. ``<outputfile>.index.0`` and ``<outputfile>.fasta.0`` are created if
       missing (their modification time is refreshed otherwise).
    2. With ``-P``, the blast XML file given by ``-i`` is parsed with
       ``PsiBlastXMLParser`` and ``extractData`` is called with the index
       file as ``outfile``.  With ``-f`` the include/exclude pattern filters
       are passed as well.
    3. With ``-U``, ``uniq`` is run on the index file.
    4. The second ``|``-separated field of every index line is collected and
       handed to the fetcher (``FastaCmdWrapper`` when the blast version is
       ``legacy``, ``BlastDbCmdWrapper`` otherwise), which is then run.
    5. With ``-M``, ``AddFullHeadersWrapper2`` writes ``<fasta file>.fh``,
       the sequences are loaded from it, filtered with ``getTopSeqs`` and
       saved to ``<outputfile>.<exponent>.<count>.fasta``.

    Progress messages go to stderr according to the ``-v`` level.

    Exits through ``parser.error`` (status 2) when ``-o`` is missing, or when
    ``-P`` is given without ``-i``.
    """

    parser = optparse.OptionParser()

    parser.add_option( '-i', '--inputfile',
                       dest='inputfilename',
                       help='blast output file, in xml format.',
                       metavar='FILE.xml' )

    parser.add_option( '-o', '--outputfile',
                       dest='outputfilename',
                       help='base output filename',
                       metavar='FILE' )

    parser.add_option( '-d', '--db',
                       dest='database',
                       help='database from which the sequences should be fetched.',
                       metavar='FILE' )

    parser.add_option( '-e', '--evalue',
                       dest='evalue',
                       type='float',
                       help='e-value threshold.',
                       metavar='FLOAT' )

    parser.add_option( '-E', '--start_expo_evalue',
                       dest='startexpoeval',
                       type='int',
                       help='exponent of the evalue threshold used when refiltering.',
                       metavar='INT' )

    parser.add_option( '-b', '--blast_version',
                       dest='blastversion',
                       help='set the blast version to use, either `legacy` or `plus`.',
                       metavar='VERSION' )

    parser.add_option( '-f', '--filter',
                       action='store_true', dest='dofilter', default=False,
                       help='do the filter step.')

    parser.add_option( '-p', '--keep_patterns_iff',
                       dest='keeppatiff',
                       help='Keep only if patterns match exactly. The patterns should be coma seperated.',
                       metavar='keyword1:pat1,pat2,pat3,,keyword2:pat1,pat2' )

    parser.add_option( '-q', '--keep_patterns',
                       dest='keeppat',
                       help='Keep patterns that match exactly, no matter what. The patterns should be coma seperated.',
                       metavar='keyword1:pat1,pat2,pat3,,keyword2:pat1,pat2' )

    parser.add_option( '-g', '--gis',
                       dest='gis',
                       help='pickle file containing the gis that should match',
                       metavar='FILE')

    parser.add_option( '-F', '--format',
                       dest='formatop',
                       help='format of the output. default is `header,evalue`',
                       metavar='INTEGER' )

    # The help fragments below end with a space so that the concatenated
    # sentence does not glue two words together.
    parser.add_option( '-M', '--max_num_start_seq',
                       dest='maxnumstartseq',
                       type='int',
                       help='maximum number of sequences in the first alignement to be ' +\
                       'processed. If set, a new input file with the top sequences ordered ' +\
                       'by evalue is created and used.',
                       metavar='INTEGER' )

    parser.add_option( '-k', '--keep_U',
                       action='store_true', dest='keepu', default=False,
                       help='Should U containing sequences be kept regardless of their evalues ?. '+\
                       'Use in conjunction of -M')

    parser.add_option( '-T', '--temp',
                       dest='temp',
                       help='set the temp folder to use.',
                       metavar='FOLDER' )

    parser.add_option( '-P', '--parse',
                       dest='parse', action='store_true', default=False,
                       help='do not do extra fancy steps. Just parse the file and return the disired output in a file.' )

    parser.add_option( '-U', '--uniq',
                       dest='uniq', action='store_true', default=False,
                       help='remove duplicates.' )

    parser.add_option( '-v', '--verbose',
                       dest='verbosity',
                       type='int',
                       help='verbosity level : 0=none ; 1=standard ; 2=detailed ; 3=full',
                       metavar='INTEGER' )

    parser.set_defaults( verbosity = 1,
                         database = 'nr',
                         evalue = 10,
                         startexpoeval = -10,
                         keeppat = None,
                         blastversion = 'legacy',
                         temp = '/tmp/',
                         maxnumstartseq = None,
                         formatop = 'header,evalue')

    (options, args) = parser.parse_args()

    # Validate up front: every later step derives its file names from -o,
    # and the parse step cannot run without an input file.
    if not options.outputfilename:
        parser.error('an output base filename is required (-o FILE).')
    if options.parse and not options.inputfilename:
        parser.error('-P/--parse requires an input file (-i FILE.xml).')

    verbosity = options.verbosity
    database = options.database
    evalue = options.evalue
    maxnumstartseq = options.maxnumstartseq

    def report(level, message):
        """Write `message` to stderr when the verbosity is at least `level`."""
        if verbosity >= level:
            sys.stderr.write(message)

    blastindexfile = ''.join(( options.outputfilename, '.index.0' ))
    blastfastafile = ''.join(( options.outputfilename, '.fasta.0' ))

    # Same effect as `touch`, without handing a user-supplied path to a shell.
    for path in (blastindexfile, blastfastafile):
        with open(path, 'a'):
            os.utime(path, None)

    if options.blastversion == 'legacy':
        fetcher = FastaCmdWrapper( entry=[],
                                   db=database,
                                   outfile=blastfastafile )
    else:
        fetcher = BlastDbCmdWrapper( entry=[],
                                     db=database,
                                     outfile=blastfastafile )

    ## Parse the blast output file.
    if options.parse:
        report(1, '\n>>> Parsing blast output : ' + options.inputfilename + '\n')
        with open(options.inputfilename, 'r') as infile:
            blastparser = PsiBlastXMLParser(infile)
            blastparser.parse()
            report(2, '    >>> Extracting required data.\n')
            extractargs = dict( evalue=evalue,
                                fmt=options.formatop,
                                outfile=blastindexfile )
            if options.dofilter:
                extractargs.update(
                    includepatternsiff=fmtOptPat(options.keeppatiff),
                    includepatterns=fmtOptPat(options.keeppat),
                    excludepatterns={'title': ['hypothetical', 'predicted', 'PREDICTED']} )
            blastparser.extractData(**extractargs)

    ## Only keep one copy of a header, the one with the best evalue.
    if options.uniq:
        report(1, '\n>>> Keeping only best evalues.\n')
        uniq(blastindexfile)

    ## Gather all GIs in list
    report(2, '\n>>> Gathering all Gis.\n')
    entries = []
    with open(blastindexfile, 'r') as bif:
        for line in bif:
            fields = line.split('|')
            if len(fields) < 2:
                # Blank or malformed line: there is no second field to read.
                if line.strip():
                    report(1, '    >>> Skipping malformed index line : ' +
                              line.rstrip() + '\n')
                continue
            entries.append(fields[1])
    fetcher.entry = entries

    ## Fetch the sequences from the local databases.
    ## TODO : Fetch failed from the web.
    report(1, '\n>>> Building fasta.0 file by fetching sequences from local database.\n')
    fetcher.run()

    ## Apply final filters : keep only top evalues and U containing until a threshold is reached
    if maxnumstartseq:
        report(1, '\n>>> Applying final filters on ' + blastfastafile + '.\n')
        report(2, '    >>> Adding evalue to headers.\n')
        ### TODO : use .fasta.fh in tmp dir.
        tmpfullheadfasta = blastfastafile + '.fh'
        addheaders = AddFullHeadersWrapper2(blastfastafile,
                                            tmpfullheadfasta,
                                            blastindexfile)
        addheaders.run()
        report(3, '        >>> Loading sequences.\n')
        with open(tmpfullheadfasta, 'r') as ff:
            allseqs = Fasta.loadSequences(ff)
        report(2, '    >>> Keeping valid sequences.\n')
        tmppat = 'U' if options.keepu else None
        validseqs = getTopSeqs(seqs=allseqs,
                               maxnumseqs=maxnumstartseq,
                               startevalue=options.startexpoeval,
                               pattern=tmppat,
                               verbose=verbosity>=4 )
        # validseqs[0] holds the kept sequences, validseqs[1] the evalue
        # exponent that was reached; both end up in the output file name.
        keptseqs = '.'.join(( options.outputfilename,
                              str(validseqs[1]),
                              str(len(validseqs[0])),
                              'fasta' ))
        report(2, '    >>> Found ' + str(len(validseqs[0])) +
                  ' sequences with evalue <= 1e' +
                  str(validseqs[1]) + '\n')
        with open(keptseqs, 'w') as ff:
            validseqs[0].save(ff)

    sys.stderr.write( '\n' )
示例#2
0
def main():
    """Command-line entry point: blast a query and report its top hits.

    The query is either a GI (``-s``), whose sequence is first fetched with
    ``FastaCmdWrapper`` into ``<temp>/<gi>.fa``, or a fasta file (``-q``).
    The query is blasted with ``BlastAllWrapper`` into an XML file in the
    temp folder, the XML is parsed with ``PsiBlastXMLParser`` and extracted
    as ``evalue,header`` lines into an ``.index`` file in the same folder.

    The headers are grouped by evalue and the ``-n`` smallest evalues are
    kept.  When a filters file (``-f``) is given, only headers containing
    at least one of the filters are reported.  The result is written to the
    file given by ``-o`` (stdout by default) as a ``# <filters>`` line
    followed by one ``<evalue> : <header>`` line per reported hit.

    Exits through ``parser.error`` (status 2) when no option at all is
    given or when neither ``-s`` nor ``-q`` is provided, and with status 1
    when the results cannot be written.
    """

    parser = optparse.OptionParser()

    fetchgroup = optparse.OptionGroup(parser, 'Options to work with a GI')
    blastgroup = optparse.OptionGroup(parser, 'Blast related options')

    fetchgroup.add_option('-s', '--entry',
                          dest='gi_entry',
                          help='GI to check against the database',
                          metavar='GI')

    fetchgroup.add_option('-D', '--database_fetch',
                          dest='dbf',
                          help='location of the database that should be used for fetching the sequence from the gi provided.',
                          metavar='DB')

    blastgroup.add_option('-b', '--blast_flavour',
                          dest='blast_flavour',
                          help='what kind of blast should be performed ?',
                          metavar='BLAST')

    blastgroup.add_option('-d', '--database_check',
                          dest='dbc',
                          help='location of the database that should be used for checking.',
                          metavar='DB')

    blastgroup.add_option('-a', '--ncore',
                          dest='ncore',
                          type='int',
                          help='number of cores to use for the blast.',
                          metavar='INT')

    parser.add_option('-q', '--query',
                      dest='fasta_query',
                      help='query in fasta format',
                      metavar='FILE')

    parser.add_option('-o', '--output',
                      dest='outputfile',
                      help='name of the output file. default is stdout',
                      metavar='FILE')

    parser.add_option('-n', '--num_top_hits',
                      dest='num_top_hits',
                      type='int',
                      help='number of top hits to consider.',
                      metavar='INT')

    parser.add_option('-f', '--filters_file',
                      dest='filters_file',
                      help='location of the file containing filters.',
                      metavar='FILE')

    parser.add_option('-v', '--verbosity',
                      dest='verbosity', action='count',
                      help='set verbosity level')

    parser.add_option('-T', '--temp',
                      dest='temp',
                      help='temporary folder.',
                      metavar='DIR')

    parser.add_option_group(fetchgroup)
    parser.add_option_group(blastgroup)

    # verbosity defaults to 0 so that the level comparisons below never see
    # None when -v is not given.
    parser.set_defaults(temp = '/tmp/',
                        ncore = 1,
                        num_top_hits = 1,
                        outputfile = sys.stdout,
                        verbosity = 0)

    (options, args) = parser.parse_args()

    if len(sys.argv) == 1:
        parser.error('No options specified. check_with_blast.py --help for details.')
    # Every file name used below is derived from the query, so refuse to go
    # on without one instead of failing later on an undefined name.
    if not (options.gi_entry or options.fasta_query):
        parser.error('A query is required : give either a GI (-s) or a fasta file (-q).')

    log_level = logging.WARNING
    if options.verbosity == 1:
        log_level = logging.INFO
    elif options.verbosity >= 2:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level,
                        format='%(levelname)-6s:%(filename)s  %(message)s')

    if options.filters_file:
        includefilters = parsefilters(options.filters_file)
    else:
        includefilters = None
    logging.info('Filters : '+str(includefilters))

    fetcher = None
    if options.gi_entry:
        outputentryfa = os.path.join(options.temp, options.gi_entry + '.fa')
        outputblast = os.path.join(options.temp, options.gi_entry + '.xml')
        outputpf = os.path.join(options.temp, options.gi_entry + '.index')
        fetcher = FastaCmdWrapper([options.gi_entry], db=options.dbf,
                                  outfile=outputentryfa)
        blastqueryfile = outputentryfa
    else:
        querybase = os.path.basename(options.fasta_query)
        outputblast = os.path.join(options.temp, querybase + '.xml')
        outputpf = os.path.join(options.temp, querybase + '.index')
        blastqueryfile = options.fasta_query

    blaster = BlastAllWrapper(blastqueryfile, outputblast,
                              flavour=options.blast_flavour,
                              db=options.dbc, gis=True, ncore=options.ncore)

    if fetcher is not None:
        logging.info('Fetching the sequence from local database : '+fetcher.cline)
        fetcher.run()
    logging.info('Running blast : '+blaster.cline)
    blaster.run()

    with open(outputblast, 'r') as iff:
        logging.info('Parsing the xml output -> '+outputpf)
        xmlparser = PsiBlastXMLParser(iff)
        xmlparser.parse()
        xmlparser.extractData(fmt='evalue,header', outfile=outputpf)

    # Group the headers by evalue; the evalue is kept as the string read
    # from the index file.
    results = HeadEvalueDict()

    with open(outputpf, 'r') as iff:
        for line in iff:
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            evalue = fields[0]
            header = ' '.join(fields[1:])
            if evalue not in results:
                results[evalue] = [header]
            else:
                results[evalue].append(header)

    # Smallest evalues first, compared numerically.
    topindexes = sorted(results.keys(), key=float)

    topseqs = [(e, results[e]) for e in topindexes[:options.num_top_hits]]

    finaloutput = []

    for evalue, headers in topseqs:
        for header in headers:
            if options.filters_file:
                # Report a header once, however many filters it matches.
                if any(ikw in header for ikw in includefilters):
                    finaloutput.append((evalue, header))
            else:
                finaloutput.append((evalue, header))
    logging.debug(str(finaloutput))

    outlines = ['# '+str(includefilters)+'\n']
    outlines.extend(evalue+' : '+header+'\n' for evalue, header in finaloutput)
    try:
        if options.outputfile is sys.stdout:
            sys.stdout.writelines(outlines)
        else:
            with open(options.outputfile, 'w') as off:
                off.writelines(outlines)
    except (IOError, OSError) as err:
        logging.error('Could not write the results : %s', err)
        sys.exit(1)