# Example #1
# 0
def main():
    """Extract selenoproteins from one or more FASTA files.

    Command-line options:
        -i/--inputfile   comma-separated list of input file names (required).
        -o/--outputfile  file that receives the saved sequences; when it is
                         omitted the sequences are written to stdout.
        -v/--verbose     integer verbosity level (default 1).

    Every input file is loaded with ``FastaLib.loadSequences``, handed to
    ``findSelenoproteins`` and the result is written with
    ``FastaLib.saveSequences``. Progress messages are printed to stdout.

    Exits through ``sys.exit`` with an error message when no input file name
    is given.
    """
    parser = optparse.OptionParser()

    parser.add_option('-i', '--inputfile',
                      dest='inputfilename',
                      help='fasta file in which selenoproteins should be looked for.',
                      metavar='FILE')

    parser.add_option('-o', '--outputfile',
                      dest='outputfilename',
                      help='fasta file containing the selenoproteins',
                      metavar='FILE')

    # type='int' lets optparse reject a non-numeric level with a usage error
    # instead of an uncaught ValueError.
    parser.add_option('-v', '--verbose',
                      dest='verbosity',
                      type='int',
                      help='verbosity level : 0=none ; 1=standard ; 2=detailed ; 3=full',
                      metavar='INTEGER')

    parser.set_defaults(verbosity=1)

    (options, args) = parser.parse_args()
    verbosity = options.verbosity

    if not options.inputfilename:
        sys.exit('You must provide an input filename.')

    # Ignore empty entries produced by a stray or trailing comma.
    inputfilenames = [name for name in options.inputfilename.split(',') if name]

    # True when the sequences go to a real file, which leaves stdout free for
    # echoing them at the highest verbosity level.
    tofile = bool(options.outputfilename)

    infiles = []
    outfile = sys.stdout
    try:
        # All inputs are opened before the output file is created, so a
        # missing input aborts before any output is truncated.
        for name in inputfilenames:
            infiles.append(open(name, 'r'))

        if tofile:
            outfile = open(options.outputfilename, 'w')

        for f in infiles:
            if verbosity >= 1:
                print('')
                print('>>> Searching for selenoproteins in file ' + f.name)
                print('')

            if verbosity >= 2:
                print('>>> Loading sequences ...')

            sequences = FastaLib.loadSequences(f)

            if verbosity >= 2:
                print('>>> ... Done.')
                print('')
                print('>>> Searching for U containing sequences ...')

            selenoproteins = findSelenoproteins(sequences)

            if verbosity >= 2:
                print('>>> ... Done.')
                print('')

            FastaLib.saveSequences(selenoproteins, outfile)

            # Echo the hits only when they were not already written to stdout.
            if verbosity >= 3 and tofile:
                for selP in selenoproteins:
                    print(selP.header.strip())
                    print(selP.sequence.strip())

            if verbosity >= 1:
                print('')
                print('Found ' + str(len(selenoproteins)) + ' selenoproteins')
                print('')
    finally:
        for f in infiles:
            f.close()
        # Never close sys.stdout: it does not belong to this function.
        if outfile is not sys.stdout:
            outfile.close()
# Example #2
# 0
def _run_step(wrapper, infile, message, dryrun, verbosity):
    """Point *wrapper* at *infile*, then print its command line or run it.

    wrapper   -- object exposing ``infile``, ``cline`` and ``run()``.
    infile    -- file name assigned to ``wrapper.infile``.
    message   -- text written to stderr before running when verbosity >= 1.
    dryrun    -- when true, only ``wrapper.cline`` is printed.
    verbosity -- integer verbosity level.
    """
    # Short pause between consecutive steps, kept from the original pipeline.
    time.sleep(0.5)
    wrapper.infile = infile
    if dryrun:
        print(wrapper.cline)
        return
    if verbosity >= 1:
        sys.stderr.write(message)
    wrapper.run()


def main():
    """Run the selected alignment-preparation steps on a FASTA file.

    The input sequences are first rewritten without gaps into a temporary
    file. The steps enabled on the command line then run in this order, each
    one reading the output of the previous enabled step: filter, mafft,
    trimal, gap removal (only when both trimal and t_coffee are selected),
    t_coffee, add headers, prepare for selenoprofiles.

    With -Y/--dry the command line of each wrapper is printed instead of
    being executed. Unless -D/--debug is given, the temporary ungapped file
    is removed on exit.

    Exits with an optparse usage error when the input file or the output base
    name is missing, and with 'manual exit.' on KeyboardInterrupt.
    """
    parser = optparse.OptionParser()

    parser.add_option('-i', '--inputfile',
                      dest='inputfilename',
                      help='file containing the alignments that will be used to build the PSSM using prepare_alignment_selenoprofiles.py.',
                      metavar='FILE')

    parser.add_option('-r', '--datadir',
                      dest='datadir',
                      help='directory containing, for each familly FAM, a directory FAM.blast and a directory FAM.selenoprofiles.prep',
                      metavar='DIR')

    parser.add_option('-o', '--outputfile',
                      dest='outputfilename',
                      help='base name used for outputs',
                      metavar='NAME')

    parser.add_option('-a', '--n_core',
                      dest='ncore',
                      type='int',
                      help='number of cores to use during the various operations.',
                      metavar='INTEGER')

    parser.add_option('-M', '--mafft',
                      action='store_true', dest='domafft', default=False,
                      help='do the mafft step.')

    parser.add_option('-T', '--trimal',
                      action='store_true', dest='dotrimal', default=False,
                      help='do the trimal step.')

    parser.add_option('-C', '--tcoffee',
                      action='store_true', dest='dotcoffee', default=False,
                      help='do the t_coffee step.')

    parser.add_option('-B', '--headers',
                      action='store_true', dest='doheaders', default=False,
                      help='do the addheaders step.')

    # The pattern file is consumed by the add-headers step (-B); the help
    # text used to point at -D, which is the debug flag.
    parser.add_option('-p', '--patternfile',
                      dest='patternfile',
                      help='pattern file to use if the -B option is used.',
                      metavar='FILE')

    parser.add_option('-F', '--filter',
                      action='store_true', dest='dofilter', default=False,
                      help='do the filter step.')

    parser.add_option('-P', '--prepare',
                      action='store_true', dest='doprepal', default=False,
                      help='do the prepare_alignment_selenoprofiles step.')

    parser.add_option('-g', '--tag_threshold',
                      dest='tagthreshold',
                      type='float',
                      help='tag threshold to use if the -P or --prepare is used.',
                      metavar='FLOAT')

    parser.add_option('-A', '--all',
                      action='store_true', dest='doall', default=False,
                      help='do all steps.')

    parser.add_option('-Y', '--dry',
                      action='store_true', dest='dryrun', default=False,
                      help="Prints the commands without executing them.")

    parser.add_option('-D', '--debug',
                      action='store_true', dest='debug', default=False,
                      help="Debug mode. Nothing is cleaned.")

    parser.add_option('-t', '--temp',
                      dest='temp',
                      help='set the temp folder to use.',
                      metavar='FOLDER')

    parser.add_option('-v', '--verbose',
                      dest='verbosity',
                      type='int',
                      help='verbosity level : 0=none ; 1=standard ; 2=detailed ; 3=full',
                      metavar='INTEGER')

    parser.set_defaults(verbosity=1,
                        ncore=1,
                        tagthreshold=0.5,
                        temp='/tmp/',
                        patternfile='None')

    (options, args) = parser.parse_args()

    # Both names are used unconditionally below; fail early with a usage
    # message rather than a TypeError once work has started.
    if not options.inputfilename:
        parser.error('an input file is required (-i/--inputfile).')
    if not options.outputfilename:
        parser.error('an output base name is required (-o/--outputfile).')

    if options.doall:
        # NOTE(review): --all does not enable the --prepare step; kept as in
        # the original - confirm this is intended.
        options.doheaders = True
        options.dofilter = True
        options.domafft = True
        options.dotrimal = True
        options.dotcoffee = True

    ncore = options.ncore
    verbosity = options.verbosity
    temp = options.temp
    dryrun = options.dryrun
    patternfile = options.patternfile

    basename = options.outputfilename
    mafftoutfile = basename + '_mafft.fasta'
    trimaloutfile1 = basename + '_trimmed_native.fasta'
    trimaloutfile2 = basename + '_trimmed_spadded.fasta'
    tcoffeeoutfile = basename + '_tcoffee.fasta'
    fullheadoutfile = basename + '.det.fasta'
    filteroutfile = basename + '.filt.fasta'

    # trimal (and the re-adding of sequences it dropped) only happens when
    # the mafft output holds more sequences than this.
    trimal_min_seqs = 200

    tmpinitfilename = genTempfilename(options.temp, 'ungapped_')

    try:
        # Write an ungapped copy of the input; it feeds the first step.
        with open(options.inputfilename, 'r') as iff:
            tmpseqs = Fasta.loadSequences(iff)
        with open(tmpinitfilename, 'w') as ugf:
            for seq in tmpseqs:
                removeGaps(seq).prints(ugf)
        tmpinfile = tmpinitfilename

        addheaders = UtilityWrappers.AddFullHeadersWrapper2(tmpinfile,
                                                            fullheadoutfile,
                                                            patternfile)

        filterseqs = UtilityWrappers.FilterWrapper(tmpinfile,
                                                   filteroutfile,
                                                   inverse=True,
                                                   titlematch=('PREDICTED', 'predicted', 'hypothetical'))

        mafft = UtilityWrappers.MafftWrapper(tmpinfile,
                                             mafftoutfile,
                                             auto=True)

        trimal = UtilityWrappers.TrimalWrapper(tmpinfile,
                                               trimaloutfile1,
                                               clusters=100)

        tcoffee = UtilityWrappers.TcoffeeWrapper(trimaloutfile2,
                                                 tcoffeeoutfile,
                                                 ncore=ncore)

        prepsp = UtilityWrappers.SelenoprofilesPreWrapper(tmpinfile,
                                                          options.outputfilename,
                                                          all=True,
                                                          tagthreshold=options.tagthreshold,
                                                          temp=temp)

        if dryrun:
            print('\nThis is a dry run. Relaunch the command without the option -Y to do the actual stuff.\n')

        ## Filter out the 'fake' proteins
        if options.dofilter:
            _run_step(filterseqs, tmpinfile,
                      '\n    >>> Filtering out\n\n', dryrun, verbosity)
            tmpinfile = filteroutfile

        ## run mafft
        # Stays 0 unless mafft actually ran in this invocation (so never in
        # a dry run), in which case the trimal step below is skipped.
        numseqinmafftoutput = 0
        if options.domafft:
            _run_step(mafft, tmpinfile,
                      '\n    >>> Running Mafft\n\n', dryrun, verbosity)
            tmpinfile = mafftoutfile
            if not dryrun:
                with open(mafftoutfile, 'r') as mfo:
                    numseqinmafftoutput = len(Fasta.loadSequences(mfo))

        ## run trimal
        trimalran = options.dotrimal and numseqinmafftoutput > trimal_min_seqs
        if trimalran:
            _run_step(trimal, tmpinfile,
                      '\n    >>> Running Trimal\n\n', dryrun, verbosity)
            tmpinfile = trimaloutfile1

        ## remove gaps before t_coffee
        if not dryrun and options.dotcoffee and options.dotrimal:
            if verbosity >= 1:
                sys.stderr.write('\n    >>> Removing gaps\n\n')

            with open(tmpinfile, 'r') as ti:
                si = Fasta.loadSequences(ti)
            tmpinfile = trimaloutfile2

            ## saves the sequences with no gaps
            refs = Fasta.SequenceList()
            for s in si:
                refs.append(removeGaps(s))

            with open(tmpinfile, 'w') as to:
                Fasta.saveSequences(refs, to)

                if trimalran:
                    if verbosity >= 1:
                        sys.stderr.write('\n    >>> Adding ommited selenoproteins\n')
                    ## Gather the non intersecting proteins from the 2 files
                    diffSelenoproteins = spDiff(mafftoutfile,
                                                trimaloutfile1)

                    ## remove gaps from selenoproteins
                    spDiffr = Fasta.SequenceList()
                    for s in diffSelenoproteins:
                        spDiffr.append(removeGaps(s))
                    ## append to the file the selenoproteins that were not present
                    Fasta.saveSequences(spDiffr, to)

        ## run t_coffee
        if options.dotcoffee:
            _run_step(tcoffee, tmpinfile,
                      '\n    >>> Running T_coffee\n\n', dryrun, verbosity)
            tmpinfile = tcoffeeoutfile

        ## Add full headers
        if options.doheaders:
            _run_step(addheaders, tmpinfile,
                      '\n    >>> Adding headers\n\n', dryrun, verbosity)
            tmpinfile = fullheadoutfile

        ## prepare alignments for selenoprofiles
        if options.doprepal:
            _run_step(prepsp, tmpinfile,
                      '\n    >>> preparing for selenoprofiles\n\n', dryrun, verbosity)

    except KeyboardInterrupt:
        sys.exit('manual exit.')
    finally:
        # The temp file may not exist if reading the input failed; do not let
        # the cleanup hide the original error.
        if not options.debug and os.path.exists(tmpinitfilename):
            if verbosity >= 2:
                sys.stderr.write('\n    >>> Removing temporary file ' + tmpinitfilename +'\n\n')
            os.remove(tmpinitfilename)