Example No. 1
def Assemble(input,output):
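   # Run the assembler selected by the .run input file on the preprocessed reads
   # and leave its contigs at <rundir>/Assemble/out/<prefix>.asm.contig.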
   # turn off fail fast option
   setFailFast(False)

   originalPrefix = _settings.PREFIX
   originalKmer = _settings.kmer
   asmPrefix = output.replace("%s/Assemble/out/"%(_settings.rundir), "")
   asmPrefix = asmPrefix.replace(".asm.contig", "")
   asmName = input.replace("%s/Assemble/out/"%(_settings.rundir), "")
   asmName = asmName.replace(".run", "")
   isContig = os.path.exists("%s/Preprocess/out/%s.asm.contig"%(_settings.rundir, asmName))

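   # an assembler name of the form "<name>.<k>" carries a k-mer override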
   if (len(asmName.split(".")) > 1) and not isContig:
      (asmName, kmer) = asmName.split(".")
      _settings.kmer = int(kmer)
   _settings.PREFIX = asmPrefix

   mated = False
   for lib in _readlibs:
      if lib.mated:
         mated = True
         break

   if "Assemble" in _skipsteps or "assemble" in _skipsteps:
      run_process(_settings, "touch %s/Logs/assemble.skip"%(_settings.rundir), "Assemble")
      return 0

   if os.path.exists("%s/Preprocess/out/%s.asm.contig"%(_settings.rundir, asmName)):
      # we had contigs input
      run_process(_settings, "unlink %s/Assemble/out/%s.asm.contig"%(_settings.rundir, asmName), "Assemble")
      run_process(_settings, "ln %s/Preprocess/out/%s.asm.contig %s/Assemble/out/%s.asm.contig"%(_settings.rundir, asmName, _settings.rundir, asmName), "Assemble")
   #pick assembler
   elif asmName == "none" or asmName is None:
      pass
   elif asmName == "soapdenovo" or asmName == "soapdenovo2":
      #open & update config
      soapf = open("%s/config.txt"%(_settings.rundir),'r')
      soapd = soapf.read()
      soapf.close()

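      # fill the LIB<id>Q1REPLACE / LIB<id>Q2REPLACE placeholders in the config
      # template with the per-library read paths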
      for lib in _readlibs:
         if (lib.format == "fastq" or lib.format == "fasta")  and lib.mated and not lib.interleaved:
             soapd = soapd.replace("LIB%dQ1REPLACE"%(lib.id),"%s/Preprocess/out/lib%d.1.fastq"%(_settings.rundir,lib.id))
             soapd = soapd.replace("LIB%dQ2REPLACE"%(lib.id),"%s/Preprocess/out/lib%d.2.fastq"%(_settings.rundir,lib.id))
         elif lib.format == "fastq"  and lib.mated and lib.interleaved:
              # interleaved mates are NOT supported by SOAPdenovo, so split the file
              # into separate /1 and /2 files (the second file is written as <f1>.f2)
             run_process(_settings, "perl %s/perl/split_fastq.pl %s/Preprocess/out/%s %s/Assemble/in/%s %s/Assemble/in/%s.f2"%(_settings.METAMOS_UTILS,_settings.rundir,lib.f1.fname,_settings.rundir,lib.f1.fname,_settings.rundir,lib.f1.fname),"Assemble")
             soapd = soapd.replace("LIB%dQ1REPLACE"%(lib.id),"%s/Assemble/in/%s"%(_settings.rundir,lib.f1.fname))
             soapd = soapd.replace("LIB%dQ2REPLACE"%(lib.id),"%s/Assemble/in/%s"%(_settings.rundir,lib.f1.fname+".f2"))
         elif lib.format == "fasta"  and lib.mated:
             soapd = soapd.replace("LIB%dQ1REPLACE"%(lib.id),"%s/Preprocess/out/lib%d.1.fastq"%(_settings.rundir,lib.id))
             soapd = soapd.replace("LIB%dQ2REPLACE"%(lib.id),"%s/Preprocess/out/lib%d.2.fastq"%(_settings.rundir,lib.id))
         else:
             soapd = soapd.replace("LIB%dQ1REPLACE"%(lib.id),"%s/Preprocess/out/lib%d.fastq"%(_settings.rundir,lib.id))

      soapw = open("%s/soapconfig.txt"%(_settings.rundir),'w')
      soapw.write(soapd)
      soapw.close()

      specName = "soap.spec"
      configName = "%s/soapconfig.txt"%(_settings.rundir)
      run_process(_settings, "cat %s |grep -v max_rd_len > %s/soap2config.txt"%(configName, _settings.rundir), "Assemble")
      binPath = _settings.SOAPDENOVO
      if asmName == "soapdenovo2":
         configName = "%s/soap2config.txt"%(_settings.rundir)
         binPath = _settings.SOAPDENOVO2
         specName = "soap2.spec"

      if not os.path.exists(binPath + os.sep + "SOAPdenovo-63mer"):
         print "Error: %s not found in %s. Please check your path and try again.\n"%(asmName.title(), binPath)
         run_process(_settings, "touch %s/Assemble/out/%s.asm.contig"%(_settings.rundir, _settings.PREFIX), "Assemble")
         _settings.kmer = originalKmer
         _settings.PREFIX = originalPrefix
         setFailFast(True)
         return

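      # stage-specific SOAPdenovo options (pregraph/contig/map/scaff) come from the spec file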
      soapOptions = getProgramParams(_settings.METAMOS_UTILS, specName, "pregraph", "-") 
      soapContigOptions = getProgramParams(_settings.METAMOS_UTILS, specName, "contig", "-")
      soapMapOptions = getProgramParams(_settings.METAMOS_UTILS, specName, "map", "-")
      soapScaffOptions = getProgramParams(_settings.METAMOS_UTILS, specName, "scaff", "-") 

      # pick the SOAPdenovo binary that supports the requested k-mer length
      soapEXE="SOAPdenovo-63mer"
      if _settings.kmer > 63:
         soapEXE="SOAPdenovo-127mer"

      run_process(_settings, "%s/%s pregraph -p %d -K %d %s -s %s -o %s/Assemble/out/%s.asm"%(binPath, soapEXE, _settings.threads, _settings.kmer, soapOptions, configName,_settings.rundir,_settings.PREFIX),"Assemble")#SOAPdenovo config.txt
      run_process(_settings, "%s/%s contig -g %s/Assemble/out/%s.asm %s"%(binPath, soapEXE, _settings.rundir,_settings.PREFIX, soapContigOptions),"Assemble")#SOAPdenovo config.txt

      if _settings.doscaffolding and mated:
         run_process(_settings, "%s/%s map -g %s/Assemble/out/%s.asm -p %d %s -s %s"%(binPath, soapEXE, _settings.rundir,_settings.PREFIX, _settings.threads, soapMapOptions, configName),"Assemble")#SOAPdenovo config.txt
         run_process(_settings, "%s/%s scaff -g %s/Assemble/out/%s.asm -p %d %s"%(binPath, soapEXE, _settings.rundir,_settings.PREFIX, _settings.threads, soapScaffOptions),"Assemble")#SOAPdenovo config.txt

         if os.path.exists("%s/Assemble/out/%s.asm.scafSeq"%(_settings.rundir, _settings.PREFIX)):
            if os.path.exists("%s/GapCloser"%(binPath)):
               run_process(_settings, "%s/GapCloser -b %s -o %s/Assemble/out/%s.linearize.scaffolds.final -a %s/Assemble/out/%s.asm.scafSeq -t %d"%(binPath, configName, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.threads), "Assemble")
            else:
               run_process(_settings, "ln %s/Assemble/out/%s.asm.scafSeq %s/Assemble/out/%s.linearize.scaffolds.final"%(_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX), "Assemble")
            run_process(_settings, "java -cp %s SplitFastaByLetter %s/Assemble/out/%s.linearize.scaffolds.final NNN > %s/Assemble/out/%s.asm.contig"%(_settings.METAMOS_JAVA, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX), "Assemble")

   elif asmName == "metaidba":
      bowtie_mapping = 1
      for lib in _readlibs:
         if lib.format != "fasta"  or (lib.mated and not lib.interleaved):
             print "Warning: meta-IDBA requires reads to be in (interleaved) fasta format, converting library"
         #apparently connect = scaffold? need to convert fastq to interleaved fasta to run, one lib per run??
         #print "%s/metaidba --read %s/Preprocess/out/lib%d.fasta --output  %s/Assemble/out/%s.asm --mink 21 --maxk %d --cover 1 --connect"%(_settings.METAIDBA,_settings.rundir,lib.id,_settings.rundir,_settings.PREFIX,_settings.kmer)

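      # note: meta-IDBA is run once on a single interleaved fasta library; the
      # lib variable left over from the loop above (the last library) is used here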
      metaidbaOptions = getProgramParams(_settings.METAMOS_UTILS, "metaidba.spec", "", "--")
      run_process(_settings, "%s/metaidba --read %s/Preprocess/out/lib%d.fasta --output  %s/Assemble/out/%s.asm %s --maxk %d"%(_settings.METAIDBA,_settings.rundir,lib.id,_settings.rundir,_settings.PREFIX,metaidbaOptions,_settings.kmer),"Assemble")
      run_process(_settings, "mv %s/Assemble/out/%s.asm-contig.fa %s/Assemble/out/%s.asm.contig"%(_settings.rundir,_settings.PREFIX,_settings.rundir,_settings.PREFIX),"Assemble")

   elif asmName == "newbler":
      if not os.path.exists(_settings.NEWBLER + os.sep + "newAssembly"):
         print "Error: Newbler not found in %s. Please check your path and try again.\n"%(_settings.NEWBLER)
         run_process(_settings, "touch %s/Assemble/out/%s.asm.contig"%(_settings.rundir, _settings.PREFIX), "Assemble")
         _settings.kmer = originalKmer
         _settings.PREFIX = originalPrefix
         setFailFast(True)
         return

      run_process(_settings, "%s/newAssembly -force %s/Assemble/out"%(_settings.NEWBLER, _settings.rundir),"Assemble")

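      # detect the installed Newbler version; FASTQ input requires 2.6 or newer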
      NEWBLER_VERSION = 0.0
      p = subprocess.Popen("%s/newAssembly --version | head -n 1"%(_settings.NEWBLER), shell=True, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 
      (checkStdout, checkStderr) = p.communicate()
      if checkStderr != "":
        print "Warning: Cannot determine Newbler version"
      else:
         mymatch = re.findall('\d+\.\d+', checkStdout.strip())
         if (len(mymatch) == 1 and mymatch[0] != None):
            NEWBLER_VERSION = float(mymatch[0])

      for lib in _readlibs:
         if lib.format == "fasta":
             run_process(_settings, "%s/addRun %s/Assemble/out %s/Preprocess/out/lib%d.seq"%(_settings.NEWBLER, _settings.rundir, _settings.rundir,lib.id),"Assemble")
         elif lib.format == "sff":
             run_process(_settings, "%s/addRun %s %s/Assemble/out %s/Preprocess/out/lib%d.sff"%(_settings.NEWBLER, ("-p" if lib.mated else ""), _settings.rundir, _settings.rundir, lib.id), "Assemble")
         elif lib.format == "fastq":
             if (NEWBLER_VERSION < 2.6):
                 print "Error: FASTQ input is only supported by Newbler 2.6+. You are using version %s."%(NEWBLER_VERSION)
                 run_process(_settings, "touch %s/Assemble/out/%s.asm.contig"%(_settings.rundir, _settings.PREFIX), "Assemble")
                 _settings.kmer = originalKmer
                 _settings.PREFIX = originalPrefix
                 setFailFast(True)
                 return
             run_process(_settings, "%s/addRun %s/Assemble/out %s/Preprocess/out/lib%d.fastq"%(_settings.NEWBLER, _settings.rundir, _settings.rundir, lib.id),"Assemble")

      newblerCmd = "%s%srunProject "%(_settings.NEWBLER, os.sep)
      # read spec file to input to newbler parameters
      newblerCmd += getProgramParams(_settings.METAMOS_UTILS, "newbler.spec", "", "-")
      run_process(_settings, "%s -cpu %d %s/Assemble/out"%(newblerCmd,_settings.threads,_settings.rundir),"Assemble")

      # unlike other assemblers, we can only get the preprocess info for newbler after assembly (since it has to split sff files by mates)
      extractNewblerReads()

      # convert to AMOS
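      # trim the suffix after '.' from read names (second field) in the 454 ACE file
      # before converting to AMOS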
      run_process(_settings, "cat %s/Assemble/out/assembly/454Contigs.ace |awk '{if (match($2, \"\\\\.\")) {STR= $1\" \"substr($2, 1, index($2, \".\")-1); for (i = 3; i <=NF; i++) STR= STR\" \"$i; print STR} else { print $0} }' > %s/Assemble/out/%s.ace"%(_settings.rundir, _settings.rundir,_settings.PREFIX), "Assemble") 
      run_process(_settings, "%s/toAmos -o %s/Assemble/out/%s.mates.afg -m %s/Preprocess/out/all.seq.mates -ace %s/Assemble/out/%s.ace"%(_settings.AMOS,_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.rundir, _settings.PREFIX),"Assemble")
      # get info on EID/IIDs for contigs
      run_process(_settings, "cat %s/Assemble/out/%s.mates.afg | grep -A 3 \"{CTG\" |awk '{if (match($1, \"iid\") != 0) {IID = $1} else if (match($1, \"eid\") != 0) {print $1\" \"IID; } }'|sed s/eid://g |sed s/iid://g > %s/Assemble/out/454eidToIID"%(_settings.rundir, _settings.PREFIX, _settings.rundir),"Assemble")
      run_process(_settings, "java -cp %s convert454GraphToCTL %s/Assemble/out/454eidToIID %s/Assemble/out/assembly/454ContigGraph.txt > %s/Assemble/out/%s.graph.cte"%(_settings.METAMOS_JAVA, _settings.rundir, _settings.rundir, _settings.rundir, _settings.PREFIX),"Assemble")
      run_process(_settings, "cat %s/Assemble/out/%s.mates.afg %s/Assemble/out/%s.graph.cte > %s/Assemble/out/%s.afg"%(_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX),"Assemble")
    
      # make symlink for subsequent steps
      run_process(_settings, "rm %s/Assemble/out/%s.asm.contig"%(_settings.rundir, _settings.PREFIX),"Assemble")
      run_process(_settings, "ln %s/Assemble/out/assembly/454AllContigs.fna %s/Assemble/out/%s.asm.contig"%(_settings.rundir, _settings.rundir, _settings.PREFIX),"Assemble")
      if _settings.doscaffolding and mated == True:
          run_process(_settings, "ln %s/Assemble/out/assembly/454Scaffolds.fna %s/Assemble/out/%s.linearize.scaffolds.final"%(_settings.rundir, _settings.rundir, _settings.PREFIX),"Assemble")

   elif asmName == "amos":
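      # plain AMOS pipeline: load reads into a bank, then overlap (hash-overlap),
      # unitig (tigger), consensus, and dump the contigs as fasta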
      run_process(_settings, "rm -rf %s/Assemble/in/%s.bnk"%(_settings.rundir, _settings.PREFIX), "Assemble")
      for lib in _readlibs:
         if lib.format == "fasta":
            run_process(_settings, "%s/toAmos_new -s %s/Preprocess/out/lib%d.seq -b %s/Assemble/in/%s.bnk "%(_settings.AMOS,_settings.rundir,lib.id,_settings.rundir, _settings.PREFIX),"Assemble")
         elif lib.format == "fastq":
            run_process(_settings, "%s/toAmos_new -Q %s/Preprocess/out/lib%d.seq -i --libname lib%d --min %d --max %d -b %s/Assemble/in/%s.bnk "%(_settings.AMOS,_settings.rundir,lib.id,lib.id,lib.mean,lib.stdev,_settings.rundir,_settings.PREFIX),"Assemble")
      run_process(_settings, "%s/hash-overlap -B %s/Assemble/in/%s.bnk"%(_settings.AMOS, _settings.rundir, _settings.PREFIX), "Assemble")
      run_process(_settings, "%s/tigger -b %s/Assemble/in/%s.bnk"%(_settings.AMOS, _settings.rundir, _settings.PREFIX), "Assemble")
      run_process(_settings, "%s/make-consensus -B -b %s/Assemble/in/%s.bnk"%(_settings.AMOS, _settings.rundir, _settings.PREFIX), "Assemble")
      run_process(_settings, "%s/bank2fasta -b %s/Assemble/in/%s.bnk > %s.asm.contig"%(_settings.AMOS, _settings.rundir, _settings.PREFIX, _settings.PREFIX), "Assemble")
   elif asmName.lower() == "ca":
      #runCA script
      frglist = ""
      matedString = ""
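      # build (or reuse) a Celera Assembler .frg file per library before calling runCA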
      for lib in _readlibs:
         if not os.path.exists("%s/Preprocess/out/lib%d.frg"%(_settings.rundir, lib.id)):
            if lib.format == "fastq":
               if lib.mated:
                  matedString = "-insertsize %d %d -%s -mates"%(lib.mean, lib.stdev, "innie" if lib.innie else "outtie") 
               else:
                  matedString = "-reads"
               run_process(_settings, "%s/fastqToCA -libraryname %s -technology illumina-long %s %s/Preprocess/out/lib%d.seq > %s/Preprocess/out/lib%d.frg"%(_settings.CA, lib.sid, matedString, _settings.rundir, lib.id, _settings.rundir, lib.id),"Assemble")
            elif lib.format == "fasta":
               if lib.mated:
                  matedString = "-mean %d -stddev %d -m %s/Preprocess/out/lib%d.seq.mates"%(lib.mean, lib.stdev, _settings.rundir, lib.id)
               run_process(_settings, "%s/convert-fasta-to-v2.pl -l %s %s -s %s/Preprocess/out/lib%d.seq -q %s/Preprocess/out/lib%d.seq.qual > %s/Preprocess/out/lib%d.frg"%(_settings.CA, lib.sid, matedString, _settings.rundir, lib.id, _settings.rundir, lib.id, _settings.rundir, lib.id),"Assemble")
         frglist += "%s/Preprocess/out/lib%d.frg "%(_settings.rundir, lib.id)

      specFile="%s/config/asm.spec"%(_settings.METAMOS_UTILS)
      if os.path.exists("%s/Assemble/out/asm.spec"%(_settings.rundir)):
         specFile="%s/Assemble/out/asm.spec"%(_settings.rundir)
      run_process(_settings, "%s/runCA -p %s -d %s/Assemble/out/ -s %s %s %s"%(_settings.CA,_settings.PREFIX,_settings.rundir,specFile,"" if _settings.doscaffolding else "stopAfter=utgcns", frglist),"Assemble")
      #convert CA to AMOS
      run_process(_settings, "%s/gatekeeper -dumpfrg -allreads %s.gkpStore > %s.frg"%(_settings.CA, _settings.PREFIX, _settings.PREFIX),"Assemble")
      if _settings.doscaffolding: 
         run_process(_settings, "ln 9-terminator/%s.ctg.fasta %s.asm.contig"%(_settings.PREFIX, _settings.PREFIX), "Assemble")
         run_process(_settings, "ln 9-terminator/%s.scf.fasta %s.linearize.scaffolds.final"%(_settings.PREFIX, _settings.PREFIX), "Assemble")
      else:
         run_process(_settings, "%s/terminator -g %s.gkpStore -t %s.tigStore/ 2 -o %s"%(_settings.CA, _settings.PREFIX, _settings.PREFIX, _settings.PREFIX),"Assemble")
         run_process(_settings, "%s/asmOutputFasta -p %s < %s.asm"%(_settings.CA, _settings.PREFIX, _settings.PREFIX), "Assemble")
         run_process(_settings, "ln %s.utg.fasta %s.asm.contig"%(_settings.PREFIX, _settings.PREFIX), "Assemble")
   elif asmName == "velvet":
      runVelvet(_settings.VELVET, "velvet")
   elif asmName == "velvet-sc":
      runVelvet(_settings.VELVET_SC, "velvet-sc")
   elif asmName == "metavelvet":
      runMetaVelvet(_settings.VELVET, _settings.METAVELVET, "metavelvet")
   elif asmName.lower() == "sparseassembler":
      runSparseAssembler(_settings.SPARSEASSEMBLER, "SparseAssembler")
   elif generic.checkIfExists(STEP_NAMES.ASSEMBLE, asmName.lower()):
       generic.execute(STEP_NAMES.ASSEMBLE, asmName.lower(), _settings)
   else:  
      print "Error: %s is not a recognized assembler; no assembly will be run."%(asmName)

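   # make sure the expected contig file exists even if the assembler produced nothing,
   # then restore the global prefix/k-mer settings and re-enable fail-fast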
   if not os.path.exists("%s/Assemble/out/%s.asm.contig"%(_settings.rundir, _settings.PREFIX)):
      run_process(_settings, "touch %s/Assemble/out/%s.asm.contig"%(_settings.rundir, _settings.PREFIX), "Assemble")
   _settings.kmer = originalKmer
   _settings.PREFIX = originalPrefix
   setFailFast(True)
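
Note on the SOAPdenovo branch above: the per-library read paths are injected by plain string replacement of LIB<id>Q1REPLACE / LIB<id>Q2REPLACE placeholders in a pre-generated config template. Below is a minimal, self-contained sketch of that pattern; the template text and the Lib class are invented for illustration and are not MetAMOS code.

# Illustrative sketch only -- a toy version of the placeholder substitution used in
# the soapdenovo branch of Assemble(); the template and Lib class are made up.
class Lib:
   def __init__(self, id, q1, q2):
      self.id, self.q1, self.q2 = id, q1, q2

template = """max_rd_len=150
[LIB]
avg_ins=300
q1=LIB1Q1REPLACE
q2=LIB1Q2REPLACE
"""

def fill_config(template, libs):
   # replace each library's placeholders with its read-file paths
   for lib in libs:
      template = template.replace("LIB%dQ1REPLACE"%(lib.id), lib.q1)
      template = template.replace("LIB%dQ2REPLACE"%(lib.id), lib.q2)
   return template

if __name__ == "__main__":
   libs = [Lib(1, "Preprocess/out/lib1.1.fastq", "Preprocess/out/lib1.2.fastq")]
   print fill_config(template, libs)
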
Example No. 2
def Classify(input,output):
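   # Classify the assembled contigs (and optionally the unaligned reads) with the
   # selected classifier, merge per-chunk results into <prefix>.hits, and build
   # Krona input plus per-read annotations.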
   setFailFast(False)
   if "Classify" in _skipsteps or _cls is None:
      run_process(_settings, "touch %s/Logs/annotate.skip"%(_settings.rundir), "Classify")
      run_process(_settings, "touch %s/Classify/out/%s.hits"%(_settings.rundir, _settings.PREFIX), "Classify")
      run_process(_settings, "touch %s/Classify/out/%s.annots"%(_settings.rundir, _settings.PREFIX), "Classify")
      return 0

   listOfFiles = "%s/Classify/in/%s.asm.contig"%(_settings.rundir, _settings.PREFIX)

   # clean up any existing files
   run_process(_settings, "touch %s/Classify/out/%s.annots"%(_settings.rundir, _settings.PREFIX), "Classify")
   run_process(_settings, "unlink %s/Classify/in/%s.asm.contig"%(_settings.rundir, _settings.PREFIX), "Classify")
   run_process(_settings, "ln %s/Assemble/out/%s.asm.contig %s/Classify/in/%s.asm.contig"%(_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX), "Classify")
   run_process(_settings, "unlink %s/Classify/out/%s.hits"%(_settings.rundir, _settings.PREFIX), "Classify")
   run_process(_settings, "rm -f %s/Classify/out/*.hits"%(_settings.rundir), "Classify")
   run_process(_settings, "rm -f %s/Classify/out/*.epsilon-nb_results.txt"%(_settings.rundir), "Classify")
   run_process(_settings, "rm -f %s/Classify/out/*.phymm.out"%(_settings.rundir), "Classify")

   pool = Pool(processes=_settings.threads)
   tasks = []

   if "fcp" in _cls or "phymm" in _cls:
      # hack to use gridX
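      # grid mode: split the contigs and unaligned reads into roughly 200 pieces,
      # write a one-chunk-per-task shell wrapper, and merge the PhymmBL results afterwards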
      if _USE_GRID:
         size = sizeFastaFile("%s/Classify/in/%s.asm.contig"%(_settings.rundir, _settings.PREFIX))
         perThread = ceil(float(size) / 200)
         #print "The size of the contigs is %d per thread %d\n"%(size, perThread)
         #run_process(_settings, "python %s/python/splitfasta.py %s/Classify/in/%s.asm.contig %d %s/Classify/in/%s %d"%(_settings.METAMOS_UTILS, _settings.rundir, _settings.PREFIX, perThread, _settings.rundir, _settings.PREFIX, 1), "Classify")
         splitfasta("%s/Classify/in/%s.asm.contig"%(_settings.rundir,_settings.PREFIX),"%d"%(perThread),"%s/Classify/in/%s"%(_settings.rundir,_settings.PREFIX),"%d"%(1))
         totalJobs = 0
         for partFile in os.listdir("%s/Classify/in/"%(_settings.rundir)):
            if "_part" in partFile and "%s_part"%(_settings.PREFIX) in partFile:
               print "A file I have to process is %s\n"%(partFile)
               totalJobs += 1

         for lib in _readlibs:
            listOfFiles += ":%s/Assemble/out/lib%d.unaligned.fasta"%(_settings.rundir, lib.id)
            run_process(_settings, "ln %s/Assemble/out/lib%d.unaligned.fasta %s/Classify/in/lib%d.unaligned.fasta"%(_settings.rundir, lib.id, _settings.rundir, lib.id), "Classify")
            size = sizeFastaFile("%s/Classify/in/lib%d.unaligned.fasta"%(_settings.rundir, lib.id))
            perThread = ceil(float(size) / 200)
            #print "The size of the lib %d is %d per one %d\n"%(lib.id, size, perThread)
            #run_process(_settings, "python %s/python/splitfasta.py %s/Classify/in/lib%d.unaligned.fasta %d %s/Classify/in/%s %d"%(_settings.METAMOS_UTILS, _settings.rundir, lib.id, perThread, _settings.rundir, _settings.PREFIX, totalJobs+1), "Classify")
            #splitfasta("%s/Classify/in/%s.asm.contig,%d,%s/Classify/in/%s,%d"%(_settings.rundir,_settings.PREFIX,perThread,_settings.rundir,_settings.PREFIX,totalJobs+1))
            splitfasta("%s/Classify/in/lib%d.unaligned.fasta"%(_settings.rundir,lib.id),"%d"%(perThread),"%s/Classify/in/%s"%(_settings.rundir,_settings.PREFIX),"%d"%(totalJobs+1))

         totalJobs = 0
         for partFile in os.listdir("%s/Classify/in/"%(_settings.rundir)):
            if "_part" in partFile and "%s_part"%(_settings.PREFIX) in partFile:
               #print "A file I have to process is %s\n"%(partFile)
               totalJobs += 1

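         # shell wrapper: each grid task (or a manually supplied index) scores one
         # <prefix>_part<N>.fa chunk with PhymmBL and leaves a .success marker when done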
         cmdfile = open("%s/Classify/out/runAnnot.sh"%(_settings.rundir), "w")
         cmdfile.write("#!/bin/sh\n")
         cmdfile.write("\n")
         cmdfile.write("jobid=$GRID_TASK\n")
         cmdfile.write("if [ x$jobid = x -o x$jobid = xundefined -o x$jobid = x0 ]; then\n")
         cmdfile.write("   jobid=$1\n")
         cmdfile.write("fi\n")
         cmdfile.write("if test x$jobid = x; then\n")
         cmdfile.write("  echo Error: I need a job index on the command line\n")
         cmdfile.write("  exit 1\n")
         cmdfile.write("fi\n")
         cmdfile.write("if [ $jobid -gt %d ]; then\n"%(totalJobs))
         cmdfile.write("   echo Job id $jobid is out of range %d\n"%(totalJobs))
         cmdfile.write("   exit 0\n")
         cmdfile.write("fi\n")
         cmdfile.write("if test -e %s/Classify/out/$jobid.success ; then\n"%(_settings.rundir))
         cmdfile.write("   echo Job previously completed successfully.\n")
         cmdfile.write("else\n")
         cmdfile.write("ln -s %s/.blastData\n"%(_settings.PHYMM))
         cmdfile.write("ln -s %s/.genomeData\n"%(_settings.PHYMM))
         cmdfile.write("ln -s %s/.scripts\n"%(_settings.PHYMM))
         cmdfile.write("ln -s %s/.taxonomyData\n"%(_settings.PHYMM))
         cmdfile.write("mkdir .logs\n")
         cmdfile.write("perl %s/scoreReads.pl %s/Classify/in/%s_part$jobid.fa"%(_settings.PHYMM,_settings.rundir,_settings.PREFIX))
         cmdfile.write(" && touch %s/Classify/out/$jobid.success\n"%(_settings.rundir))
         cmdfile.write("fi\n")
         cmdfile.close()
         run_process(_settings, "chmod u+x %s/Classify/out/runAnnot.sh"%(_settings.rundir), "Classify")

         #run_process(_settings, "gridx -p %d -r %d -T -c %s/Classify/out/runAnnot.sh"%(min(totalJobs+1, 200), totalJobs+1, _settings.rundir), "Classify")
         run_process(_settings, "cat %s/Classify/out/gridx-ibissub00*/wrk_*/results.03.phymmBL_%s_Annotate_in_*%s* | grep -v \"QUERY_ID\" > %s/Classify/out/%s.phymm.out"%(_settings.rundir, _settings.rundir.replace(os.sep, "_").replace(".", "_"), _settings.PREFIX, _settings.rundir, _settings.PREFIX))
         run_process(_settings, "ln %s/Classify/out/%s.phymm.out %s/Classify/out/%s.hits"%(_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX))

         # for now only PhymmBL results are handled on the grid path
         # generate Krona output ImportPhymmBL.pl
         importPhymm = "%s%sperl%sImportPhymmBL.pl"%(_settings.METAMOS_UTILS, os.sep, os.sep)
         if not os.path.exists(importPhymm):
            print "Error: Krona importer for Phymm not found in %s. Please check your path and try again.\n"%(importPhymm)
            raise(JobSignalledBreak)
         run_process(_settings, "perl %s %s -f %s %s/Classify/out/%s.phymm.out:%s/Assemble/out/%s.contig.cnt:%s"%(importPhymm, "-l" if _settings.local_krona else "", listOfFiles,_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.taxa_level),"Classify") # TODO: local url (after next KronaTools release)

         # generate taxonomic-level annots
         readctg_dict = {}
         for lib in _readlibs:
            ctgfile = open("%s/Assemble/out/%s.lib%dcontig.reads"%(_settings.rundir, _settings.PREFIX, lib.id), 'r')
            for line in ctgfile.xreadlines():
               line = line.replace("\n","")
               read, ctg = line.split()
               if ctg in readctg_dict:
                  readctg_dict[ctg].append(read)
               else:
                  readctg_dict[ctg] = [read,]
            ctgfile.close()

         annotsfile = open("%s/Classify/out/%s.annots"%(_settings.rundir, _settings.PREFIX), 'r')
         annotreads = open("%s/Classify/out/%s.reads.annots"%(_settings.rundir, _settings.PREFIX), 'w')
         for line in annotsfile.xreadlines():
            line = line.replace("\n", "")
            ctg, annot = line.split()
            if ctg in readctg_dict:
               for x in readctg_dict[ctg]:
                  annotreads.write("%s\t%s\n"%(x, annot))
            else:
               annotreads.write("%s\t%s\n"%(ctg, annot))
         annotsfile.close()
         annotreads.close()
         readctg_dict.clear()

         return 
      # we should also split the fna and faa file but for now this is good enough
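      # local mode: split the contigs into roughly one chunk per thread (each chunk
      # at least _MIN_SEQ_LENGTH bases) and queue one classification task per chunk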
      size = sizeFastaFile("%s/Classify/in/%s.asm.contig"%(_settings.rundir, _settings.PREFIX))
      perThread = max(ceil(float(size) / _settings.threads), _MIN_SEQ_LENGTH)
      #print "The size of the contigs is %d per thread %d\n"%(size, perThread)
      #run_process(_settings, "python %s/python/splitfasta.py %s/Classify/in/%s.asm.contig %d"%(_settings.METAMOS_UTILS, _settings.rundir, _settings.PREFIX, perThread), "Classify")
      #splitfasta("%s/Classify/in/%s.asm.contig,%d,%s/Classify/in/%s,%d"%(_settings.rundir,_settings.PREFIX,perThread,_settings.rundir,_settings.PREFIX,1))
      splitfasta("%s/Classify/in/%s.asm.contig"%(_settings.rundir,_settings.PREFIX),"%d"%(perThread))
      for partFile in os.listdir("%s/Classify/in/"%(_settings.rundir)):
         if "_part" in partFile and "%s.asm.contig"%(_settings.PREFIX) in partFile:
            partStart = partFile.find("_part")+5
            partEnd = partFile.find(".fa", partStart, len(partFile))
            partNumber = int(partFile[partStart:partEnd])
            params = {}
            params["jobID"] = len(tasks) 
            params["cls"] = _cls
            params["contigs"] = "%s/Classify/in/%s"%(_settings.rundir, partFile)
            params["orfAA"] = ""
            params["orfFA"] = ""
            params["out"] = "%s.ctg_%d"%(_settings.PREFIX, partNumber)
            tasks.append(params) 
   else:
      annotateSeq(_cls, "%s/Classify/in/%s.asm.contig"%(_settings.rundir, _settings.PREFIX), "%s/Classify/in/%s.faa"%(_settings.rundir, _settings.PREFIX), "%s/Classify/in/%s.fna"%(_settings.rundir, _settings.PREFIX), "%s.ctg"%(_settings.PREFIX))

   # annotate the unmapped/unaligned sequences as well (not supported by every classifier)
   if _cls == "blast" or _cls == "phmmer" or _cls == "metaphyler" or not _settings.classify_unmapped:
      # these classifiers do not support unmapped sequences (or classification of
      # unmapped reads is disabled), so unmapped/unaligned reads are left unannotated
      pass
   else:
      for lib in _readlibs:
         listOfFiles += ":%s/Assemble/out/lib%d.unaligned.fasta"%(_settings.rundir, lib.id)
         run_process(_settings, "ln %s/Assemble/out/lib%d.unaligned.fasta %s/Classify/in/lib%d.unaligned.fasta"%(_settings.rundir, lib.id, _settings.rundir, lib.id), "Classify")

         if "fcp" in _cls or "phymm" in _cls:
            size = sizeFastaFile("%s/Classify/in/lib%d.unaligned.fasta"%(_settings.rundir, lib.id))
            perThread = max(ceil(float(size) / _settings.threads), _MIN_SEQ_LENGTH)
            #run_process(_settings, "python %s/python/splitfasta.py %s/Classify/in/lib%d.unaligned.fasta %d"%(_settings.METAMOS_UTILS, _settings.rundir, lib.id, perThread), "Classify")
            splitfasta("%s/Classify/in/lib%d.unaligned.fasta"%(_settings.rundir,lib.id),"%d"%(perThread))
            for partFile in os.listdir("%s/Classify/in/"%(_settings.rundir)):
               if "_part" in partFile and "lib%d.unaligned.fasta"%(lib.id) in partFile:
                  partStart = partFile.find("_part")+5
                  partEnd = partFile.find(".fa", partStart, len(partFile))
                  partNumber = int(partFile[partStart:partEnd])
                  params = {}
                  params["jobID"] = len(tasks)
                  params["cls"] = _cls
                  params["contigs"] = "%s/Classify/in/%s"%(_settings.rundir, partFile)
                  params["orfAA"] = ""
                  params["orfFA"] = ""
                  params["out"] = "%s.lib%d_%d"%(_settings.PREFIX, lib.id, partNumber)
                  tasks.append(params)
         else:
            annotateSeq(_cls, "%s/Assemble/out/lib%d.unaligned.fasta"%(_settings.rundir, lib.id), "", "", "%s.lib%d"%(_settings.PREFIX, lib.id))
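   # run all queued chunk tasks in the worker pool; successfully classified chunks
   # are deleted, any failure aborts the step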
   if "fcp" in _cls or "phymm" in _cls:
         result = pool.map_async(parallelWrapper, tasks).get(sys.maxint)
         for i in result:
            if (i["status"] == 1):
               run_process(_settings, "rm %s"%(tasks[i["jobID"]]["contigs"]), "Classify")
            else:
               print "Error: parallel annotation job %d failed\n"%(i["jobID"])
               raise(JobSignalledBreak)
   pool.close()
   pool.join()

   if generic.checkIfExists(STEP_NAMES.ANNOTATE, _cls.lower()):
      generic.execute(STEP_NAMES.ANNOTATE, _cls.lower(), _settings)
   else:
      #  merge results
      run_process(_settings, "cat %s/Classify/out/*.intermediate.hits > %s/Classify/out/%s.hits"%(_settings.rundir, _settings.rundir, _settings.PREFIX), "Classify")
 
   if _cls == "phylosift":
       importPS = "%s%sperl%sImportPhyloSift.pl"%(_settings.METAMOS_UTILS, os.sep, os.sep)
       if not os.path.exists(importPS):
            print "Error: Krona importer for PhyloSift not found in %s. Please check your path and try again.\n"%(importPS)
            raise(JobSignalledBreak)
       run_process(_settings, "perl %s %s -c -i -f %s %s/Classify/out/%s.hits:%s/Assemble/out/%s.contig.cnt:%s"%(importPS,"-l" if _settings.local_krona else "",listOfFiles,_settings.rundir,_settings.PREFIX,_settings.rundir,_settings.PREFIX, _settings.taxa_level), "Classify")

   elif _cls == "fcp":
       # generate Krona output
       importFCP = "%s%sperl%sImportFCP.pl"%(_settings.METAMOS_UTILS, os.sep, os.sep)
       if not os.path.exists(importFCP):
          print "Error: Krona importer for FCP not found in %s. Please check your path and try again.\n"%(importFCP)
          raise(JobSignalledBreak)
       run_process(_settings, "cat %s/Classify/out/*.intermediate.epsilon-nb_results.txt | grep -v 'Fragment Id' > %s/Classify/out/%s.epsilon-nb_results.txt"%(_settings.rundir, _settings.rundir, _settings.PREFIX), "Classify")

       run_process(_settings, "perl %s %s -c -i -f %s %s/Classify/out/%s.epsilon-nb_results.txt:%s/Assemble/out/%s.contig.cnt:%s"%(importFCP, "-l" if _settings.local_krona else "", listOfFiles, _settings.rundir,_settings.PREFIX,_settings.rundir, _settings.PREFIX, _settings.taxa_level),"Classify") # TODO: local url (after next KronaTools release)

   elif _cls == "phymm":
       # generate Krona output ImportPhymmBL.pl
       importPhymm = "%s%sperl%sImportPhymmBL.pl"%(_settings.METAMOS_UTILS, os.sep, os.sep)
       if not os.path.exists(importPhymm):
          print "Error: Krona importer for Phymm not found in %s. Please check your path and try again.\n"%(importPhymm)
          raise(JobSignalledBreak)
       run_process(_settings, "cat %s/Classify/out/*.intermediate.phymm.out > %s/Classify/out/%s.phymm.out"%(_settings.rundir, _settings.rundir, _settings.PREFIX), "Classify")
       run_process(_settings, "perl %s %s -f %s %s/Classify/out/%s.phymm.out:%s/Assemble/out/%s.contig.cnt:%s"%(importPhymm, "-l" if _settings.local_krona else "", listOfFiles,_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX, _settings.taxa_level),"Classify") # TODO: local url (after next KronaTools release)
   elif generic.checkIfExists(STEP_NAMES.ANNOTATE, _cls.lower()):
      genericImport = "%s%sperl%sImport%s.pl"%(_settings.METAMOS_UTILS, os.sep, os.sep, _cls.title())
      if os.path.exists(genericImport):
         run_process(_settings, "perl %s %s -c -i -f %s %s/Classify/out/%s.hits:%s/Assemble/out/%s.contig.cnt:%s"%(genericImport, "-l" if _settings.local_krona else "", listOfFiles, _settings.rundir,_settings.PREFIX,_settings.rundir, _settings.PREFIX, _settings.taxa_level),"Classify") # TODO: local url (after next KronaTools release)
      else:
         genericImport = "%s%sperl%sImportGeneric.pl"%(_settings.METAMOS_UTILS, os.sep, os.sep)
         if not os.path.exists(genericImport):
            print "Error: Krona importer for generic classifier not found in %s. Please check your path and try again.\n"%(genericImport)
            raise(JobSignalledBreak)
         run_process(_settings, "perl %s %s -c -i -f %s %s/Classify/out/%s.hits:%s/Assemble/out/%s.contig.cnt:%s"%(genericImport, "-l" if _settings.local_krona else "", listOfFiles, _settings.rundir,_settings.PREFIX,_settings.rundir, _settings.PREFIX, _settings.taxa_level),"Classify") # TODO: local url (after next KronaTools release)


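   # hard-link the merged hits into the Postprocess step's in/ and out/ directories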
   run_process(_settings, "unlink %s/Postprocess/in/%s.hits"%(_settings.rundir, _settings.PREFIX), "Classify")
   run_process(_settings, "unlink %s/Postprocess/out/%s.hits"%(_settings.rundir, _settings.PREFIX), "Classify")
   run_process(_settings, "ln %s/Classify/out/%s.hits %s/Postprocess/in/%s.hits"%(_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX), "Classify")
   run_process(_settings, "ln %s/Classify/out/%s.hits %s/Postprocess/out/%s.hits"%(_settings.rundir, _settings.PREFIX, _settings.rundir, _settings.PREFIX), "Classify")

   # generate taxonomic-level annots
   readctg_dict = {}
   for lib in _readlibs:
      ctgfile = open("%s/Assemble/out/%s.lib%dcontig.reads"%(_settings.rundir, _settings.PREFIX, lib.id), 'r')
      for line in ctgfile.xreadlines():
         line = line.replace("\n","")
         read, ctg = line.split()
         if ctg in readctg_dict:
            readctg_dict[ctg].append(read)
         else:
            readctg_dict[ctg] = [read,]
      ctgfile.close()

   annotsfile = open("%s/Classify/out/%s.annots"%(_settings.rundir, _settings.PREFIX), 'r')
   annotreads = open("%s/Classify/out/%s.reads.annots"%(_settings.rundir, _settings.PREFIX), 'w')
   for line in annotsfile.xreadlines():
     line = line.replace("\n", "")
     ctg, annot = line.split()
     if ctg in readctg_dict:
        for x in readctg_dict[ctg]:
           annotreads.write("%s\t%s\n"%(x, annot))
     else:
        annotreads.write("%s\t%s\n"%(ctg, annot))
   annotsfile.close()
   annotreads.close()
   readctg_dict.clear()
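
Note on the parallel path in Classify() above: the input is split into chunks, one task dict per chunk is queued, and a multiprocessing Pool runs the classifier over the chunks via parallelWrapper. Below is a minimal, self-contained sketch of that chunk-and-pool pattern with a dummy worker standing in for parallelWrapper; none of the names here are MetAMOS APIs.

# Illustrative sketch only -- the chunk-and-pool pattern used by Classify(),
# with a stand-in worker instead of parallelWrapper.
import sys
from multiprocessing import Pool

def worker(params):
   # a real worker would classify params["contigs"] and write params["out"]
   return {"jobID": params["jobID"], "status": 1}

if __name__ == "__main__":
   tasks = [{"jobID": i, "contigs": "chunk_%d.fa"%(i), "out": "chunk_%d.hits"%(i)} for i in range(4)]
   pool = Pool(processes=2)
   # get() with a large timeout (as in Classify) keeps the parent interruptible with Ctrl-C
   results = pool.map_async(worker, tasks).get(sys.maxint)
   pool.close()
   pool.join()
   for r in results:
      if r["status"] != 1:
         print "job %d failed"%(r["jobID"])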