Example #1
def process_input(input, output, input_type, gene_list, append, errorlogger=None):
    commentPATT = re.compile(r'^#')
    count = 0

    mode = 'w'
    if append:
       mode = 'a'

    gene_list = read_gene_list(gene_list)
    gene_dict = {}

    for gene in gene_list:
       gene_dict[gene.lower()] = gene # re.compile(r'[\/\s]' + gene + '[\/\s]') 

    if input_type=='LAST2':
      q = 0
      t = 9

    if input_type=='LAST1':
      q = 0
      t = 1

    if input_type=='HMM':
      q = 2
      t = 0

    try:
        inputfile = open(input, 'r') 
        outputfile = open(output, mode) 
    except IOError:
        if errorlogger:
           errorlogger.write("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" %(input, output))
        exit_process("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" %(input, output))


    for line in inputfile:
        result = commentPATT.search(line)
        if result:
           continue

        fields = [ x.strip() for x in line.split('\t') ]
        if len(fields) < 3:
           continue

        orfid = fields[q]

        #if input_type=='LAST1' or input_type=='LAST2':
        target = find_gene_name(fields[t], gene_list, gene_dict)

        if target is None:
           continue

        count += 1
        fprintf(outputfile, "%s\t%s\n", orfid, gene_dict[target])


    outputfile.close()
    inputfile.close()
#    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)

    return count
def write_selected_sequences(selected_sequences, output_file_name):
    output_file = open(output_file_name, 'w')
    for read in selected_sequences:
        fprintf(output_file, ">%s\n", read)
        fprintf(output_file, "%s\n", selected_sequences[read])

    output_file.close()
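
Note: the examples in this collection call printf-style helpers (fprintf, eprintf, printf) that come from the pipeline's own utility module, not from the standard library. A minimal sketch of what these helpers are assumed to do, inferred only from how they are called here (the real implementations may differ), is:

import sys

def fprintf(fhandle, fmt, *args):
    # write a printf-style formatted string to an already-open file handle
    fhandle.write(fmt % args if args else fmt)

def eprintf(fmt, *args):
    # same, but to standard error
    sys.stderr.write(fmt % args if args else fmt)

def printf(fmt, *args):
    # same, but to standard output
    sys.stdout.write(fmt % args if args else fmt)
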
Example #3
      def consolidateSplitResults(self, P, split_results):
          sourceParentDir =  self.base_output_folder + PATHDELIM + P[0] + PATHDELIM + 'blast_results' + PATHDELIM + 'grid' + PATHDELIM + 'split_results'
          targetParentDir =  self.base_output_folder + PATHDELIM + P[0] + PATHDELIM + 'blast_results' 
          targetFileName =  targetParentDir + PATHDELIM + P[0] + '.' + P[1] + '.' + self.algorithm +"out"

          try:
             targetfile = open( targetFileName, 'w')
          except:
             self.messagelogger.write("ERROR: Cannot create consolidated search results file %s!\n" %(targetFileName ))
             sys.exit(0)
          for filename in  split_results:
             sourceFileName = sourceParentDir + PATHDELIM + filename
             try:
                sourcefile = open(sourceFileName, 'r')
                resultLines = sourcefile.readlines()
                sourcefile.close()
             except:
                self.messagelogger.write("ERROR: Cannot create consolidated search results file %s!\n" %(sourceFileName ))
                sys.exit(0)

             try:
                for line in resultLines:
                    fprintf(targetfile, "%s", line)
             except:
                self.messagelogger.write("ERROR: Cannot write result from file %s to the consolidated file!\n" %(sourceFileName ))
                sys.exit(0)

          self.messagelogger.write("SUCCESS: Successfully consolidated search results into file %s!\n" %(targetFileName ))
          targetfile.close()

          """ Now delete the consolidates split_files files """ 
          for filename in  split_results:
             sourceFileName = sourceParentDir + PATHDELIM + filename
             os.remove(sourceFileName)
def write_selected_sequences(selected_sequences, output_file_name):
    output_file = open(output_file_name, 'w')
    for read in selected_sequences:
        fprintf(output_file, ">%s\n", read)
        fprintf(output_file,"%s\n", selected_sequences[read])

    output_file.close()
def main(argv): 
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
       print usage
       sys.exit(0)

    input_folder = opts.input_folder
    output_file = opts.output_file

    filePATTERN = re.compile(r'.*COG[0-9]*.*\.fa');
    cogSeqMatchesPATTERN = re.compile(r'[a-zA-Z]*_(.*)__[0-9]*__*(COG[0-9]*).*.fa');
    list= []
    for file in  listdir(input_folder):
      if filePATTERN.match(file):
         hits =  cogSeqMatchesPATTERN.search( file) 
         if hits:
             list.append( (hits.group(1), hits.group(2)) )
         

    try:
        outputfile  = open(output_file, 'w')
    except:
        print "Cannot open file to MLTreeMap hits"
        sys.exit(0)

    fprintf(outputfile, "Sequences\tCOG\n")
    for seq, cog in list:
        fprintf(outputfile, "%s\t%s\n",seq, cog)

    outputfile.close()
def main(argv):
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
        print usage
        sys.exit(0)

    input_folder = opts.input_folder
    output_file = opts.output_file

    filePATTERN = re.compile(r'.*COG[0-9]*.*\.fa')
    cogSeqMatchesPATTERN = re.compile(
        r'[a-zA-Z]*_(.*)__[0-9]*__*(COG[0-9]*).*.fa')
    list = []
    for file in listdir(input_folder):
        if filePATTERN.match(file):
            hits = cogSeqMatchesPATTERN.search(file)
            if hits:
                list.append((hits.group(1), hits.group(2)))

    try:
        outputfile = open(output_file, 'w')
    except:
        print "Cannot open file to MLTreeMap hits"
        sys.exit(0)

    fprintf(outputfile, "Sequences\tCOG\n")
    for seq, cog in list:
        fprintf(outputfile, "%s\t%s\n", seq, cog)

    outputfile.close()
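
The two main() variants above differ only in formatting. As a hedged illustration of what cogSeqMatchesPATTERN is intended to capture, a hypothetical file name of the expected shape yields the sequence id and the COG accession:

import re

cogSeqMatchesPATTERN = re.compile(r'[a-zA-Z]*_(.*)__[0-9]*__*(COG[0-9]*).*.fa')
hit = cogSeqMatchesPATTERN.search('sample_read42__3__COG0001.fa')  # made-up file name
if hit:
    print(hit.group(1) + '\t' + hit.group(2))  # read42<TAB>COG0001
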
Example #7
def write_refscores(refscore_file, refscores, compact_output=False):
    for key, value in refscores.items():
        orfid = key
        if compact_output:
            orfid = ShortenORFId(key)

        fprintf(refscore_file, "%s\t%s\n", orfid, value)
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos,  orfid):
   try:
      fields = [  'source', 'feature', 'start', 'end', 'score', 'strand', 'frame' ]


      output_line= orf_dictionary[contig][candidate_orf_pos]['seqname']

      for field in fields:
        # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
         output_line += "\t"+ str(orf_dictionary[contig][candidate_orf_pos][field])

      attributes = "ID="+orf_dictionary[contig][candidate_orf_pos]['id']
      attributes += ";" + "locus_tag="+orf_dictionary[contig][candidate_orf_pos]['locus_tag']
      attributes += ";" + "contig_length="+orf_dictionary[contig][candidate_orf_pos]['contig_length']
      attributes += ";" + "orf_length="+orf_dictionary[contig][candidate_orf_pos]['orf_length']
      attributes += ";" + "partial="+orf_dictionary[contig][candidate_orf_pos]['partial']
      attributes += ";" + "sourcedb="+candidatedbname
     
      if candidatedbname in results_dictionary:
         attributes += ";" + "annotvalue="+str(results_dictionary[candidatedbname][orfid]['value'])
         attributes += ";" + "ec="+str(results_dictionary[candidatedbname][orfid]['ec'])
         attributes += ";" + "product="+results_dictionary[candidatedbname][orfid]['product']
      else:
         attributes += ";" + "annotvalue="+str('0')
         attributes += ";" + "ec="+str('')
         attributes += ";" + "product="+'hypothetical protein'

      output_line += '\t' + attributes
      fprintf(outputgff_file, "%s\n", output_line)
   except:
      eprintf("ERROR : Failure to annotate in contig %s\n", contig)
      #print orf_dictionary[contig]
      traceback.print_exc(10)
      exit_process()
def write_refscores(refscore_file, refscores, compact_output=False):
    for key, value in refscores.iteritems():
       orfid = key
       if compact_output:
          orfid  = ShortenORFId(key)

       fprintf(refscore_file, "%s\t%s\n",orfid, value)
def process_parsed_blastoutput(dbname, blastoutput, opts, orf_read_counts):
    blastparser = BlastOutputTsvParser(dbname, blastoutput, shortenorfid=False)

    hit_counts = {}
    for data in blastparser:
        #if count%10000==0:
        if isWithinCutoffs(data, opts):

            target = getFunctionName(dbname, data)

            if not target in hit_counts:
                hit_counts[target] = 0

            if data['query'] in orf_read_counts:
                hit_counts[target] += orf_read_counts[data['query']]
            else:
                #print 'query', data['query']
                hit_counts[target] += 1
            #print data
    #for name in hit_counts:
    #   print name, hit_counts[name]

    filename = opts.outputdir + PATHDELIM + opts.sample_name + "." + dbname
    filename_txt = filename + ".read_counts.txt"
    filename_biom = filename + ".read_counts.biom"

    with open(filename_txt, 'w') as fout:
        fprintf(fout, "# Gene\tCounts\n")
        for name in hit_counts:
            fprintf(fout, "%s\t%d\n", name, hit_counts[name])

    runBIOMCommand(filename_txt, filename_biom, biomExec="biom")
    return len(hit_counts)
def  make_sure_map_file_exists(config_settings, dbname, globallogger = None):
    dbmapFile = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + 'formatted' + PATHDELIM + dbname + "-names.txt"
    seqFilePath = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + dbname
    if not doFilesExist( [dbmapFile ] ):
         eprintf("WARNING: Trying to create database map file for %s\n", dbname)
         if globallogger!= None:
            globallogger.write("WARNING: Trying to create database map file for %s\n" %( dbname) )

         if not doFilesExist( [seqFilePath] ):
            eprintf("ERROR : You do not even have the raw sequence for Database  %s to format!\n", dbname)
            eprintf("      : Make sure you have the file %s\n", seqFilePath)

            if globallogger!= None:
               globallogger.write("ERROR \t You do not even have the raw sequence for Database  %s to format!\n" %( dbname))
               globallogger.write("Make sure you have the file %s\n" %( seqFilePath))

            exit_process()

         mapfile = open(dbmapFile,'w')
         seqFile = open(seqFilePath,'r')
         for line in seqFile:
             if re.match(r'>', line):
                 fprintf(mapfile, "%s\n",line.strip())
         seqFile.close()
         mapfile.close()

    return dbmapFile
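
make_sure_map_file_exists() depends on file-existence helpers that are not shown in these examples. A minimal sketch of the behaviour they are assumed to have (the real utility functions may differ) is:

import os.path

def doesFileExist(filename):
    # True if the single path exists on disk
    return os.path.exists(filename)

def doFilesExist(filenames):
    # True only if every path in the list exists
    return all(os.path.exists(f) for f in filenames)
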
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid, compact_output):
    global errorcode
    try:
        fields = [
            'source', 'feature', 'start', 'end', 'score', 'strand', 'frame'
        ]

        output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']

        #if compact_output:
        #output_line = ShortenContigId(output_line)

        for field in fields:
            output_line += "\t" + str(
                orf_dictionary[contig][candidate_orf_pos][field])

        #if compact_output:
        try:
            attributes = "ID=" + ShortenORFId(
                orf_dictionary[contig][candidate_orf_pos]['id'])
            attributes += ";" + "locus_tag=" + ShortenORFId(
                orf_dictionary[contig][candidate_orf_pos]['locus_tag'])
        except:
            attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
            attributes += ";" + "locus_tag=" + orf_dictionary[contig][
                candidate_orf_pos]['locus_tag']

        attributes += ";" + "contig_length=" + orf_dictionary[contig][
            candidate_orf_pos]['contig_length']
        attributes += ";" + "orf_length=" + orf_dictionary[contig][
            candidate_orf_pos]['orf_length']
        attributes += ";" + "partial=" + orf_dictionary[contig][
            candidate_orf_pos]['partial']
        attributes += ";" + "sourcedb=" + candidatedbname

        if candidatedbname in results_dictionary:
            attributes += ";" + "annotvalue=" + str(
                results_dictionary[candidatedbname][orfid]['value'])
            attributes += ";" + "ec=" + str(
                results_dictionary[candidatedbname][orfid]['ec'])
            attributes += ";" + "product=" + results_dictionary[
                candidatedbname][orfid]['product']
        else:
            attributes += ";" + "annotvalue=" + str('0')
            attributes += ";" + "ec=" + str('')
            attributes += ";" + "product=" + 'hypothetical protein'

        output_line += '\t' + attributes

        if candidatedbname in results_dictionary:
            fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        #print orf_dictionary[contig]
        traceback.print_exc(10)
        insert_error(errorcode)
        exit_process()
def process_rRNA_16S_stats(dbname,
                           rRNA_16S_file,
                           orf_read_rpkgs,
                           opts,
                           shortenorfid=False):
    print "Processing rRNA database : ", dbname
    counter_rRNA = {}
    if not doesFileExist(rRNA_16S_file):
        return
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()

    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False

    seencounter = {}
    for line in tax_lines:
        if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(
                    line) and bitscore_pattern.search(
                        line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 7:   # fields[6] is used below, so at least 7 columns are needed
            if not fields[0] in seencounter:
                seencounter[fields[0]] = 0
            else:
                seencounter[fields[0]] += 1

            _name = fields[0] + "_" + str(seencounter[fields[0]]) + "_rRNA"

            if not fields[6] in counter_rRNA:
                counter_rRNA[fields[6]] = 0.0

            name = ShortenrRNAId(_name)
            if name in orf_read_rpkgs:
                counter_rRNA[fields[6]] += orf_read_rpkgs[name]
            else:
                counter_rRNA[fields[6]] += 0

    taxonomy_file.close()
    with open(
            opts.outputdir + PATHDELIM + opts.sample_name + "." + dbname +
            ".read_rpkgs.txt", 'w') as fout:
        fprintf(fout, "# Gene\tCounts\n")
        for name in counter_rRNA:
            fprintf(fout, "%s\t%0.2f\n", name, counter_rRNA[name])

    return len(counter_rRNA)
Example #14
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser

    options, args = parser.parse_args(argv)

    # is there a pathwaytools executable installed
    if False and not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        if errorlogger:
            errorlogger.printf(
                "ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" %
                     (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s " % (options.ptoolsExec)
    command += " -api"

    pythonCyc = startPathwayTools(options.sample_name.lower(),
                                  options.ptoolsExec, True)
    #resultLines = pythonCyc.getReactionListLines()
    resultLines = pythonCyc.getFlatFiles()
    StopPathwayTools()
    try:
        if False:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " +
                   options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            #pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            StopPathwayTools()

    except:
        traceback.print_exc(10)
        eprintf("ERROR\tFailed to run extract pathways for %s : \n" %
                (options.sample_name))
        eprintf(
            "INFO\tKill any other PathwayTools instance running on the machine and try again"
        )
        if errorlogger:
            errorlogger.write(
                "ERROR\tFailed to run extract pathways for %s : " %
                (options.sample_name))
            errorlogger.write(
                "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
            )
        StopPathwayTools()
def copy_faa_gff_orf_prediction( source_files, target_files) :
      for source, target in zip(source_files, target_files):
         #print source + ' ' + target
         sourcefile = open(source, 'r')
         targetfile = open(target, 'w')
         sourcelines = sourcefile.readlines()
         for line in sourcelines:
            fprintf(targetfile, "%s\n", line.strip())
 
         sourcefile.close()
         targetfile.close()
Example #16
def  make_sure_map_file_exists(dbmapfile):
    if not doFilesExist( [dbmapfile ] ):
         print 'WARNING: ' + 'Creating the database map file'
         fullRefDbName = re.sub(r'-names.txt','',dbmapfile)
         mapfile = open(dbmapfile,'w')
         fullRefDbFile = open(fullRefDbName,'r')
         for line in fullRefDbFile:
             if re.match(r'>', line):
                 fprintf(mapfile, "%s\n",line.strip())
         mapfile.close()
         fullRefDbFile.close()
Example #17
def copy_faa_gff_orf_prediction( source_files, target_files) :
      for source, target in zip(source_files, target_files):
         #print source + ' ' + target
         sourcefile = open(source, 'r')
         targetfile = open(target, 'w')
         sourcelines = sourcefile.readlines()
         for line in sourcelines:
            fprintf(targetfile, "%s\n", line.strip())
 
         sourcefile.close()
         targetfile.close()
def write_new_file(lines, output_file):

    print "Fixing file " + output_file
    try:
        outputfile = open(output_file, 'w')
    except IOError:
        print "ERROR: Cannot open output file " + output_file
        sys.exit(1)

    for line in lines:
        fprintf(outputfile, "%s\n", line)

    outputfile.close()
Example #19
def createMapFile(seqFilePath, dbMapFile):
      """ Creates the dbMapFile from sequence file seqFilePath """
      try:
           mapfile = open(dbMapFile,'w')
           seqFile = open(seqFilePath,'r')
           for line in seqFile:
                 if re.match(r'>', line):
                    fprintf(mapfile, "%s\n",line.strip())
           seqFile.close()
           mapfile.close()
      except:
           return False
      return True
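
A hypothetical call, with placeholder paths, builds a "-names.txt" map file containing only the FASTA header lines of a reference database:

ok = createMapFile('refdb/functional/COG.fasta',                 # placeholder input FASTA
                   'refdb/functional/formatted/COG-names.txt')   # placeholder output map file
if not ok:
    print('could not create the database map file')
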
Example #20
def createMapFile(seqFilePath, dbMapFile):
    """ Creates the dbMapFile from sequence file seqFilePath """
    try:
        mapfile = open(dbMapFile, 'w')
        seqFile = open(seqFilePath, 'r')
        for line in seqFile:
            if re.match(r'>', line):
                fprintf(mapfile, "%s\n", line.strip())
        seqFile.close()
        mapfile.close()
    except:
        return False
    return True
def write_new_file(lines, output_file):
    
    print "Fixing file " + output_file 
    try:
       outputfile = open(output_file,'w')
    except IOError:
         print "ERROR: Cannot open output file " + output_file
         sys.exit(1)
   
    for line in lines:
       fprintf(outputfile, "%s\n", line)

    outputfile.close()
Example #22
    def __addToStatusList(self, server, J, list_file_name, list_to_add_to):
        parentDir = self.base_output_folder + PATHDELIM + J.S + PATHDELIM + 'blast_results' + PATHDELIM + 'grid'
        list_jobs_stats_file = parentDir + PATHDELIM + list_file_name
        try:
            if not doesFileExist(list_jobs_stats_file):
                self.messagelogger.write(
                    "WARNING: Cannot find file \"%s\" for sample \"%s\"!\n" %
                    (list_file_name, J.S))
                self.messagelogger.write(
                    "SUCCESS: Created file \"%s\" for sample \"%s\"!\n" %
                    (list_file_name, J.S))
                listfile = open(list_jobs_stats_file, 'w')
                listfile.close()
        except:
            self.messagelogger.write(
                "ERROR: Cannot open job list %s file for sample \"%s\"!\n" %
                (list_file_name, J.S))
            print "ERROR: Cannot open job list %s file for sample \"%s\"!\n" % (
                list_file_name, J.S)
            sys.exit(1)

        try:
            listfile = open(list_jobs_stats_file, 'a')
            eventTime = int(time.time())
            fprintf(
                listfile, "%s\t%s\t%s\t%s\t%s\t%s\n" %
                (J.S, J.d, J.a, J.m, server, str(eventTime)))
            listfile.close()
        except:
            self.messagelogger.write(
                "ERROR: Cannot open job list %s file for sample \"%s\"!\n" %
                (list_file_name, J.S))
            print "ERROR: Cannot open job list %s file for sample \"%s\"!\n" % (
                list_file_name, J.S)
            sys.exit(1)

        if not J.S in list_to_add_to:
            list_to_add_to[J.S] = {}

        if not J.d in list_to_add_to[J.S]:
            list_to_add_to[J.S][J.d] = {}

        if not J.a in list_to_add_to[J.S][J.d]:
            list_to_add_to[J.S][J.d][J.a] = {}

        if not J.m in list_to_add_to[J.S][J.d][J.a]:
            list_to_add_to[J.S][J.d][J.a][J.m] = {}

        list_to_add_to[J.S][J.d][J.a][J.m][server] = eventTime
        return True
Example #23
def  write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
      fields = [  'source', 'feature', 'start', 'end', 'score', 'strand', 'frame' ]
      for rRNA in rRNA_dictionary:
          output_line= rRNA_dictionary[rRNA]['id']
          for field in fields:
             output_line += "\t"+ str(rRNA_dictionary[rRNA][field])

          attributes = "ID="+ShortenORFId(rRNA_dictionary[rRNA]['seqname']) + tag
          attributes += ";" + "locus_tag="+ShortenORFId(rRNA_dictionary[rRNA]['seqname']) + tag
          attributes += ";" + "orf_length=" + str(rRNA_dictionary[rRNA]['orf_length'])
          attributes += ";" + "contig_length=" + str(rRNA_dictionary[rRNA]['contig_length'])
          attributes += ";" + "ec="
          attributes += ";" + "product="+rRNA_dictionary[rRNA]['product']
          output_line += '\t' + attributes
          fprintf(outputgff_file, "%s\n", output_line);
def  write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
      fields = [  'source', 'feature', 'start', 'end', 'score', 'strand', 'frame' ]
      for rRNA in rRNA_dictionary:
          output_line= rRNA_dictionary[rRNA]['seqname']
          for field in fields:
             output_line += "\t"+ str(rRNA_dictionary[rRNA][field])

          attributes = "ID="+rRNA_dictionary[rRNA]['seqname'] + tag
          attributes += ";" + "locus_tag="+rRNA_dictionary[rRNA]['seqname'] + tag
          attributes += ";" + "orf_length=" + str(rRNA_dictionary[rRNA]['orf_length'])
          attributes += ";" + "contig_length=" + str(rRNA_dictionary[rRNA]['contig_length'])
          attributes += ";" + "ec="
          attributes += ";" + "product="+rRNA_dictionary[rRNA]['product']
          output_line += '\t' + attributes
          fprintf(outputgff_file, "%s\n", output_line);
Example #25
    def create_blast_splits(self,
                            target_folder,
                            blocks_list_filename,
                            maxSize=500,
                            maxBytes=40000000):
        blockno = 0
        currblocksize = 0
        currblockbyteSize = 0

        fastareader = FastaReader(self.fastaFile)
        # Read sequences from sorted sequence file and write them to block files
        try:
            blocklistfile = open(blocks_list_filename, 'w')
        except:
            print "ERROR:  Cannot open " + blocks_list_filename
            sys.exit(0)

        sample_name = 'split'
        fragments = []
        for name in fastareader:
            fragments.append(fastareader.seqname)
            fragments.append(fastareader.sequence)

            if currblocksize >= maxSize - 1 or currblockbyteSize >= maxBytes:
                #TODO adjust the 000 to match the format
                blockfile = open(
                    target_folder + PATHDELIM + sample_name + '.000' +
                    str(blockno) + '.fasta', 'w')
                fprintf(blockfile, "%s", '\n'.join(fragments))
                fragments = []
                blockfile.close()
                # Add this block name to the blocklistfile
                #TODO adjust the 000 to match the format
                fprintf(blocklistfile, "%s\n",
                        sample_name + ".000" + str(blockno))
                blockno += 1
                currblocksize = 0
                currblockbyteSize = 0
            else:
                currblocksize += 1
                currblockbyteSize += len(fastareader.sequence)

        if fragments:
            #TODO adjust the 000 to match the format
            blockfile = open(
                target_folder + PATHDELIM + sample_name + '.000' +
                str(blockno) + '.fasta', 'w')
            fprintf(blockfile, "%s", '\n'.join(fragments))
            blockfile.close()
            fragments = []
            #TODO adjust the 000 to match the format
            fprintf(blocklistfile, "%s\n", sample_name + ".000" + str(blockno))
            blockno += 1

        #Add this block name to the blocklistfile
        blocklistfile.close()
        currblocksize = 0
        currblockbyteSize = 0
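
create_blast_splits() iterates a FastaReader and reads the current record back off the reader's seqname/sequence attributes, while create_gff_faa further below uses the .name/.sequence attributes of the yielded record. A rough sketch of a reader compatible with both usages, assuming plain FASTA input (the pipeline's actual FastaReader may differ), is:

class FastaRecord(object):
    def __init__(self, name, sequence):
        self.name = name          # header line, including the leading '>'
        self.sequence = sequence

class FastaReader(object):
    """Yield FASTA records and mirror the current record on the reader itself."""
    def __init__(self, fasta_filename):
        self.handle = open(fasta_filename, 'r')
        self.seqname = None
        self.sequence = None

    def __iter__(self):
        name, seqlines = None, []
        for line in self.handle:
            line = line.strip()
            if line.startswith('>'):
                if name is not None:
                    self.seqname, self.sequence = name, ''.join(seqlines)
                    yield FastaRecord(self.seqname, self.sequence)
                name, seqlines = line, []
            elif line:
                seqlines.append(line)
        if name is not None:
            self.seqname, self.sequence = name, ''.join(seqlines)
            yield FastaRecord(self.seqname, self.sequence)
        self.handle.close()
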
Example #26
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open( blast_table_out,'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
       line=line.rstrip()
       fields = line.split('\t')
       if len(fields) != 12:
          eprintf("ERROR: Error in line \n%s\n of the blastout file %s" %(line, blast_table_out))
          exit_process("ERROR: Error in line \n%s\n of the blastout file %s" %(line, blast_table_out))
       # record the self-hit bitscore, mirroring add_blast_refscore_to_file below
       if fields[0].rstrip() == fields[1].rstrip():
          refscores[fields[0]] = fields[11]

    for key, value in refscores.iteritems():
       allNames[key] = True
       fprintf(refscore_file, "%s\t%s\n",key, value)

    infile.close()
Example #27
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, contig, candidate_orf_pos,
                             orfid, compact_output):
    try:
        fields = [
            'source', 'feature', 'start', 'end', 'score', 'strand', 'frame'
        ]
        _values = ["M", "CDS", "0", "30", "100.0", "+", "0"]
        values = {}
        for key, value in zip(fields, _values):
            values[key] = value

        #if compact_output:
        output_line = contig

        for field in fields:
            # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
            output_line += "\t" + values[field]

        #if compact_output:
        attributes = "ID=" + orfid
        attributes += ";" + "locus_tag=" + orfid

        attributes += ";" + "contig_length=" + str(100)
        attributes += ";" + "orf_length=" + str(30)
        attributes += ";" + "partial=" + "00"
        attributes += ";" + "sourcedb=" + candidatedbname

        attributes += ";" + "annotvalue=" + str(
            results_dictionary[candidatedbname][orfid]['value'])
        attributes += ";" + "ec=" + str(
            results_dictionary[candidatedbname][orfid]['ec'])
        attributes += ";" + "product=" + results_dictionary[candidatedbname][
            orfid]['product']

        output_line += '\t' + attributes

        if candidatedbname in results_dictionary:
            fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        #print orf_dictionary[contig]
        traceback.print_exc(10)
        exit_process()
Example #28
    def consolidateSplitResults(self, P, split_results):
        sourceParentDir = self.base_output_folder + PATHDELIM + P[
            0] + PATHDELIM + 'blast_results' + PATHDELIM + 'grid' + PATHDELIM + 'split_results'
        targetParentDir = self.base_output_folder + PATHDELIM + P[
            0] + PATHDELIM + 'blast_results'
        targetFileName = targetParentDir + PATHDELIM + P[0] + '.' + P[
            1] + '.' + self.algorithm + "out"

        try:
            targetfile = open(targetFileName, 'w')
        except:
            self.messagelogger.write(
                "ERROR: Cannot create consolidated search results file %s!\n" %
                (targetFileName))
            sys.exit(0)
        for filename in split_results:
            sourceFileName = sourceParentDir + PATHDELIM + filename
            try:
                sourcefile = open(sourceFileName, 'r')
                resultLines = sourcefile.readlines()
                sourcefile.close()
            except:
                self.messagelogger.write(
                    "ERROR: Cannot create consolidated search results file %s!\n"
                    % (sourceFileName))
                sys.exit(0)

            try:
                for line in resultLines:
                    fprintf(targetfile, "%s", line)
            except:
                self.messagelogger.write(
                    "ERROR: Cannot write result from file %s to the consolidated file!\n"
                    % (sourceFileName))
                sys.exit(0)

        self.messagelogger.write(
            "SUCCESS: Successfully consolidated search results into file %s!\n"
            % (targetFileName))
        targetfile.close()
        """ Now delete the consolidates split_files files """
        for filename in split_results:
            sourceFileName = sourceParentDir + PATHDELIM + filename
            os.remove(sourceFileName)
Example #29
def add_blast_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open(blast_table_out, 'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print('Error in the blastout file')
            sys.exit(1)
        if fields[0].rstrip() == fields[1].rstrip():
            #    fprintf(refscore_file, "%s\t%s\n",fields[0], fields[11])
            refscores[fields[0]] = fields[11]

    for key, value in refscores.items():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)

    infile.close()
def add_blast_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open( blast_table_out,'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
       line=line.rstrip()
       fields = line.split('\t')
       if len(fields) != 12:
          print 'Error in the blastout file'
          sys.exit(1)
       if fields[0].rstrip()==fields[1].rstrip():
      #    fprintf(refscore_file, "%s\t%s\n",fields[0], fields[11])
          refscores[fields[0]]=fields[11]

    for key, value in refscores.iteritems():
       allNames[key] = True
       fprintf(refscore_file, "%s\t%s\n",key, value)

    infile.close()
def main(argv, errorlogger = None, runcommand = None, runstatslogger = None):
    global parser

    options, args = parser.parse_args(argv)

    # is there a pathwaytools executable installed
    if False and not path.exists(options.ptoolsExec):
       eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
       if errorlogger:
          errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n",  options.ptoolsExec)
       exit_process("ERROR\tPathwayTools executable %s not found!\n" %(options.ptoolsExec))


    # command to build the ePGDB
    command = "%s "  %(options.ptoolsExec)
    command += " -api"

    pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
    #resultLines = pythonCyc.getReactionListLines()
    resultLines = pythonCyc.getFlatFiles()
    StopPathwayTools()
    try:
      if False:
         pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
         pythonCyc.setDebug() # disable pathway debug statements
         printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
         resultLines = pythonCyc.getReactionListLines()
         #pythonCyc.stopPathwayTools()
         reaction_list_file = open(options.reactions_list + ".tmp", 'w')
         for line in resultLines:
          fprintf(reaction_list_file,"%s\n",line.strip())
         reaction_list_file.close()
         StopPathwayTools()

    except:
           traceback.print_exc(10)
           eprintf("ERROR\tFailed to run extract pathways for %s : \n" %(options.sample_name))
           eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again")
           if errorlogger:
               errorlogger.write("ERROR\tFailed to run extract pathways for %s : " %(options.sample_name))
               errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
           StopPathwayTools()
Example #32
      def create_blast_splits(self, target_folder, blocks_list_filename, maxSize=500, maxBytes = 40000000):
          blockno = 0 
          currblocksize = 0 
          currblockbyteSize = 0 

          fastareader = FastaReader(self.fastaFile)
          # Read sequences from sorted sequence file and write them to block files
          try:
             blocklistfile = open(blocks_list_filename, 'w')
          except:
             print "ERROR:  Cannot open " + blocks_list_filename
             sys.exit(0)

          sample_name = 'split'  
          fragments = []
          for name in fastareader:
                fragments.append(fastareader.seqname) 
                fragments.append(fastareader.sequence)
     
                if currblocksize >= maxSize -1 or currblockbyteSize >= maxBytes:
                    #TODO adjust the 000 to match the format
                    blockfile = open(target_folder +  PATHDELIM + sample_name + '.000' + str(blockno) + '.fasta', 'w')
                    fprintf(blockfile, "%s",'\n'.join(fragments))
                    fragments=[]
                    blockfile.close()
                     # Add this block name to the blocklistfile
                    #TODO adjust the 000 to match the format
                    fprintf(blocklistfile, "%s\n", sample_name + ".000" + str(blockno))
                    blockno += 1
                    currblocksize = 0 
                    currblockbyteSize = 0 
                else: 
                    currblocksize += 1
                    currblockbyteSize += len(fastareader.sequence)
     
     
          
          if fragments:
             #TODO adjust the 000 to match the format
             blockfile = open(target_folder +  PATHDELIM + sample_name + '.000' + str(blockno) + '.fasta', 'w')
             fprintf(blockfile, "%s",'\n'.join(fragments))
             blockfile.close()
             fragments = []
             #TODO adjust the 000 to match the format
             fprintf(blocklistfile, "%s\n", sample_name + ".000" + str(blockno))
             blockno += 1
     
          #Add this block name to the blocklistfile
          blocklistfile.close()
          currblocksize = 0 
          currblockbyteSize = 0 
Example #33
def process_gbk_file(input_gbk, output_gbk, headers, gff_dictionary):

    tag = re.sub(r'[.]gbk', '', input_gbk)
    tag = re.sub(r'.*/', '', tag)

    output_gbk_file = open(output_gbk, 'w')
    serializer = genbank.GenBankRecordSerializer()
    with open(input_gbk, 'r') as genbank_file:
        out_list = []
        count = 0
        for record in genbank.GenBankRecordParser(genbank_file.read()):
            count += 1

            record.locus = tag + str(count)
            if count % 1000 == 0:
                print('Count = ' + str(count))

            if headers and 'REFERENCES' in headers:
                record.references_ = headers['REFERENCES']

            i = 0
            for feature in record.features:
                if feature.type == "CDS":
                    if feature.locus_tag in gff_dictionary:
                        record.features[i].product = 'aaaaa ' + gff_dictionary[
                            feature.locus_tag]['product']
                i += 1

            #record.locus = "hello"

            out_list.append(serializer.serialize(record))
            if count % 1000 == 0:
                output_str = '\n'.join(out_list)
                out_list = []
                fprintf(output_gbk_file, '%s\n', output_str)

        output_str = '\n'.join(out_list)
        fprintf(output_gbk_file, '%s\n', output_str)

        output_gbk_file.close()
Example #34
      def __addToStatusList(self, server, J, list_file_name, list_to_add_to):
         parentDir =  self.base_output_folder + PATHDELIM + J.S + PATHDELIM + 'blast_results' + PATHDELIM + 'grid'
         list_jobs_stats_file=parentDir + PATHDELIM + list_file_name
         try:
            if not doesFileExist(list_jobs_stats_file):
                self.messagelogger.write("WARNING: Cannot file  \"%s\" for sample \"%s\"!\n" %(list_file_name, J.S))
                self.messagelogger.write("SUCCESS: Create file  \"%s\" for sample \"%s\"!\n" %(list_file_name, J.S))
                listfile  = open(list_jobs_stats_file, 'w')
                listfile.close()
         except:
            self.messagelogger.write("ERROR: Cannot open job list %s file for sample \"%s\"!\n" %(list_file_name, J.S))
            print "ERROR: Cannot open job list %s file for sample \"%s\"!\n" %(list_file_name, J.S)
            sys.exit(1)

         try:
            listfile  = open(list_jobs_stats_file, 'a')
            eventTime = int(time.time())
            fprintf(listfile, "%s\t%s\t%s\t%s\t%s\t%s\n" %(J.S, J.d, J.a, J.m, server, str(eventTime)) )
            listfile.close()
         except:
            self.messagelogger.write("ERROR: Cannot open job list %s file for sample \"%s\"!\n" %(list_file_name, J.S))
            print "ERROR: Cannot open job list %s file for sample \"%s\"!\n" %(list_file_name, J.S)
            sys.exit(1)


         
         if not J.S in list_to_add_to:
            list_to_add_to[J.S] = {}

         if not J.d in list_to_add_to[J.S]:
            list_to_add_to[J.S][J.d] = {}
        
         if not J.a in list_to_add_to[J.S][J.d]:
            list_to_add_to[J.S][J.d][J.a] = {}

         if not J.m in list_to_add_to[J.S][J.d][J.a]:
            list_to_add_to[J.S][J.d][J.a][J.m] = {}

         list_to_add_to[J.S][J.d][J.a][J.m][server] = eventTime
         return  True
def process_gbk_file(input_gbk, output_gbk, headers, gff_dictionary):
   
  tag = re.sub(r'[.]gbk','', input_gbk)
  tag = re.sub(r'.*/','', tag)

  output_gbk_file = open(output_gbk,'w') 
  serializer = genbank.GenBankRecordSerializer()
  with open(input_gbk, 'r') as genbank_file:
     out_list=[]
     count = 0
     for record in genbank.GenBankRecordParser(genbank_file.read()):
        count+=1
        
        record.locus = tag +  str(count)
        if count%1000==0:
           print 'Count = ' + str(count)
        
        if headers and 'REFERENCES' in headers:
           record.references_ = headers['REFERENCES']

        i = 0
        for feature in record.features:
           if feature.type =="CDS":
             if  feature.locus_tag in gff_dictionary:
                record.features[i].product= 'aaaaa ' + gff_dictionary[feature.locus_tag]['product']
           i+=1

        #record.locus = "hello"

        out_list.append(serializer.serialize(record))
        if count%1000 == 0:
           output_str = '\n'.join(out_list)
           out_list=[]
           fprintf(output_gbk_file,'%s\n',output_str)

     output_str = '\n'.join(out_list)
     fprintf(output_gbk_file,'%s\n',output_str)

     output_gbk_file.close()
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary,  contig, candidate_orf_pos,  orfid, compact_output):
   try:
      fields = [  'source', 'feature', 'start', 'end', 'score', 'strand', 'frame' ]
      _values = [   "M", "CDS",  "0", "30", "100.0", "+", "0" ]    
      values = {}
      for key, value in zip(fields, _values):
         values[key] = value


      #if compact_output:
      output_line = contig 


      for field in fields:
        # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
         output_line += "\t"+ values[field]

      #if compact_output:
      attributes = "ID="+orfid
      attributes += ";" + "locus_tag="+orfid

      attributes += ";" + "contig_length="+str(100)
      attributes += ";" + "orf_length="+str(30)
      attributes += ";" + "partial="+"00"
      attributes += ";" + "sourcedb="+candidatedbname
     
      attributes += ";" + "annotvalue="+str(results_dictionary[candidatedbname][orfid]['value'])
      attributes += ";" + "ec="+str(results_dictionary[candidatedbname][orfid]['ec'])
      attributes += ";" + "product="+results_dictionary[candidatedbname][orfid]['product']

      output_line += '\t' + attributes

      if candidatedbname in results_dictionary:
         fprintf(outputgff_file, "%s\n", output_line);
   except:
      eprintf("ERROR : Failure to annotate in contig %s\n", contig)
      #print orf_dictionary[contig]
      print traceback.print_exc(10)
      exit_process()
def print_counts_at_level(hierarchical_map,
                          field_to_description,
                          depth,
                          level,
                          outputfile,
                          printKey=True,
                          header=None):

    if type(hierarchical_map) is type(0):
        return hierarchical_map
    if header:
        fprintf(outputfile, "%s\n", header)

    count = 0
    for key in hierarchical_map:
        tempcount = print_counts_at_level(hierarchical_map[key],
                                          field_to_description,
                                          depth + 1,
                                          level,
                                          outputfile,
                                          printKey=printKey)
        if depth == level:
            if key in field_to_description:
                if printKey:
                    fprintf(
                        outputfile, "%s\n", key + '\t' +
                        field_to_description[key] + '\t' + str(tempcount))
                else:
                    fprintf(outputfile, "%s\n",
                            field_to_description[key] + '\t' + str(tempcount))
            else:
                if printKey:
                    fprintf(outputfile, "%s\n",
                            key + '\t' + ' ' + '\t' + str(tempcount))
                else:
                    fprintf(outputfile, "%s\n", key + '\t' + str(tempcount))
        count += tempcount
    return count
def print_counts_at_level(hierarchical_map, field_to_description,  depth, level, outputfile, printKey=True, header=None):

    if type(hierarchical_map) is type(0):
       return hierarchical_map
    if header:
       fprintf(outputfile, "%s\n",header )

    count = 0
    for key in hierarchical_map:
       tempcount = print_counts_at_level(hierarchical_map[key],field_to_description, depth+1, level, outputfile, printKey=printKey)
       if depth==level:
          if key in field_to_description:
              if printKey:
                 fprintf(outputfile, "%s\n", key + '\t' + field_to_description[key] + '\t' +  str(tempcount) )
              else:
                 fprintf(outputfile, "%s\n",  field_to_description[key] + '\t' +  str(tempcount) )
          else:
              if printKey:
                 fprintf(outputfile, "%s\n", key + '\t' + ' ' + '\t' + str(tempcount))
              else:
                 fprintf(outputfile, "%s\n", key +  '\t' + str(tempcount))
       count+=tempcount
    return count
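
print_counts_at_level() walks a nested dictionary whose leaves are integer counts and writes a total for every key found at the requested depth. A hypothetical call (keys, descriptions and the output path are invented, and the fprintf helper sketched near the top is assumed):

hierarchy = {'K00001': {'orf_1': 1, 'orf_2': 1}, 'K00002': {'orf_3': 1}}
descriptions = {'K00001': 'alcohol dehydrogenase'}

with open('counts.txt', 'w') as out:
    total = print_counts_at_level(hierarchy, descriptions, 0, 0, out,
                                  header="# KO\tdescription\tcount")
# counts.txt now lists each top-level key with its leaf total; total == 3
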
Example #39
def add_last_refscore_to_file(blast_table_out, refscore_file, allNames):
    commentPATTERN = re.compile(r'^#')

    infile = open(blast_table_out, 'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
        if commentPATTERN.match(line):
            continue
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print('Error in the blastout file')
            sys.exit(1)
        if fields[6].rstrip() == fields[1].rstrip():
            #    fprintf(refscore_file, "%s\t%s\n",fields[0], fields[11])
            refscores[fields[1]] = fields[0]

    for key, value in refscores.items():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)

    infile.close()
def add_last_refscore_to_file(blast_table_out, refscore_file, allNames):
    commentPATTERN = re.compile(r'^#')

    infile = open( blast_table_out,'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
       if commentPATTERN.match(line):
          continue
       line=line.rstrip()
       fields = line.split('\t')
       if len(fields) != 12:
          print 'Error in the blastout file'
          sys.exit(1)
       if fields[6].rstrip()==fields[1].rstrip():
      #    fprintf(refscore_file, "%s\t%s\n",fields[0], fields[11])
          refscores[fields[1]]=fields[0]

    for key, value in refscores.iteritems():
       allNames[key] = True
       fprintf(refscore_file, "%s\t%s\n",key, value)

    infile.close()
def create_gff_faa(tempfile, gfffile, faafile):
    patt = re.compile(r'>(.*)_(\d+)_(\d+)_([+-])')
    idpatt = re.compile(r'.*_(\d+_\d+)')

    with open(gfffile, 'w') as gffout:
      with open(faafile, 'w') as faaout:
        fastareader = FastaReader(tempfile)
        for fasta in fastareader:
          res=patt.search(fasta.name)
          if res:
             #nameprint(res.group(1),res.group(2), res.group(3), res.group(4))
             orfname=res.group(1)
             start=res.group(2)
             end=res.group(3)
             strand=res.group(4)
             res=idpatt.search(orfname)
             id=''
             if res:
                id=res.group(1)
             attr = "ID=" + id + ";partial=00"
             # write GFF and FAA records only for headers that matched the ORF pattern
             fields=[orfname, 'FGS+', 'CDS', start, end, '0', strand, "0", attr]

             fprintf(faaout,'>' + orfname + "\n" + fasta.sequence+"\n")
             fprintf(gffout,'\t'.join(fields) +'\n')
def make_sure_map_file_exists(config_settings, dbname, globallogger=None):
    dbmapFile = config_settings[
        'REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + 'formatted' + PATHDELIM + dbname + "-names.txt"
    seqFilePath = config_settings[
        'REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + dbname
    if not doFilesExist([dbmapFile]):
        eprintf("WARNING: Trying to create database map file for %s\n", dbname)
        if globallogger != None:
            globallogger.write(
                "WARNING: Trying to create database map file for %s\n" %
                (dbname))

        if not doFilesExist([seqFilePath]):
            eprintf(
                "ERROR : You do not even have the raw sequence for Database  %s to format!\n",
                dbname)
            eprintf("      : Make sure you have the file %s\n", seqFilePath)

            if globallogger != None:
                globallogger.write(
                    "ERROR \t You do not even have the raw sequence for Database  %s to format!\n"
                    % (dbname))
                globallogger.write("Make sure you have the file %s\n" %
                                   (seqFilePath))

            exit_process()

        mapfile = open(dbmapFile, 'w')
        seqFile = open(seqFilePath, 'r')
        for line in seqFile:
            if re.match(r'>', line):
                fprintf(mapfile, "%s\n", line.strip())
        seqFile.close()
        mapfile.close()

    return dbmapFile
Example #43
def create_gff_faa(tempfile, gfffile, faafile):
    patt = re.compile(r'>(.*)_(\d+)_(\d+)_([+-])')
    idpatt = re.compile(r'.*_(\d+_\d+)')

    with open(gfffile, 'w') as gffout:
      with open(faafile, 'w') as faaout:
        fastareader = FastaReader(tempfile)
        for fasta in fastareader:
          res=patt.search(fasta.name)
          if res:
             #nameprint(res.group(1),res.group(2), res.group(3), res.group(4))
             orfname=res.group(1)
             start=res.group(2)
             end=res.group(3)
             strand=res.group(4)
             res=idpatt.search(orfname)
             id=''
             if res:
                id=res.group(1)
             attr = "ID=" + id + ";partial=00"
             # write GFF and FAA records only for headers that matched the ORF pattern
             fields=[orfname, 'FGS+', 'CDS', start, end, '0', strand, "0", attr]

             fprintf(faaout,'>' + orfname + "\n" + fasta.sequence+"\n")
             fprintf(gffout,'\t'.join(fields) +'\n')
def writeParsedLines(fieldmapHeaderline, parsedLines, list, names, outputfilename):
    try:
      outputfile = open(outputfilename, 'w')
    except IOError:
      print "ERROR: Cannot create sequence file : " + outputfilename
      sys.exit(0)

    outputStr=fieldmapHeaderline + "\n"
    fprintf(outputfile, "%s", outputStr)

    outputStr=""
    i = 0
    for item in list:
       outputStr += parsedLines[item[0]]+'\n'
       if i% 1000==0 and i > 0:
          fprintf(outputfile, "%s", outputStr)
          outputStr=""
       i += 1

    if len(outputStr) > 0:
      fprintf(outputfile, "%s", outputStr)

    outputfile.close()
def writeParsedLines(fieldmapHeaderline, parsedLines, list, names, outputfilename):
    try:
      outputfile = open(outputfilename, 'w')
    except IOError:
      print "ERROR: Cannot create sequence file : " + outputfilename
      sys.exit(0)

    outputStr=fieldmapHeaderline + "\n"
    fprintf(outputfile, "%s", outputStr)

    outputStr=""
    i = 0
    for item in list:
       outputStr += parsedLines[item[0]]+'\n'
       if i% 1000==0 and i > 0:
          fprintf(outputfile, "%s", outputStr)
          outputStr=""
       i += 1

    if len(outputStr) > 0:
      fprintf(outputfile, "%s", outputStr)

    outputfile.close()
def process_blastoutput(dbname,
                        blastoutput,
                        mapfile,
                        refscore_file,
                        opts,
                        errorlogger=None):

    blastparser = BlastOutputParser(dbname,
                                    blastoutput,
                                    mapfile,
                                    refscore_file,
                                    opts,
                                    errorlogger=errorlogger)

    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')

    fields = [
        'target', 'q_length', 'bitscore', 'bsr', 'expect', 'aln_length',
        'identity', 'ec'
    ]
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = opts.parsed_output

    # temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp = output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w')
    except:
        if errorlogger:
            errorlogger.write(
                "PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n"
                % (output_blastoutput_parsed_tmp, dbname))
        exit_process(
            "PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n"
            % (output_blastoutput_parsed_tmp, dbname))

    # write the headers out
    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    pattern = re.compile(r'(\d+_\d+)$')

    count = 0
    uniques = {}
    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])

            result = pattern.search(data['query'])
            if result:
                name = result.group(1)
                uniques[name] = True
        except:
            print 'data is : ', data, '\n'
            return count, len(uniques)

        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)

    return count, len(uniques)
def main(argv, errorlogger = None, runstatslogger = None): 
    global parser
    (opts, args) = parser.parse_args(argv)

    if not valid_arguments(opts, args):
       print usage
       sys.exit(0)

    min_length = 0
    #inputfile = open(opts.input_fasta,'r')
    outfile = open(opts.output_fasta, 'w') 
    outfilefna = open(opts.output_fna, 'w') 
    outfilefaa = open(opts.output_faa, 'w') 
    outfilegff = open(opts.output_gff, 'w') 


    logfile = open(opts.log_file, 'w') 
    lengthsfile = open(opts.lengths_file, 'w') 
     
    if opts.map_file:
       mapfile = open(opts.map_file, 'w') 
    else:
       mapfile = None

    
    sample_name = opts.input_fasta
    # re.sub takes count as its 4th positional argument; pass re.I via flags=
    sample_name = re.sub(r'^.*/', '', sample_name, flags=re.I)
    sample_name = re.sub(r'^.*\\', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fasta$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fa$', '', sample_name, flags=re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    NUMSEQ = "#INFO\tNumber of sequences :"   
    NUMSEQ_SHORTER = "@INFO\tNumber of sequences shorter than minimum length of sequences"
    AVG_LENGTH= "@INFO\tAverage length of sequences:"
    MIN_LENGTH= "@INFO\tMinimum length of sequences:"
    MAX_LENGTH= "@INFO\tMaximum length of sequences:" 

    _MAX = 1000000000000
    stats = { 
              MIN_LENGTH: { 'BEFORE':_MAX, 'AFTER':_MAX },  
              MAX_LENGTH: { 'BEFORE': 0, 'AFTER':0 },  
              NUMSEQ : { 'BEFORE' :0, 'AFTER':0},   
              NUMSEQ_SHORTER : { 'BEFORE':0, 'AFTER':0 },
              AVG_LENGTH : { 'BEFORE':0, 'AFTER':0 },
            }  

    length_distribution = {}
    length_cumulative_distribution = {}

    for i in range(0,31):
        length_distribution[i]= 0
        length_cumulative_distribution[i]= 0

    seq_count = 0
    allNames= dict()
    outputStr = ""
    outputLines = []
    fastareader= FastaReader(opts.input_fasta)

    """ process one fasta sequence at a time """
    lengths_str=""
    for record in fastareader:
        seqname = record.name
        seq = record.sequence
        length = len(seq) 
        
        index = int(len(seq) / 50);
        if index >= 30:
            index = 30

        length_distribution[index] += 1
        if length < stats[MIN_LENGTH][BEFORE] :
            stats[MIN_LENGTH][BEFORE] = length

        if length > stats[MAX_LENGTH][BEFORE] : 
            stats[MAX_LENGTH][BEFORE] = length

        if length < min_length:
            stats[NUMSEQ_SHORTER][BEFORE] += 1

        stats[AVG_LENGTH][BEFORE]  =  stats[AVG_LENGTH][BEFORE] + length

        seqvalue = filter_sequence(seq)
    
        stats[NUMSEQ][BEFORE] += 1
        
        seqlen = len(seqvalue)
        if seqlen>= min_length :

           if len(lengths_str) > 100: 
              fprintf(lengthsfile,"%s\n",lengths_str);
              lengths_str = str(seqlen)
           else:
              lengths_str += '\t' + str(seqlen)

           stats[NUMSEQ][AFTER] += 1
           stats[AVG_LENGTH][AFTER]  =  stats[AVG_LENGTH][AFTER] + seqlen
           if mapfile==None:
              fprintf(outfile, "%s\n", seqname)
           else:
               contigID =  sample_name + '_' + str(seq_count) 
               orfID =  sample_name + '_' + str(seq_count) + "_0" 

               fprintf(outfile, ">%s\n",  contigID )
               fprintf(outfilefna, ">%s\n",  orfID )
               fprintf(outfilefaa, ">%s\n",  orfID )

               gffString =  sample_name + '_' + str(seq_count)
               gffString +=  "\t" + "AMINO_ACID_SEQ"
               gffString +=  "\t" + "CDS"
               gffString +=  "\t" + "0"
               gffString +=  "\t" + str(3*seqlen)
               gffString +=  "\t" + "0"
               gffString +=  "\t" + "+"
               gffString +=  "\t" + "0"
               gffString +=  "\t" + "ID=" + orfID + ";" 
               gffString +=  "locus_tag=" + orfID + ";" 
               gffString +=  "partial=00;" 
               gffString +=  "orf_length="+ str(seqlen)+";" 
               gffString +=  "contig_length="+ str(3*seqlen)

               fprintf(outfilegff, "%s\n", gffString)

               key = re.sub(r'^>','',seqname)
               fprintf(mapfile, "%s\n", sample_name+ '_' + str(seq_count) + '\t' + key + '\t' + str(seqlen))
               seq_count += 1

           fprintf(outfile, "%s\n","DUMMY CONTIGS FOR AMINO ACID SEQUENCES")
           fprintf(outfilefna, "%s\n","DUMMY ORFS FOR AMINO ACID SEQUENCES")
           fprintf(outfilefaa, "%s\n",seqvalue)

           if  seqlen < stats[MIN_LENGTH][AFTER] :
               stats[MIN_LENGTH][AFTER] = seqlen
             
           if  seqlen > stats[MAX_LENGTH][AFTER] :
               stats[MAX_LENGTH][AFTER] = seqlen

    print 'done'
    fprintf(lengthsfile,"%s\n",lengths_str);

    if stats[NUMSEQ][BEFORE] > 0 :
      stats[AVG_LENGTH][BEFORE]  = stats[AVG_LENGTH][BEFORE]/stats[NUMSEQ][BEFORE]
    else:
      stats[AVG_LENGTH][BEFORE]  = 0
    if stats[NUMSEQ][AFTER] > 0 :
       stats[AVG_LENGTH][AFTER]  = stats[AVG_LENGTH][AFTER]/stats[NUMSEQ][AFTER]
    else :
       stats[AVG_LENGTH][AFTER]  = 0

    lengthsfile.close()
    outfile.close()
    outfilefna.close()
    outfilefaa.close()
    outfilegff.close()

    #inputfile.close()
    if mapfile != None:
       mapfile.close()

    """ min length """
    if stats[MIN_LENGTH][BEFORE] == _MAX:
       stats[MIN_LENGTH][BEFORE] = 0
    if stats[MIN_LENGTH][AFTER] == _MAX:
       stats[MIN_LENGTH][AFTER] = 0

    fprintf(logfile, "@INFO\tBEFORE\tAFTER\n");
    fprintf(logfile, "%s\n", NUMSEQ +'\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]));
    fprintf(logfile, "%s\n", NUMSEQ_SHORTER   + '\t'+ str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(logfile, "%s\n", AVG_LENGTH +'\t' + str(stats[AVG_LENGTH][BEFORE]) + '\t'+ str(stats[AVG_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) +'\t'+ str(stats[MIN_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MAX_LENGTH +'\t'+ str(stats[MAX_LENGTH][BEFORE]) + '\t' +  str(stats[MAX_LENGTH][AFTER]))
    fprintf(logfile, "@INFO\tLOW\tHIGH\tFREQUENCY\tCUMULATIVE_FREQUENCY\n");
#    fprintf(logfile, "#   ---\t-----\t--------\t---------\t----------\n");

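    # build the cumulative length distribution from the top 50-unit bin downwards, so that
    # length_cumulative_distribution[i] counts the sequences of length >= i*50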
    i  = 30
    length_cumulative_distribution[i] = length_distribution[i]
    i  -= 1
    while i >= 0:
       length_cumulative_distribution[i] = length_cumulative_distribution[i+1] + length_distribution[i];
       i -= 1

    for i in range(0,31):
       fprintf(logfile, "   %s\n", str(i*50) + '\t' + str((i+1)*50) + '\t' +\
                str(length_distribution[i]) +'\t' + str(length_cumulative_distribution[i]) )

    logfile.close()


    seqtype='amino'
    """priority is used to sort the output to print in the right order"""
    priority = 2000

    if runstatslogger != None:
         runstatslogger.write("%s\tSequences BEFORE Filtering (%s)\t%s\n" %(str(priority), seqtype,  str(stats[NUMSEQ][BEFORE])) )
         runstatslogger.write("%s\tmin length\t%s\n" %(str(priority + 1), str(stats[MIN_LENGTH][BEFORE])) )
         runstatslogger.write("%s\tavg length\t%s\n" %( str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
         runstatslogger.write("%s\tmax length\t%s\n" %(str(priority + 3), str(stats[MAX_LENGTH][BEFORE])) )
         runstatslogger.write("%s\ttot length\t%s\n" %(str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE]* stats[NUMSEQ][BEFORE]))))
         runstatslogger.write("%s\tSequences AFTER Filtering (%s)\t%s\n" %(str(priority + 5), seqtype, str(stats[NUMSEQ][AFTER])))
         runstatslogger.write("%s\tmin length\t%s\n" %(str(priority + 6), str(stats[MIN_LENGTH][AFTER])) )
         runstatslogger.write("%s\tavg length\t%s\n" %( str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
         runstatslogger.write("%s\tmax length\t%s\n" %( str(priority + 8), str(stats[MAX_LENGTH][AFTER])) )
         runstatslogger.write("%s\ttot length\t%s\n" %( str(priority + 9), str(int(stats[AVG_LENGTH][AFTER]* stats[NUMSEQ][AFTER])) ))
def create_annotation(dbname_weight, results_dictionary, input_gff,  rRNA_16S_stats_files, tRNA_stats_files,  output_gff, output_comparative_annotation, contig_lengths, compact_output = False):
    orf_dictionary={}
#    process_gff_file(input_gff, orf_dictionary)
    gffreader = GffFileParser(input_gff)

    output_gff_tmp = output_gff + ".tmp"
    outputgff_file = open( output_gff_tmp, 'w')
    output_comp_annot_file1 = open( output_comparative_annotation + '.1.txt', 'w')
    output_comp_annot_file2 = open( output_comparative_annotation + '.2.txt', 'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1,'%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
         weight = dbname_weight[dbname]
         output_comp_annot_file2_Str += '\t{0}(EC) \t{0}(product)\t{0}(value)'.format(dbname)
    fprintf(output_comp_annot_file2,'%s\n', output_comp_annot_file2_Str)
       

#    gffreader = GffReader(input_gff)
   # for dbname in dbnames:
   #   print dbname, len(results_dictionary[dbname].keys())
   #   print results_dictionary[dbname].keys()
    i = 0
    for contig in  gffreader:
       count = 0
       for orf in  gffreader.orf_dictionary[contig]:
         value = 0.0001
         success =False
         output_comp_annot_file1_Str = ''
         output_comp_annot_file2_Str = ''
         for dbname in dbnames:
            weight = dbname_weight[dbname]
            value = 0
            orf_id = orf['id']
            if orf_id in results_dictionary[dbname]:
                if value < results_dictionary[dbname][orf_id]['value']:
                    value = results_dictionary[dbname][orf_id]['value']
                    candidatedbname=dbname
                    success =True
                    candidate_orf_pos = count 

                    if output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format('', dbname,\
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                    else:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf_id, dbname,\
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))


                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(\
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf_id, 
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))

            else: 
                if not output_comp_annot_file1_Str:
                   output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf_id, '','','','')

                if output_comp_annot_file2_Str:
                   output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format('', '','')
                else:
                   output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf_id, '','','','')

         
         if success:  # there was a database hit
            fprintf(output_comp_annot_file1,'%s\n', output_comp_annot_file1_Str)
            fprintf(output_comp_annot_file2,'%s\n', output_comp_annot_file2_Str)
            write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, gffreader.orf_dictionary, contig, candidate_orf_pos,  orf_id, compact_output=compact_output) 
         else:   # if it was not  a hit then it is a hypothetical protein
            #print gffreader.orf_dictionary
            write_annotation_for_orf(outputgff_file, 'None', '0', results_dictionary, gffreader.orf_dictionary, contig, count, orf_id, compact_output = compact_output) 
         
         count +=1  #move to the next orf

       #del orf_dictionary[contig]   
    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences  if there is rRNA stats file
    if len(rRNA_16S_stats_files) > 0 and contig_lengths :
       rRNA_16S_dictionary={} 
       for rRNA_16S_stats_file in rRNA_16S_stats_files:
          process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)

       rRNA_dictionary = {}
       add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary, contig_lengths) 
       write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences  if there is tRNA stats file
    if len(tRNA_stats_files) > 0 and contig_lengths:
       tRNA_dictionary={} 
       for tRNA_stats_file in tRNA_stats_files:
          process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)

       tRNA_gff_dictionary = {}
       add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary, contig_lengths) 
       write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')
       #print tRNA_dictionary


    outputgff_file.close()     
    rename(output_gff_tmp, output_gff)
Example #49
0
def formatDB(tools,
             db,
             refdbspath,
             seqType,
             dbType,
             algorithm,
             configs,
             logger=None):
    """ Formats the sequences for the specified algorithm """
    EXECUTABLES_DIR = configs['METAPATHWAYS_PATH'] + PATHDELIM + configs[
        'EXECUTABLES_DIR']
    formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH'][
        'exec']['BLAST']['FORMATDB_EXECUTABLE']
    if seqType == 'nucl':
        if algorithm == 'LAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools[
                'FUNC_SEARCH']['exec']['LAST']['LASTDB_EXECUTABLE']
        if algorithm == 'BLAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools[
                'FUNC_SEARCH']['exec']['BLAST']['FORMATDB_EXECUTABLE']

    if seqType == 'prot':
        if algorithm == 'LAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools[
                'FUNC_SEARCH']['exec']['LAST']['LASTDB_EXECUTABLE']
        if algorithm == 'BLAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools[
                'FUNC_SEARCH']['exec']['BLAST']['FORMATDB_EXECUTABLE']

    formatted_db = refdbspath + PATHDELIM + dbType + PATHDELIM + 'formatted' + PATHDELIM + db
    raw_sequence_file = refdbspath + PATHDELIM + dbType + PATHDELIM + db

    _temp_formatted_db = formatted_db + "__temp__"
    """ format with 4GB file size """
    cmd = ""
    if algorithm == 'BLAST':
        cmd = '%s -dbtype %s -max_file_sz 4294967296  -in %s -out %s' % (
            formatdb_executable, seqType, raw_sequence_file,
            _temp_formatted_db)
        #cmd='%s -dbtype %s -max_file_sz 20267296  -in %s -out %s' %(formatdb_executable, seqType, raw_sequence_file, _temp_formatted_db)

    if algorithm == 'LAST':
        # dirname = os.path.dirname(raw_sequence_file)
        cmd = ""
        if seqType == "prot":
            cmd = '%s -s 4000M -p -c %s  %s' % (
                formatdb_executable, _temp_formatted_db, raw_sequence_file)
        if seqType == "nucl":
            cmd = '%s -s 4000M -c %s  %s' % (
                formatdb_executable, _temp_formatted_db, raw_sequence_file)

        eprintf("INFO\tCommand to format \"%s\"\n", cmd)
        logger.printf("INFO\tCommand to format \"%s\"\n", cmd)

    result = getstatusoutput(cmd)
    temp_fileList = glob(_temp_formatted_db + '*')

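    # BLAST alias files (.pal for protein, .nal for nucleotide) list the database volumes by name;
    # rewrite the alias without the __temp__ suffix so it matches the volumes renamed below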
    _formatted_db_pal = _temp_formatted_db + ".pal"
    if algorithm == 'BLAST' and path.exists(_formatted_db_pal):
        try:

            formatted_db_pal = formatted_db + ".pal"
            if seqType == "nucl":
                formatted_db_pal = formatted_db + ".nal"

            _openpal = open(_formatted_db_pal, 'r')
            openpal = open(formatted_db_pal, 'w')
            lines = _openpal.readlines()
            tempPATT = re.compile(r'__temp__')
            for line in lines:
                _result = tempPATT.search(line)
                modline = line.strip()
                if _result:
                    modline = re.sub('__temp__', '', modline)
                fprintf(openpal, "%s\n", modline)
            openpal.close()
            _openpal.close()
            remove(_formatted_db_pal)
        except:
            return False

    try:
        temp_fileList = glob(_temp_formatted_db + '*')
        for tempFile in temp_fileList:
            file = re.sub('__temp__', '', tempFile)
            rename(tempFile, file)
    except:
        return False

    if result[0] == 0:
        eprintf("INFO\tFormatted database %s successfully for %s\n",
                sQuote(db), sQuote(algorithm))
        logger.printf("INFO\tFormatted database %s successfully for %s\n",
                      sQuote(db), sQuote(algorithm))
        return True
    else:
        eprintf("INFO\tFailed to Format database %s for %s\n", sQuote(db),
                sQuote(algorithm))
        eprintf("INFO\tReason for failure %s\n", result[1])
        logger.printf("INFO\tReason for failure %s\n", result[1])
        logger.printf("INFO\tFailed to Format database %s for %s\n",
                      sQuote(db), sQuote(algorithm))
        return False
def create_annotation(results_dictionary, dbname,  annotated_gff,  output_dir, Taxons, orfsPicked, orfToContig, lca):

    meganTree = None
    #lca.set_results_dictionary(results_dictionary)
    if not path.exists(output_dir):
        makedirs(output_dir)

    orf_dictionary={}
    #process_gff_file(annotated_gff, orf_dictionary)
    gffreader = GffFileParser(annotated_gff)
    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'a')

    count = 0
    for contig in gffreader:
        #    shortORFId = getShortORFId(orf['id'])
        for orf in gffreader.orf_dictionary[contig]:
            shortORFId = getShortORFId(orf['id'])
            if shortORFId not in orfsPicked:
                continue

            orfToContig[shortORFId] = contig

            taxonomy = None

            #_results = re.search(r'refseq', opts_global.database_name, re.I)
            if shortORFId in Taxons:
                taxonomy1=Taxons[shortORFId]
                taxonomy_id=lca.get_supported_taxon(taxonomy1, return_id=True)
                # print taxonomy_id
                preferred_taxonomy = lca.get_preferred_taxonomy(taxonomy_id)
                if preferred_taxonomy:
                    taxonomy = preferred_taxonomy
                else:
                    taxonomy = Taxons[shortORFId]
            else:
                taxonomy = 'root'
            # product = re.sub(r'\[{1,2}.+?\]{1,2}','', orf['product']).strip()
            product = re.sub(r'\[[^\[]+?\]','', orf['product']).strip()
            # if "partial" in orf['product']:
            #     print orf['product'].strip()
            #     print product
            fprintf(output_table_file, "%s", orf['id'])
            fprintf(output_table_file, "\t%s", orf['orf_length'])
            fprintf(output_table_file, "\t%s", orf['start'])
            fprintf(output_table_file, "\t%s", orf['end'])
            fprintf(output_table_file, "\t%s", orf['seqname'])
            fprintf(output_table_file, "\t%s", orf['contig_length'])
            fprintf(output_table_file, "\t%s", orf['strand'])
            fprintf(output_table_file, "\t%s", orf['ec'])
            # fprintf(output_table_file, "\t%s", str(species))
            fprintf(output_table_file, "\t%s", taxonomy)
            fprintf(output_table_file, "\t%s\n", product)

            # adding taxons to the megan tree
            #if meganTree and taxonomy != '':
            #    meganTree.insertTaxon(taxonomy)
            #print meganTree.getChildToParentMap()

    output_table_file.close()
Example #51
0
 def add_job_to_list_jobs(self, J, listfile):
     fprintf(listfile,
             "%s\t%s\t%s\t%s\n" % (J.S, J.d, J.a, self.getAlgorithm(J.S)))
     return True
def main(argv, errorlogger = None, runstatslogger = None): 
    global parser
    global errorcode

    (opts, args) = parser.parse_args(argv)

    if not valid_arguments(opts, args):
       print usage
       sys.exit(0)

    min_length = opts.min_length
    outfile = open(opts.output_fasta + '.tmp', 'w') 
    logfile = open(opts.log_file, 'w') 
    lengthsfile = open(opts.lengths_file + '.tmp', 'w') 
     

    if opts.map_file:
       mapfile = open(opts.map_file, 'w') 
    else:
       mapfile = None

    if opts.seqtype=='nucleotide':
        errorcode = 1
    else:
        errorcode = 3
    
    sample_name = opts.input_fasta
    sample_name = re.sub(r'^.*/', '', sample_name)
    sample_name = re.sub(r'^.*\\', '', sample_name)
    sample_name = re.sub(r'\.fasta$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fa$', '', sample_name, flags=re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    NUMSEQ = "#INFO\tNumber of sequences :"   
    NUMSEQ_SHORTER = "@INFO\tNumber of sequences shorter than minimum length of sequences"
    AVG_LENGTH= "@INFO\tAverage length of sequences:"
    MIN_LENGTH= "@INFO\tMinimum length of sequences:"
    MAX_LENGTH= "@INFO\tMaximum length of sequences:" 

    _MAX = 1000000000000
    stats = { 
              MIN_LENGTH: { 'BEFORE':_MAX, 'AFTER':_MAX },  
              MAX_LENGTH: { 'BEFORE': 0, 'AFTER':0 },  
              NUMSEQ : { 'BEFORE' :0, 'AFTER':0},   
              NUMSEQ_SHORTER : { 'BEFORE':0, 'AFTER':0 },
              AVG_LENGTH : { 'BEFORE':0, 'AFTER':0 },
            }  

    length_distribution = {}
    length_cumulative_distribution = {}

    for i in range(0,31):
        length_distribution[i]= 0
        length_cumulative_distribution[i]= 0

    seq_count = 0
    allNames= dict()
    outputStr = ""
    outputLines = []
    fastareader= FastaReader(opts.input_fasta)

    """ process one fasta sequence at a time """
    lengths_str=""
    for record in fastareader:
        seqname = record.name
        seq = record.sequence
        length = len(seq)
        
        index = int(len(seq) / 50);
        if index >= 30:
            index = 30

        length_distribution[index] += 1
        if length < stats[MIN_LENGTH][BEFORE] :
            stats[MIN_LENGTH][BEFORE] = length

        if length > stats[MAX_LENGTH][BEFORE] : 
            stats[MAX_LENGTH][BEFORE] = length

        if length < min_length:
            stats[NUMSEQ_SHORTER][BEFORE] += 1

        stats[AVG_LENGTH][BEFORE]  =  stats[AVG_LENGTH][BEFORE] + length

        #stopped the filtering process seqvalue = filter_sequence(seq)
        seqvalue = seq.upper()
    
        stats[NUMSEQ][BEFORE] += 1
        
        seqlen = len(seqvalue)
        if seqlen>= min_length :

           if len(lengths_str) > 100: 
              fprintf(lengthsfile,"%s\n",lengths_str);
              lengths_str = str(seqlen)
           else:
              lengths_str += '\t' + str(seqlen)

           stats[NUMSEQ][AFTER] += 1
           stats[AVG_LENGTH][AFTER]  =  stats[AVG_LENGTH][AFTER] + seqlen
           if mapfile==None:
              fprintf(outfile, "%s\n", seqname)
           else:
               fprintf(outfile, ">%s\n",  sample_name + '_' + str(seq_count) )
               key = re.sub(r'^>','',seqname)
               fprintf(mapfile, "%s\n", sample_name+ '_' + str(seq_count) + '\t' + key + '\t' + str(seqlen))
               seq_count += 1

           fprintf(outfile, "%s\n",seqvalue)

           if  seqlen < stats[MIN_LENGTH][AFTER] :
               stats[MIN_LENGTH][AFTER] = seqlen
             
           if  seqlen > stats[MAX_LENGTH][AFTER] :
               stats[MAX_LENGTH][AFTER] = seqlen

    fprintf(lengthsfile,"%s\n",lengths_str);
    
    if stats[NUMSEQ][BEFORE] > 0 :
      stats[AVG_LENGTH][BEFORE]  = stats[AVG_LENGTH][BEFORE]/stats[NUMSEQ][BEFORE]
    else:
      stats[AVG_LENGTH][BEFORE]  = 0
    if stats[NUMSEQ][AFTER] > 0 :
       stats[AVG_LENGTH][AFTER]  = stats[AVG_LENGTH][AFTER]/stats[NUMSEQ][AFTER]
    else :
       stats[AVG_LENGTH][AFTER]  = 0

    lengthsfile.close()
    outfile.close()

    rename(opts.output_fasta + ".tmp", opts.output_fasta)
    rename(opts.lengths_file + ".tmp", opts.lengths_file)

    #inputfile.close()
    if mapfile != None:
       mapfile.close()

    """ min length """
    if stats[MIN_LENGTH][BEFORE] == _MAX:
       stats[MIN_LENGTH][BEFORE] = 0
    if stats[MIN_LENGTH][AFTER] == _MAX:
       stats[MIN_LENGTH][AFTER] = 0

    fprintf(logfile, "@INFO\tBEFORE\tAFTER\n");
    fprintf(logfile, "%s\n", NUMSEQ +'\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]));
    fprintf(logfile, "%s\n", NUMSEQ_SHORTER   + '\t'+ str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(logfile, "%s\n", AVG_LENGTH +'\t' + str(stats[AVG_LENGTH][BEFORE]) + '\t'+ str(stats[AVG_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) +'\t'+ str(stats[MIN_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MAX_LENGTH +'\t'+ str(stats[MAX_LENGTH][BEFORE]) + '\t' +  str(stats[MAX_LENGTH][AFTER]))
    fprintf(logfile, "@INFO\tLOW\tHIGH\tFREQUENCY\tCUMULATIVE_FREQUENCY\n");
#    fprintf(logfile, "#   ---\t-----\t--------\t---------\t----------\n");

    i  = 30
    length_cumulative_distribution[i] = length_distribution[i]
    i  -= 1
    while i >= 0:
       length_cumulative_distribution[i] = length_cumulative_distribution[i+1] + length_distribution[i];
       i -= 1

    for i in range(0,31):
       fprintf(logfile, "   %s\n", str(i*50) + '\t' + str((i+1)*50) + '\t' +\
                str(length_distribution[i]) +'\t' + str(length_cumulative_distribution[i]) )

    logfile.close()


    if opts.seqtype=='nucleotide':
       priority = 1000
    else:
       priority = 2000

    if runstatslogger != None:
       if opts.seqtype=='nucleotide':
         runstatslogger.write("%s\tNumber of sequences in input file BEFORE QC (%s)\t%s\n" %(str(priority), opts.seqtype,  str(stats[NUMSEQ][BEFORE])) )
         runstatslogger.write("%s\t-min length\t%s\n" %(str(priority + 1), str(stats[MIN_LENGTH][BEFORE])) )
         runstatslogger.write("%s\t-avg length\t%s\n" %( str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
         runstatslogger.write("%s\t-max length\t%s\n" %(str(priority + 3), str(stats[MAX_LENGTH][BEFORE])) )
         runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" %(str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE]* stats[NUMSEQ][BEFORE]))))

         runstatslogger.write("%s\tNumber of sequences AFTER QC (%s)\t%s\n" %(str(priority + 5), opts.seqtype, str(stats[NUMSEQ][AFTER])))
         runstatslogger.write("%s\t-min length\t%s\n" %(str(priority + 6), str(stats[MIN_LENGTH][AFTER])) )
         runstatslogger.write("%s\t-avg length\t%s\n" %( str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
         runstatslogger.write("%s\t-max length\t%s\n" %( str(priority + 8), str(stats[MAX_LENGTH][AFTER])) )
         runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" %( str(priority + 9), str(int(stats[AVG_LENGTH][AFTER]* stats[NUMSEQ][AFTER])) ))
       else:
         runstatslogger.write("%s\tNumber of translated ORFs BEFORE QC (%s)\t%s\n" %(str(priority), opts.seqtype,  str(stats[NUMSEQ][BEFORE])) )
         runstatslogger.write("%s\t-min length\t%s\n" %(str(priority + 1), str(stats[MIN_LENGTH][BEFORE])) )
         runstatslogger.write("%s\t-avg length\t%s\n" %( str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
         runstatslogger.write("%s\t-max length\t%s\n" %(str(priority + 3), str(stats[MAX_LENGTH][BEFORE])) )
         runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" %(str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE]* stats[NUMSEQ][BEFORE]))))
         runstatslogger.write("%s\tNumber of translated ORFs AFTER QC (%s)\t%s\n" %(str(priority + 5), opts.seqtype, str(stats[NUMSEQ][AFTER])))
         runstatslogger.write("%s\t-min length\t%s\n" %(str(priority + 6), str(stats[MIN_LENGTH][AFTER])) )
         runstatslogger.write("%s\t-avg length\t%s\n" %( str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
         runstatslogger.write("%s\t-max length\t%s\n" %( str(priority + 8), str(stats[MAX_LENGTH][AFTER])) )
         runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" %( str(priority + 9), str(int(stats[AVG_LENGTH][AFTER]* stats[NUMSEQ][AFTER])) ))
def main(argv, errorlogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)

    if not valid_arguments(opts, args):
        print usage
        sys.exit(0)

    sample_name = opts.sample_name
    folder_path = opts.folder_path
    results = []

    try:
        STEP_NAME = "GATHER_STATS"
        # read the nucleotide sequence stats
        status = get_stats_from_stats_file(sample_name, folder_path, 'nuc')
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tCannot read nuc stats file\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the amino acid sequence stats
        status = get_stats_from_stats_file(sample_name, folder_path, 'amino')
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tCannot read amino stats file\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the blast/last hits
        status = get_BLAST_LAST_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tReading BLAST HITS\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the selected parsed blast/last hits
        status = get_BLAST_LAST_parsed_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write(
                "%s\tERROR\tReading parsed BLAST HITS\t%s" %
                (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the annotated gff hits
        status = get_annotation_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the annotated gff hits
        status = get_functional_taxonomic_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the number of ORFs that are used for mapping to functional categories
        status = get_ORF_annotations_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the rRNA hits
        status = get_rRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the tRNA hits
        status = get_tRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        stats_file_name = folder_path + PATHDELIM + 'run_statistics' + PATHDELIM + sample_name + '.run.stats.txt'

        try:

            statsfilename = open(stats_file_name, 'w')
        except:
            print "ERRROR : Cannot open stats file format " + stats_file_name
            sys.exit(0)

        for pair in results:
            fprintf(statsfilename, '%s\t%s\n', pair[0], pair[1])
        statsfilename.close()
    except:
        exit_process()
def create_annotation(dbname_weight,
                      results_dictionary,
                      input_gff,
                      rRNA_16S_stats_files,
                      tRNA_stats_files,
                      output_gff,
                      output_comparative_annotation,
                      contig_lengths,
                      compact_output=False):
    orf_dictionary = {}
    #    process_gff_file(input_gff, orf_dictionary)
    gffreader = GffFileParser(input_gff)

    output_gff_tmp = output_gff + ".tmp"
    outputgff_file = open(output_gff_tmp, 'w')
    output_comp_annot_file1 = open(output_comparative_annotation + '.1.txt',
                                   'w')
    output_comp_annot_file2 = open(output_comparative_annotation + '.2.txt',
                                   'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
        weight = dbname_weight[dbname]
        output_comp_annot_file2_Str += '\t{0}(EC) \t{0}(product)\t{0}(value)'.format(
            dbname)
    fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)

    #    gffreader = GffReader(input_gff)
    # for dbname in dbnames:
    #   print dbname, len(results_dictionary[dbname].keys())
    #   print results_dictionary[dbname].keys()
    i = 0
    for contig in gffreader:
        count = 0
        for orf in gffreader.orf_dictionary[contig]:
            value = 0.0001
            success = False
            output_comp_annot_file1_Str = ''
            output_comp_annot_file2_Str = ''
            for dbname in dbnames:
                weight = dbname_weight[dbname]
                value = 0
                orf_id = orf['id']
                if orf_id in results_dictionary[dbname]:
                    if value < results_dictionary[dbname][orf_id]['value']:
                        value = results_dictionary[dbname][orf_id]['value']
                        candidatedbname = dbname
                        success = True
                        candidate_orf_pos = count

                        if output_comp_annot_file1_Str:
                            output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format('', dbname,\
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                        else:
                            output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf_id, dbname,\
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))

                        if output_comp_annot_file2_Str:
                            output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(\
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                        else:
                            output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf_id,
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))

                else:
                    if not output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            orf_id, '', '', '', '')

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(
                            '', '', '')
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(
                            orf_id, '', '', '', '')

            if success:  # there was a database hit
                fprintf(output_comp_annot_file1, '%s\n',
                        output_comp_annot_file1_Str)
                fprintf(output_comp_annot_file2, '%s\n',
                        output_comp_annot_file2_Str)
                write_annotation_for_orf(outputgff_file,
                                         candidatedbname,
                                         dbname_weight,
                                         results_dictionary,
                                         gffreader.orf_dictionary,
                                         contig,
                                         candidate_orf_pos,
                                         orf_id,
                                         compact_output=compact_output)
            else:  # if it was not  a hit then it is a hypothetical protein
                #print gffreader.orf_dictionary
                write_annotation_for_orf(outputgff_file,
                                         'None',
                                         '0',
                                         results_dictionary,
                                         gffreader.orf_dictionary,
                                         contig,
                                         count,
                                         orf_id,
                                         compact_output=compact_output)

            count += 1  #move to the next orf

        #del orf_dictionary[contig]
    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences  if there is rRNA stats file
    if len(rRNA_16S_stats_files) > 0 and contig_lengths:
        rRNA_16S_dictionary = {}
        for rRNA_16S_stats_file in rRNA_16S_stats_files:
            process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)

        rRNA_dictionary = {}
        add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary, contig_lengths)
        write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences  if there is tRNA stats file
    if len(tRNA_stats_files) > 0 and contig_lengths:
        tRNA_dictionary = {}
        for tRNA_stats_file in tRNA_stats_files:
            process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)

        tRNA_gff_dictionary = {}
        add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary, contig_lengths)
        write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')
        #print tRNA_dictionary

    outputgff_file.close()
    rename(output_gff_tmp, output_gff)
def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks, verbose=False, errorlogger = None):
    linecount = 0
    readerhandles = []

    if verbose:
       eprintf("Processing for database  : %s\n", dbname)

    if len(filenames)==0:
       eprintf("WARNING : Cannot find any B/LAST output file for database : %\n", dbname)
       exit_process()

    try:
       for i in range(len(filenames)):
         #print filenames
         readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]) )
    except OSError:
      eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
      exit_process()

    # set error and warning parameters 
    for readerhandle in readerhandles:
        readerhandle.setMaxErrorsLimit(5)
        readerhandle.setErrorAndWarningLogger(errorlogger)
        readerhandle.setSTEP_NAME('PARSE BLAST')

    try:
       outputfile = open(outputfilename, 'w')
       fieldmapHeaderLine = readerhandles[0].getHeaderLine()
       fprintf(outputfile, "%s\n",fieldmapHeaderLine)
    except OSError:
       eprintf("ERROR: Cannot create sequence file : %s\n", outputfilename)
       exit_process()

    values = []
    for i in range(len(filenames)):
       iterate = iter(readerhandles[i])
       try :
          next(iterate)
          line = readerhandles[i].getProcessedLine()
          fields  = [ x.strip() for x in line.split('\t') ]
          shortORFId = getShortORFId(fields[0])
          values.append( (i, orfRanks[shortORFId], line) )
       except:
          outputfile.close()
          return

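    # k-way merge: 'values' holds one (file index, ORF rank, line) triple per open parser;
    # BuildHeap/Heapify keep it ordered as a min-heap on the rank so lines are emitted in
    # global ORF order (a minimal sketch of these helpers follows this function)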
    S = len(filenames)
    BuildHeap(S, values)

    while S>0:
       try:
          iterate = iter(readerhandles[values[0][0]])
          line = readerhandles[values[0][0]].getProcessedLine()
          fields  = [ x.strip() for x in line.split('\t') ]
          #print fields[0], orfRanks[fields[0]]
          fprintf(outputfile, "%s\n",line)
          next(iterate)

          line = readerhandles[values[0][0]].getProcessedLine()
          fields  = [ x.strip() for x in line.split('\t') ]
          shortORFId = getShortORFId(fields[0])
          values[0] = (values[0][0], orfRanks[shortORFId], line)
       except:
          #import traceback
          #traceback.print_exc()
          #print 'finished ' + str(S)
          values[0] = values[S-1]
          S = S - 1

       if S>0:
          Heapify(values, 0, S)

    #print 'line count ' + str(linecount)
    outputfile.close()
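
# BuildHeap and Heapify are provided elsewhere in the pipeline. The sketch below is only a
# hedged reference for the assumed behaviour (a binary min-heap over the first S tuples of
# 'values', keyed on the ORF rank stored at index 1); it is not the pipeline's own code.
def Heapify(values, i, S):
    # sift values[i] down within the first S entries so the smallest rank ends up at the root
    while True:
        left, right, smallest = 2 * i + 1, 2 * i + 2, i
        if left < S and values[left][1] < values[smallest][1]:
            smallest = left
        if right < S and values[right][1] < values[smallest][1]:
            smallest = right
        if smallest == i:
            return
        values[i], values[smallest] = values[smallest], values[i]
        i = smallest

def BuildHeap(S, values):
    # heapify bottom-up: after the loop every subtree is a valid min-heap
    for i in range(S // 2 - 1, -1, -1):
        Heapify(values, i, S)
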
def main(argv, errorlogger = None,  runstatslogger = None):
    global parser
    (opts, args) = parser.parse_args(argv)
    global opts_global
    opts_global = opts
    if not check_arguments(opts, args):
       print usage
       sys.exit(0)


    db_to_map_Maps =  {'cog':opts.input_cog_maps, 'seed':opts.input_seed_maps, 'kegg':opts.input_kegg_maps, 'cazy':opts.input_cazy_maps}


    results_dictionary={}
    dbname_weight={}

    checkOrCreateFolder(opts.output_dir)
    output_table_file = open(opts.output_dir + PATHDELIM +'functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")
    output_table_file.close()

#    print "memory used  = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000)
    listOfOrfs =  get_list_of_queries(opts.input_annotated_gff)
    listOfOrfs.sort()

    #printlist(listOfOrfs,5)
    #sys.exit(0)


##### uncomment the following lines
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
      create_sorted_parse_blast_files(dbname, blastoutput, listOfOrfs, verbose= opts.verbose, errorlogger = errorlogger)
#####

    # process in blocks of size _stride
    lca = LCAComputation(opts.ncbi_taxonomy_map, opts.ncbi_megan_map)
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)
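    # LCA parameters (presumably MEGAN-style): a minimum hit score, a top-percent window
    # around the best hit, and the minimum number of supporting ORFs a taxon needs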

    blastParsers={}
    for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
        blastParsers[dbname] =  BlastOutputTsvParser(dbname, blastoutput + '.tmp')
        blastParsers[dbname].setMaxErrorsLimit(5)
        blastParsers[dbname].setErrorAndWarningLogger(errorlogger)

    # this part of the code computes the occurrence of each taxon, which is
    # used in the later stage to evaluate the min support, in the same way
    # as the MEGAN software
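    # ORFs are handled in chunks of _stride (100,000 at a time) to bound memory use;
    # pickorfs maps each ORF in the current chunk to its provisional taxon ('root' to start)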

    start = 0
    Length = len(listOfOrfs)
    _stride = 100000
    Taxons = {}
    while start < Length:
       pickorfs= {}
       last =  min(Length, start + _stride)
       for i in range(start, last):
          pickorfs[listOfOrfs[i]]= 'root'
       start = last
       #print 'Num of Min support orfs ' + str(start)

       results_dictionary={}
       for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
          results = re.search(r'refseq', dbname, re.I)
          if results:
          #if True:
            try:
               results_dictionary[dbname]={}
               process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs)
               #print results_dictionary[dbname].keys()[1:5]
               lca.set_results_dictionary(results_dictionary)
               lca.compute_min_support_tree(opts.input_annotated_gff, pickorfs, dbname = dbname )
               for key, taxon  in pickorfs.iteritems():
                   Taxons[key] = taxon
            except:
               eprintf("ERROR: while training for min support tree %s\n", dbname)
               import traceback
               traceback.print_exc()

    blastParsers={}
    for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
        blastParsers[dbname] =  BlastOutputTsvParser(dbname, blastoutput + '.tmp')

    # this loop determines the actual/final taxonomy of each of the ORFs 
    # taking into consideration the min support
    filePermTypes= {}
    start = 0
    outputfile = open( opts.output_dir +'/ORF_annotation_table.txt', 'w')


    short_to_long_dbnames = {}
    for dbname in opts.database_name:
      results = re.search(r'^seed', dbname,  re.IGNORECASE)
      if results:
          short_to_long_dbnames['seed'] = dbname

      results = re.search(r'^cog', dbname,  re.IGNORECASE)
      if results:
          short_to_long_dbnames['cog'] = dbname

      results = re.search(r'^kegg', dbname, re.IGNORECASE)
      if results:
          short_to_long_dbnames['kegg'] = dbname

      results = re.search(r'^cazy', dbname, re.IGNORECASE)
      if results:
          short_to_long_dbnames['cazy'] = dbname

    standard_dbs = ['cog', 'seed', 'kegg', 'cazy']
    standard_db_maps = [opts.input_cog_maps, opts.input_seed_maps, opts.input_kegg_maps, opts.input_cazy_maps]
    field_to_description = {}
    hierarchical_map = {}

    for db in standard_dbs:
      if db in short_to_long_dbnames:
        field_to_description[db] = {}
        hierarchical_map[db] = {}

    for dbname in standard_dbs:
       if dbname in short_to_long_dbnames:
          try:
            read_map_file(db_to_map_Maps[dbname], field_to_description[dbname], hierarchical_map[dbname])
          except:
            raise

    while start < Length:
       pickorfs= {}
       last =  min(Length, start + _stride)
       for  i in range(start, last):
          pickorfs[listOfOrfs[i]]= True
       start = last
       gc.collect()
       eprintf("\nMemory used  = %s MB\n", str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000000))
       results_dictionary={}
       for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
            try:
               results_dictionary[dbname]={}
               eprintf("Processing database %s...", dbname)
               process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs)
               eprintf("done\n")
            except:
               import traceback
               traceback.print_exc()
               eprintf("ERROR: %s\n", dbname)
               pass
           # print dbname + ' ' + str(len(results_dictionary[dbname]))

       eprintf("Num orfs processed  : %s\n", str(start))

       # create the annotations now
       orfToContig = {}

       create_annotation(results_dictionary, opts.database_name,  opts.input_annotated_gff, opts.output_dir, Taxons, pickorfs, orfToContig, lca)

       for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
         if std_dbname in short_to_long_dbnames:
              create_table(results_dictionary[short_to_long_dbnames[std_dbname]], std_dbname,  opts.output_dir, hierarchical_map, field_to_description)

#             create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir, filePermType)

       print_orf_table(results_dictionary, orfToContig, opts.output_dir, outputfile)

    for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
       if std_dbname in short_to_long_dbnames:
          print_kegg_cog_tables(std_dbname, opts.output_dir, hierarchical_map, field_to_description,  filePermType = 'w')

    outputfile.close()
    # now remove the temporary files
    for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
        try:
           remove( blastoutput + '.tmp')
        except:
           pass
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser

    options, args = parser.parse_args(argv)

    if not len(options.blast_files):
        parser.error('At least one taxonomic BLAST output is required')

    if runBlastCommandrRNA(runcommand=runcommand) != 0:
        if errorlogger:
            errorlogger.write(
                "ERROR: Failed to BLAST the sequences against database %s : " %
                (options.tax_databases[0]))
            errorlogger.write("     : " + runcommand)
        exit_process("ERROR: Failed to BLAST the sequences against database %s : "  %(options.tax_databases[0]) +\
                     "     : " + runcommand)

    if not (len(options.tax_databases) == len(options.blast_files)):
        parser.error(
            'Number of taxonomic databases and BLAST outputs should be the same'
        )

    if not options.output:
        parser.error('Output file must be specified')
    # Incredible sanity check

    if not files_exist(options.blast_files):
        sys.exit(0)

    if not files_exist(options.tax_databases):
        sys.exit(0)

    params = {
        'length': int(options.length),
        'similarity': float(options.similarity),
        'evalue': float(options.evalue),
        'bitscore': float(options.bitscore)
    }
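    # hit-level cutoffs applied when appending taxonomic information; they are also echoed
    # in the header of the output file below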
    #print params['bitscore']
    table = {}
    for x in range(0, len(options.blast_files)):
        table[options.tax_databases[x]] = {}
        process_blastout_file(options.blast_files[x],
                              options.tax_databases[x],
                              table[options.tax_databases[x]],
                              errorlogger=errorlogger)

    priority = 7000
    reads = {}
    for x in range(0, len(options.blast_files)):
        append_taxonomic_information(options.tax_databases[x],
                                     table[options.tax_databases[x]], params)
        for key in table[options.tax_databases[x]]:
            if len(table[options.tax_databases[x]][key][6]) > 1:
                reads[key] = True

        dbname = re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x])
        runstatslogger.write("%s\tTaxonomic hits in %s\t%s\n" %
                             (str(priority), dbname, str(len(reads))))
        priority += 1
    outputfile = open(options.output, 'w')
    fprintf(outputfile,
            "#Similarity cutoff :\t" + str(params['similarity']) + '\n')
    fprintf(outputfile, "#Length cutoff :\t" + str(params['length']) + '\n')
    fprintf(outputfile, "#Evalue cutoff :\t" + str(params['evalue']) + '\n')
    fprintf(outputfile,
            "#Bit score cutoff :\t" + str(params['bitscore']) + '\n')
    fprintf(outputfile,
            "#Number of rRNA sequences detected:\t" + str(len(reads)) + '\n\n')

    for x in range(0, len(options.tax_databases)):
        #  printf('\t%s\t\t\t', re.sub(r'^.*/','', options.tax_databases[x]))
        fprintf(outputfile, '\t%s\t\t\t',
                re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x]))
    #printf('\n')
    fprintf(outputfile, '\n')

    #printf('%s', 'read')
    for x in range(0, len(options.blast_files)):
        fprintf(outputfile, '%s\t%s\t%s\t%s\t%s\t%s\t%s', 'sequence', 'start',
                'end', 'similarity', 'evalue', 'bitscore', 'taxonomy')
    fprintf(outputfile, '\n')

    for read in reads:
        #printf('%s', read)
        fprintf(outputfile, '%s', read)
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                fprintf(outputfile, '\t%s\t%s\t%s\t%s\t%s\t%s',
                        str(table[options.tax_databases[x]][read][4]),
                        str(table[options.tax_databases[x]][read][5]),
                        str(table[options.tax_databases[x]][read][0]),
                        str(table[options.tax_databases[x]][read][1]),
                        str(table[options.tax_databases[x]][read][2]),
                        str(table[options.tax_databases[x]][read][6]))
            else:
                fprintf(outputfile, '\t-\t-\t-\t-\t-\t-')
        fprintf(outputfile, '\n')
    outputfile.close()

    # collect the exact reads
    database_hits = {}
    for read in reads:
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                database_hits[read] = [
                    table[options.tax_databases[x]][read][4],
                    table[options.tax_databases[x]][read][5]
                ]
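    # database_hits[read] = [start, end] of the matched region on the read, used below to
    # trim the corresponding FASTA sequence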

    # pick the hits, trim them according to the match and write them
    if options.fasta:
        selected_sequences = {}
        read_select_fasta_sequences(database_hits, selected_sequences,
                                    options.fasta)
        for read in database_hits:
            selected_sequences[read] = selected_sequences[read][
                database_hits[read][0]:database_hits[read][1]]
        write_selected_sequences(selected_sequences, options.output + '.fasta')
def process_blastoutput(dbname, blastoutput,  mapfile, refscore_file, opts, errorlogger = None):

    blastparser =  BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger = errorlogger)
    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')

    
    fields = ['target','q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec' ]
    if opts.taxonomy:
       fields.append('taxonomy')
    fields.append('product')
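    # columns written for every parsed hit; 'bsr' is presumably the bit-score ratio
    # computed against the reference scores in refscore_file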

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    # temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp =  output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w') 
    except:
        if errorlogger:
           errorlogger.write("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db\n" %(soutput_blastoutput_parsed_tmp, dbname))
        exit_process("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db\n" %(soutput_blastoutput_parsed_tmp, dbname))

    # write the headers out
    fprintf(outputfile, "#%s",'query')
    for field in fields:
         fprintf(outputfile,"\t%s",field)
    fprintf(outputfile, "\n")

    count = 0;
    for data in blastparser:
        if not data:
          continue
        try:
          fprintf(outputfile, "%s",data['query'])
        except:
           print 'data is : ', data, '\n'
           sys.exit()
        for field in fields:
           fprintf(outputfile, "\t%s",data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)


    return count