示例#1
0
def findIntergenics(query, genes, name, minLength, blast, database, eValue, pipeline):
  """
  Searches a genome for genes lying in the regions left open by the genes already found.

  query:     File name of the fasta file.
  genes:     A dictionary that maps query names to Iteration objects.
  name:      Name of the genome, used to name the cached blast result.
  minLength: Minimum length of any intergenic gene.
  blast:     Location of the installation of blast.
  database:  The database to use with blast.
  eValue:    The e value to use with blast.
  pipeline:  Handed unchanged to utils.cachedBlast.

  return:    A dictionary that maps query names to Iteration objects, containing only intergenic genes.

  The open regions of both strands are calculated first, and the potential genes found in them are
  written out (to "intergenics.fas") and blasted.  The blast result is then passed through
  removeCommonStops, and every remaining gene is flagged as intergenic before being returned.
  """
  genome = utils.loadGenome(query)
  reverseGenome = utils.reverseComplement(genome)
  genomeLength = len(genome)
  forwardRegions, reverseRegions = calculateIntergenicRegions(genomeLength, genes.values(), minLength)

  candidates = findPotentialGenes(genome, forwardRegions, minLength)
  reverseCandidates = findPotentialGenes(reverseGenome, reverseRegions, minLength)
  # Candidates found on the reverse complement are expressed in its coordinates;
  # subtracting from the genome length maps them back onto the original sequence.
  candidates += [(genomeLength - gene[0], genomeLength - gene[1]) for gene in reverseCandidates]

  writePotentialGenes(genome, candidates)

  blastCache = "intergenicBlasts/" + name + ".blastp.xml"
  intergenicGenes = utils.cachedBlast(blastCache, blast, database, eValue, "intergenics.fas", pipeline)
  os.remove("intergenics.fas")

  intergenicGenes = removeCommonStops(intergenicGenes)
  for gene in intergenicGenes.values():
    gene.intergenic = True
    gene.note = "Intergenic"
    gene.color = "160 32 240"
  return intergenicGenes
示例#2
0
def findGenes(query, name, blastLocation, database, eValue, genemark, matrix, pipeline):
  """
  Predicts genes in the query with genemark and blasts the predictions.

  query:         File name of the query.
  name:          Name of the genome in the query.
  blastLocation: Location of blast installation.
  database:      Name of the database to search.
  eValue:        E value to use when searching.
  genemark:      Location of the genemark installation.
  matrix:        Name of the matrix to use, or None.
  pipeline:      Handed unchanged to utils.cachedBlast.

  return:        The result of utils.cachedBlast for the predicted genes.

  If no matrix is given, the heuristic matrix "heuristic_mat/heu_11_<gc>.mat" inside the genemark
  installation is used, where <gc> is the integer GC content of the genome clamped to 30..70.
  The files query + ".orf" and query + ".lst" are deleted once the blast has finished.
  """
  genome = utils.loadGenome(query)
  if not matrix:
    gc = int(utils.getGCContent(genome))
    matrix = genemark + "/" + "heuristic_mat/heu_11_" + str(min(max(30, gc), 70)) + ".mat"
  genemarkProcess = subprocess.Popen([genemark + "/gm", "-opq", "-m", matrix, query], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  # communicate() drains stdout and stderr together and waits for the process to exit.
  # Reading the two pipes one after the other with read() can deadlock: read() on stdout
  # blocks until EOF while the child may be stuck writing to a full stderr pipe.
  # The output itself is deliberately discarded.
  genemarkProcess.communicate()
  removeInvalidGenes(query + ".orf", len(genome))
  modifyFastaHeader(query + ".orf", name)
  
  result = utils.cachedBlast("initialBlasts/" + name + ".blastp.xml", blastLocation, database, eValue, query + ".orf", pipeline)
  os.remove(query + ".orf")
  os.remove(query + ".lst")
  return result
示例#3
0
def findGenes(query, name, blastLocation, database, eValue, genemark, matrix,
              pipeline):
    """
    Runs genemark on the query and blasts the genes it predicts.

    query:         File name of the query.
    name:          Name of the genome in the query.
    blastLocation: Location of blast installation.
    database:      Name of the database to search.
    eValue:        E value to use when searching.
    genemark:      Location of the genemark installation.
    matrix:        Name of the matrix to use, or None.
    pipeline:      Handed unchanged to utils.cachedBlast.

    return:        The result of utils.cachedBlast for the predicted genes.

    When matrix is empty or None, the heuristic matrix "heuristic_mat/heu_11_<gc>.mat" of the
    genemark installation is selected, <gc> being the integer GC content clamped to 30..70.
    genemark's output is not captured, so it goes to this process' own stdout/stderr.
    """
    genome = utils.loadGenome(query)
    if not matrix:
        gcContent = int(utils.getGCContent(genome))
        clampedGC = min(max(30, gcContent), 70)
        matrix = genemark + "/" + "heuristic_mat/heu_11_" + str(clampedGC) + ".mat"

    command = [genemark + "/gm", "-opq", "-m", matrix, query]
    genemarkProcess = subprocess.Popen(command)
    genemarkProcess.wait()
    # Bare Python 2 print statement: ends the line after genemark's own output.
    print

    orfFile = query + ".orf"
    removeInvalidGenes(orfFile, len(genome))
    modifyFastaHeader(orfFile, name)

    blastCache = "initialBlasts/" + name + ".blastp.xml"
    result = utils.cachedBlast(blastCache, blastLocation, database, eValue,
                               orfFile, pipeline)
    # Presumably both files are left behind by genemark -- they are no longer needed.
    for leftover in (orfFile, query + ".lst"):
        os.remove(leftover)
    return result
示例#4
0
def findPromoters(query, name):
    """
    Predicts the promoters of the genome stored in the query file.

    query:     Name of the query file.
    name:      Name of the genome; the prediction is cached under
               "promoterPredictions/<name>.html".

    return:    The result of cachedPrediction -- per the original documentation a list of Promoter
               objects for the forward and reverse strands, predicted through the Berkeley
               Drosophila Genome Project website.
    """
    sequence = utils.loadGenome(query)
    cacheFile = "promoterPredictions/" + name + ".html"
    return cachedPrediction(sequence, cacheFile)
示例#5
0
def findPromoters(query, name):
  """
  Loads the genome in the query file and returns its (cached) promoter prediction.

  query:     Name of the query file.
  name:      Name of the genome, which names the cache file "promoterPredictions/<name>.html".

  return:    Whatever cachedPrediction produces; the original documentation describes it as a list
             of Promoter objects for the forward and reverse strands, obtained from the Berkeley
             Drosophila Genome Project website.
  """
  loadedGenome = utils.loadGenome(query)
  predictionPath = "promoterPredictions/" + name + ".html"
  return cachedPrediction(loadedGenome, predictionPath)
示例#6
0
def findPromoters(query, name, scoreCutoff, frame):
  """
  Predicts promoters on both strands of a genome with BPROM and filters them by score.

  query:       Name of the query file.
  name:        Name of the genome; the predictions are cached under
               "promoterPredictions/<name>.forward.bprom" and "promoterPredictions/<name>.reverse.bprom".
  scoreCutoff: Only promoters whose score is strictly greater than this value are returned.
  frame:       Handed to cachedBPROM.  Per the original documentation, a JFrame that may be used as the
               parent of a JDialog to display messages; if it is None messages are just printed.

  return:      A list of the promoters of the forward and reverse strands that pass the cutoff.

  The reverse strand is predicted on the reverse complement of the genome, and each of its results is
  passed through reverseCoordinates together with the genome length before filtering.
  """
  genome = utils.loadGenome(query)
  forwardPromoters = cachedBPROM(genome, "promoterPredictions/" + name + ".forward.bprom", frame)
  reversePromoters = cachedBPROM(utils.reverseComplement(genome), "promoterPredictions/" + name + ".reverse.bprom", frame)

  genomeLength = len(genome)
  reversePromoters = [reverseCoordinates(genomeLength, promoter) for promoter in reversePromoters]

  allPromoters = forwardPromoters + reversePromoters
  return [promoter for promoter in allPromoters if promoter.score > scoreCutoff]
示例#7
0
def findIntergenics(query, genes, name, minLength, blast, database, eValue,
                    pipeline):
    """
    Finds intergenic genes, i.e. genes located between the genes that are already known.

    query:     File name of the fasta file.
    genes:     A dictionary that maps query names to Iteration objects.
    name:      Name of the genome; the blast result is cached as
               "intergenicBlasts/<name>.blastp.xml".
    minLength: Minimum length of any intergenic gene.
    blast:     Location of the installation of blast.
    database:  The database to use with blast.
    eValue:    The e value to use with blast.
    pipeline:  Handed unchanged to utils.cachedBlast.

    return:    A dictionary that maps query names to Iteration objects, containing only
               intergenic genes.

    Steps: calculate the open regions on both strands, collect the potential genes inside them,
    write those out ("intergenics.fas") and blast them, prune the result with removeCommonStops,
    and finally mark every surviving gene as intergenic.
    """
    genome = utils.loadGenome(query)
    complement = utils.reverseComplement(genome)
    length = len(genome)
    regions = calculateIntergenicRegions(length, genes.values(), minLength)
    forwardOpen, reverseOpen = regions

    potentialGenes = findPotentialGenes(genome, forwardOpen, minLength)
    onReverseStrand = findPotentialGenes(complement, reverseOpen, minLength)
    # Translate reverse-complement coordinates into coordinates of the original genome.
    potentialGenes += [(length - location[0], length - location[1])
                       for location in onReverseStrand]

    writePotentialGenes(genome, potentialGenes)

    cachePath = "intergenicBlasts/" + name + ".blastp.xml"
    blasted = utils.cachedBlast(cachePath, blast, database, eValue,
                                "intergenics.fas", pipeline)
    os.remove("intergenics.fas")

    pruned = removeCommonStops(blasted)
    for intergenicGene in pruned.values():
        intergenicGene.intergenic = True
        intergenicGene.note = "Intergenic"
        intergenicGene.color = "160 32 240"
    return pruned
示例#8
0
def extendGenes(query, genes, name, blast, database, eValue, pipeline):
  """
  Looks for extensions of the given genes and applies the ones that are better than the originals.

  query:    File name of the query.
  genes:    A dictionary that maps query names to Iteration objects.
  name:     Name of the genome; the blast result is cached as "extendedBlasts/<name>.blastp.xml".
  blast:    Location of the installation of blast.
  database: The database to use with blast.
  eValue:   The E Value to use with blast.
  pipeline: Handed unchanged to utils.cachedBlast.

  return:   The result of applyExtensions.  Per the original documentation, a new dictionary mapping
            query names to Iteration objects in which an extension replaces the original gene if it
            either brings the start of the gene sufficiently close to the end of a previous gene or
            has a lower eValue.

  The candidate extensions are blasted from "extensions.fas" (expected to be produced by
  writeExtensions), which is deleted once the blast is done.
  """
  genome = utils.loadGenome(query)
  candidateExtensions = getExtensions(genome, genes.values())
  writeExtensions(genome, candidateExtensions)

  blastCache = "extendedBlasts/" + name + ".blastp.xml"
  blastedExtensions = utils.cachedBlast(blastCache, blast, database, eValue, "extensions.fas", pipeline)
  os.remove("extensions.fas")

  return applyExtensions(genome, genes, blastedExtensions)
示例#9
0
def extendGenes(query, genes, name, blast, database, eValue, pipeline):
    """
    Searches for possible extensions of the genes in the query and applies the preferable ones.

    query:    File name of the query.
    genes:    A dictionary that maps query names to Iteration objects.
    name:     Name of the genome, which names the cached blast result
              "extendedBlasts/<name>.blastp.xml".
    blast:    Location of the installation of blast.
    database: The database to use with blast.
    eValue:   The E Value to use with blast.
    pipeline: Handed unchanged to utils.cachedBlast.

    return:   Whatever applyExtensions returns -- according to the original documentation a new
              dictionary mapping query names to Iteration objects, where an extension replaces
              the original gene if it brings the gene's start sufficiently close to the end of
              a previous gene or if it has a lower eValue.

    "extensions.fas" (expected to be written by writeExtensions) is blasted and then removed.
    """
    sequence = utils.loadGenome(query)
    possibleExtensions = getExtensions(sequence, genes.values())
    writeExtensions(sequence, possibleExtensions)

    cachePath = "extendedBlasts/" + name + ".blastp.xml"
    extended = utils.cachedBlast(cachePath, blast, database, eValue,
                                 "extensions.fas", pipeline)
    os.remove("extensions.fas")

    return applyExtensions(sequence, genes, extended)
示例#10
0
  def run(self, blastLocation, genemarkLocation, transtermLocation, tRNAscanLocation, database, eValue, matrix, minLength, scaffoldingDistance, promoterScoreCutoff, queries, swing = False, email = ""):
    """
    The main pipeline function: annotates every query in turn.

    blastLocation:       Directory blast was installed in.
    genemarkLocation:    Directory genemark was installed in.
    transtermLocation:   Directory transterm was installed in.
    tRNAscanLocation:    Directory tRNAscan was installed in.
    database:            Name of the blast database to use.
    eValue:              The e value used whenever a blast search is done.
    matrix:              The matrix to use when running genemark.  If None then genemark is run heuristically.
    minLength:           Minimum length of any genes included in the resulting annotation.
    scaffoldingDistance: The maximum length allowed between genes when contiguous regions of genes are being identified.
    promoterScoreCutoff: Minimum score allowed for any promoters included in the resulting annotation.
    queries:             A list of fasta files to process.
    swing:               If true a swing window will be used to update the user about the pipeline's progress.
    email:               If this is a non-empty string an email will be sent to the address in the string when the pipeline is done.
                         This is attempted with the /usr/sbin/sendmail command on the local computer.

    For every query the genome is rewritten to a temporary fasta file ("query<id of self>.fas" in the working directory) whose
    header is the genome name.  genemark predicts genes on it, the genes are extended, intergenic genes are searched for and
    merged with the extended genes, and the merged genes are refined into scaffolds.  Promoters and terminators are predicted and
    filtered against the scaffolded genes, and tRNAscan is used to find transfer RNAs.  The results are written to an artemis file
    next to the query (same name, .art extension) and report.report is called to describe the final genes.

    The temporary fasta file is removed even when a stage fails.  A PipelineException raised anywhere ends the run silently.
    """
    self.initializeDisplay(queries, swing)

    try:
      for query in queries:
        outputPrefix = os.path.splitext(query)[0]
        name = os.path.split(outputPrefix)[1]
        
        genome = utils.loadGenome(query)
        swapFileName = "query" + str(id(self)) + ".fas"
        try:
          # Rewrite the genome with a normalized header, 50 characters per line.
          queryFile = open(swapFileName, "w")
          try:
            queryFile.write(">" + name + "\n")
            for i in range(0, len(genome), 50):
              queryFile.write(genome[i:i+50] + "\n")
          finally:
            queryFile.close()

          self.updateProgress(query)
          initialGenes = genemark.findGenes(swapFileName, name, blastLocation, database, eValue, genemarkLocation, matrix, self)

          self.updateProgress(query)
          extendedGenes = extend.extendGenes(swapFileName, initialGenes, name, blastLocation, database, eValue, self)

          self.updateProgress(query)
          intergenicGenes = intergenic.findIntergenics(swapFileName, extendedGenes, name, minLength, blastLocation, database, eValue, self)

          # Intergenic genes take precedence over extended genes sharing the same key.
          genes = dict(extendedGenes)
          genes.update(intergenicGenes)

          self.updateProgress(query)
          scaffolded = scaffolds.refineScaffolds(genes, scaffoldingDistance)

          self.updateProgress(query)
          initialPromoters = promoters.findPromoters(swapFileName, name, promoterScoreCutoff, self.frame)

          self.updateProgress(query)
          initialTerminators = terminators.findTerminators(swapFileName, name, genes.values(), transtermLocation)

          self.updateProgress(query)
          filteredSignals = signals.filterSignals(scaffolded.values(), initialPromoters + initialTerminators)
          filteredPromoters = filter(lambda x: isinstance(x, promoters.Promoter), filteredSignals)
          filteredTerminators = filter(lambda x: isinstance(x, terminators.Terminator), filteredSignals)

          self.updateProgress(query)
          transferRNAs = rna.findtRNAs(tRNAscanLocation, swapFileName)
        finally:
          # Always clean up the temporary query, also when a stage raised.
          if os.path.isfile(swapFileName):
            os.remove(swapFileName)

        self.updateProgress(query)
        artemis.writeArtemisFile(outputPrefix + ".art", genome, scaffolded.values(), filteredPromoters, filteredTerminators, transferRNAs)

        self.updateProgress(query)
        report.report(name, scaffolded, outputPrefix)

      if email:
        # The message file is only created once and reused by later runs.
        if not os.path.isfile("EMAIL_MESSAGE"):
          message = open("EMAIL_MESSAGE", "w")
          try:
            message.write("Subject: Annotation Complete\nYour genome has been annotated.\n")
          finally:
            message.close()
        
        # Any output from sendmail is treated as a failure and the mail is sent again.
        # NOTE(review): this retries without bound if sendmail keeps printing something.
        sent = False
        while not sent:
          message = open("EMAIL_MESSAGE", "r")
          try:
            sendmailProcess = subprocess.Popen(["/usr/sbin/sendmail", "-F", "Neofelis", "-f", "*****@*****.**", email],
                                               stdin = message,
                                               stdout = subprocess.PIPE)
            # communicate() reads stdout to EOF and waits for sendmail to exit.
            result = sendmailProcess.communicate()[0]
          finally:
            message.close()
          sent = not result.strip()
    
      self.finished()
    except PipelineException:
      return
示例#11
0
    def run(self,
            blastLocation,
            genemarkLocation,
            transtermLocation,
            database,
            eValue,
            matrix,
            minLength,
            scaffoldingDistance,
            ldfCutoff,
            queries,
            swing=False,
            email=""):
        """
        The main pipeline function: annotates every query in turn.

        blastLocation:       Directory blast was installed in.
        genemarkLocation:    Directory genemark was installed in.
        transtermLocation:   Directory transterm was installed in.
        database:            Name of the blast database to use.
        eValue:              The e value used whenever a blast search is done.
        matrix:              The matrix to use when running genemark.  If None then genemark is run heuristically.
        minLength:           Minimum length of any genes included in the resulting annotation.
        scaffoldingDistance: The maximum length allowed between genes when contiguous regions of genes are being identified.
        ldfCutoff:           Documented as the minimum LDF allowed for promoters.
                             NOTE(review): this method never uses the value -- confirm whether it should be
                             forwarded to promoters.findPromoters.
        queries:             A list of fasta files to process.
        swing:               If true a swing window will be used to update the user about the pipeline's progress.
        email:               If this is a non-empty string an email will be sent to the address in the string when the
                             pipeline is done.  The mail is submitted to "tmpl.arizona.edu" on port 587 using STARTTLS.

        For every query the genome is rewritten to a temporary fasta file ("query<id of self>.fas" in the working
        directory) whose header is the genome name.  genemark predicts genes on it, the genes are extended, intergenic
        genes are searched for and merged with the extended genes, and the merged genes are refined into scaffolds.
        Promoters and terminators are predicted and filtered against the scaffolded genes.  The results are written to an
        artemis file next to the query (same name, .art extension) and report.report is called to describe the final genes.

        The temporary fasta file is removed once the stages reading it are done, also when one of them fails.
        A PipelineException raised anywhere ends the run silently.
        """
        self.initializeDisplay(queries, swing)

        try:
            for query in queries:
                outputPrefix = os.path.splitext(query)[0]
                name = os.path.split(outputPrefix)[1]

                genome = utils.loadGenome(query)
                swapFileName = "query" + str(id(self)) + ".fas"
                try:
                    # Rewrite the genome with a normalized header, 50 characters per line.
                    queryFile = open(swapFileName, "w")
                    try:
                        queryFile.write(">" + name + "\n")
                        for i in range(0, len(genome), 50):
                            queryFile.write(genome[i:i + 50] + "\n")
                    finally:
                        queryFile.close()

                    self.updateProgress(query)
                    initialGenes = genemark.findGenes(swapFileName, name,
                                                      blastLocation, database,
                                                      eValue, genemarkLocation,
                                                      matrix, self)

                    self.updateProgress(query)
                    extendedGenes = extend.extendGenes(swapFileName,
                                                       initialGenes, name,
                                                       blastLocation, database,
                                                       eValue, self)

                    self.updateProgress(query)
                    intergenicGenes = intergenic.findIntergenics(
                        swapFileName, extendedGenes, name, minLength,
                        blastLocation, database, eValue, self)

                    # Intergenic genes take precedence over extended genes sharing the same key.
                    genes = dict(extendedGenes)
                    genes.update(intergenicGenes)

                    self.updateProgress(query)
                    scaffolded = scaffolds.refineScaffolds(genes,
                                                           scaffoldingDistance)

                    self.updateProgress(query)
                    initialPromoters = promoters.findPromoters(swapFileName,
                                                               name)

                    self.updateProgress(query)
                    initialTerminators = terminators.findTerminators(
                        swapFileName, name, genes.values(), transtermLocation)
                finally:
                    # The temporary query is not needed past this point; the original code
                    # never deleted it, leaving a stray file behind after every run.
                    if os.path.isfile(swapFileName):
                        os.remove(swapFileName)

                self.updateProgress(query)
                filteredSignals = signals.filterSignals(
                    scaffolded.values(), initialPromoters + initialTerminators)
                filteredPromoters = filter(
                    lambda x: isinstance(x, promoters.Promoter),
                    filteredSignals)
                filteredTerminators = filter(
                    lambda x: isinstance(x, terminators.Terminator),
                    filteredSignals)

                self.updateProgress(query)
                artemis.writeArtemisFile(outputPrefix + ".art", genome,
                                         scaffolded.values(),
                                         filteredPromoters,
                                         filteredTerminators)

                self.updateProgress(query)
                report.report(name, scaffolded, outputPrefix)

            if email:
                message = MIMEText("Your genome has been annotated.")
                message["Subject"] = "Annotation complete"
                message["From"] = "Neofelis"
                message["To"] = email

                smtp = smtplib.SMTP("tmpl.arizona.edu", 587)
                try:
                    smtp.ehlo()
                    smtp.starttls()
                    # Identify again over the encrypted connection.  The original line
                    # lacked the call parentheses and therefore did nothing.
                    smtp.ehlo()
                    smtp.sendmail("Neofelis", [email], message.as_string())
                finally:
                    # Release the connection even when the handshake or the send fails.
                    smtp.close()

            self.finished()
        except PipelineException:
            return