Пример #1
0
def findGenes(query, name, blastLocation, database, eValue, genemark, matrix, pipeline):
  """
  query:         File name of the query.
  name:          Name of the genome in the query.
  blastLocation: Location of blast installation.
  database:      Name of the database to search.
  eValue:        E value to use when searching.
  genemark:      Location of the genemark installation.
  matrix:        Name of the matrix to use, or None
  pipeline:      Passed through unchanged to utils.cachedBlast.

  return:        The value returned by utils.cachedBlast for query + ".orf".

  Runs genemark's "gm" executable on query and then blasts query + ".orf" with the
  given eValue to find annotations for the predicted genes.  If no matrix is given,
  the GC content of the genome (from utils.getGCContent, truncated to an int and
  clamped to the range 30..70) selects a file under genemark's "heuristic_mat" directory.
  The query + ".orf" and query + ".lst" files are deleted before returning.
  """
  genome = utils.loadGenome(query)
  if not matrix:
    gc = int(utils.getGCContent(genome))
    # Clamp to 30..70 before building the heuristic matrix file name.
    matrix = genemark + "/" + "heuristic_mat/heu_11_" + str(min(max(30, gc), 70)) + ".mat"
  genemarkProcess = subprocess.Popen([genemark + "/gm", "-opq", "-m", matrix, query], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
  # communicate() drains stdout and stderr together and waits for the process to exit.
  # Reading the two pipes one after the other can deadlock when the child fills the
  # pipe that is not currently being read.  The captured output is intentionally discarded.
  genemarkProcess.communicate()
  # NOTE(review): query + ".orf" / ".lst" are presumably written by genemark -- confirm.
  removeInvalidGenes(query + ".orf", len(genome))
  modifyFastaHeader(query + ".orf", name)
  
  result = utils.cachedBlast("initialBlasts/" + name + ".blastp.xml", blastLocation, database, eValue, query + ".orf", pipeline)
  os.remove(query + ".orf")
  os.remove(query + ".lst")
  return result
Пример #2
0
def findIntergenics(query, genes, name, minLength, blast, database, eValue, pipeline):
  """
  Search the genome in query for intergenic genes and blast them.

  query:     File name of the fasta file.
  genes:     A dictionary that maps query names to Iteration objects
  name:      Name of the genome; used in the blast cache path "intergenicBlasts/<name>.blastp.xml".
  minLength: Minimum length of any intergenic genes.
  blast:     Location of the installation of blast.
  database:  The database to use with blast.
  eValue:    The e value to use with blast.
  pipeline:  Passed through unchanged to utils.cachedBlast.

  return:    The blast result after removeCommonStops, with every value flagged as intergenic.

  The intergenic regions of the genome are calculated for both strands and potential genes
  are searched for in the genome and in its reverse complement.  The candidates are handed to
  writePotentialGenes, "intergenics.fas" is blasted and then deleted, and the blast result is
  pruned with removeCommonStops.  Each remaining entry gets intergenic = True, a note of
  "Intergenic" and a fixed color string.
  """
  genome = utils.loadGenome(query)
  reverseComplementGenome = utils.reverseComplement(genome)
  forwardRegions, reverseRegions = calculateIntergenicRegions(len(genome), genes.values(), minLength)

  candidates = findPotentialGenes(genome, forwardRegions, minLength)
  reverseCandidates = findPotentialGenes(reverseComplementGenome, reverseRegions, minLength)
  # Candidates found on the reverse complement are re-expressed relative to the genome
  # length before being appended to the forward candidates.
  genomeLength = len(genome)
  candidates += [(genomeLength - gene[0], genomeLength - gene[1]) for gene in reverseCandidates]

  writePotentialGenes(genome, candidates)

  cachePath = "intergenicBlasts/" + name + ".blastp.xml"
  blasted = utils.cachedBlast(cachePath, blast, database, eValue, "intergenics.fas", pipeline)
  os.remove("intergenics.fas")
  pruned = removeCommonStops(blasted)
  for hit in pruned.values():
    hit.intergenic = True
    hit.note = "Intergenic"
    hit.color = "160 32 240"
  return pruned
Пример #3
0
def findGenes(query, name, blastLocation, database, eValue, genemark, matrix,
              pipeline):
    """
    Predict genes in query with genemark and annotate them with blast.

    query:         File name of the query.
    name:          Name of the genome in the query.
    blastLocation: Location of blast installation.
    database:      Name of the database to search.
    eValue:        E value to use when searching.
    genemark:      Location of the genemark installation.
    matrix:        Name of the matrix to use, or None
    pipeline:      Passed through unchanged to utils.cachedBlast.

    return:        The value returned by utils.cachedBlast for query + ".orf".

    If no matrix is given, the GC content of the genome (from
    utils.getGCContent, truncated to an int and clamped to 30..70) selects a
    file under genemark's "heuristic_mat" directory.  The query + ".orf" and
    query + ".lst" files are deleted before returning.
    """
    genome = utils.loadGenome(query)
    if not matrix:
        gcValue = int(utils.getGCContent(genome))
        clampedGc = min(max(30, gcValue), 70)
        matrix = genemark + "/" + "heuristic_mat/heu_11_" + str(clampedGc) + ".mat"

    command = [genemark + "/gm", "-opq", "-m", matrix, query]
    genemarkProcess = subprocess.Popen(command)
    genemarkProcess.wait()
    # Bare Python 2 print statement (emits an empty line); kept exactly as written.
    print

    orfFile = query + ".orf"
    removeInvalidGenes(orfFile, len(genome))
    modifyFastaHeader(orfFile, name)

    cachePath = "initialBlasts/" + name + ".blastp.xml"
    result = utils.cachedBlast(cachePath, blastLocation, database, eValue,
                               orfFile, pipeline)
    os.remove(orfFile)
    os.remove(query + ".lst")
    return result
Пример #4
0
def findIntergenics(query, genes, name, minLength, blast, database, eValue,
                    pipeline):
    """
    Search the genome in query for intergenic genes and blast them.

    query:     File name of the fasta file.
    genes:     A dictionary that maps query names to Iteration objects
    name:      Name of the genome; used in the blast cache path
               "intergenicBlasts/<name>.blastp.xml".
    minLength: Minimum length of any intergenic genes.
    blast:     Location of the installation of blast.
    database:  The database to use with blast.
    eValue:    The e value to use with blast.
    pipeline:  Passed through unchanged to utils.cachedBlast.

    return:    The blast result after removeCommonStops, with every value
               flagged as intergenic.

    The intergenic regions of the genome are calculated for both strands and
    potential genes are searched for in the genome and in its reverse
    complement.  The candidates are handed to writePotentialGenes,
    "intergenics.fas" is blasted and then deleted, and the blast result is
    pruned with removeCommonStops.  Each remaining entry gets
    intergenic = True, a note of "Intergenic" and a fixed color string.
    """
    genome = utils.loadGenome(query)
    reverseStrand = utils.reverseComplement(genome)
    forwardOpen, reverseOpen = calculateIntergenicRegions(
        len(genome), genes.values(), minLength)

    found = findPotentialGenes(genome, forwardOpen, minLength)
    foundOnReverse = findPotentialGenes(reverseStrand, reverseOpen, minLength)
    # Entries located on the reverse complement are re-expressed relative to
    # the genome length before joining the forward entries.
    total = len(genome)
    found += [(total - pair[0], total - pair[1]) for pair in foundOnReverse]

    writePotentialGenes(genome, found)

    blastCache = "intergenicBlasts/" + name + ".blastp.xml"
    result = removeCommonStops(
        utils.cachedBlast(blastCache, blast, database, eValue,
                          "intergenics.fas", pipeline)
        if _keepIntergenicsFile(os.remove) else None)
    for entry in result.values():
        entry.intergenic = True
        entry.note = "Intergenic"
        entry.color = "160 32 240"
    return result
Пример #5
0
def extendGenes(query, genes, name, blast, database, eValue, pipeline):
  """
  Blast possible extensions of the given genes and apply them.

  query:    File name of the query.
  genes:    A dictionary that maps query names to Iteration objects
  name:     Name of the genome; used in the blast cache path "extendedBlasts/<name>.blastp.xml".
  blast:    Location of the installation of blast.
  database: The database to use with blast.
  eValue:   The E Value to use with blast.
  pipeline: Passed through unchanged to utils.cachedBlast.

  return:   The value returned by applyExtensions for the genome, the original genes and the blasted extensions.

  The extensions computed by getExtensions are handed to writeExtensions, "extensions.fas" is
  blasted and then deleted.  Which extensions replace their original gene is decided by
  applyExtensions.
  """
  genome = utils.loadGenome(query)
  writeExtensions(genome, getExtensions(genome, genes.values()))

  cachePath = "extendedBlasts/" + name + ".blastp.xml"
  blastedExtensions = utils.cachedBlast(cachePath, blast, database, eValue, "extensions.fas", pipeline)
  os.remove("extensions.fas")
  return applyExtensions(genome, genes, blastedExtensions)
Пример #6
0
def extendGenes(query, genes, name, blast, database, eValue, pipeline):
    """
    Blast possible extensions of the given genes and apply them.

    query:    File name of the query.
    genes:    A dictionary that maps query names to Iteration objects
    name:     Name of the genome; used in the blast cache path
              "extendedBlasts/<name>.blastp.xml".
    blast:    Location of the installation of blast.
    database: The database to use with blast.
    eValue:   The E Value to use with blast.
    pipeline: Passed through unchanged to utils.cachedBlast.

    return:   The value returned by applyExtensions for the genome, the
              original genes and the blasted extensions.

    The extensions computed by getExtensions are handed to writeExtensions,
    "extensions.fas" is blasted and then deleted.  Which extensions replace
    their original gene is decided by applyExtensions.
    """
    genome = utils.loadGenome(query)
    candidateExtensions = getExtensions(genome, genes.values())
    writeExtensions(genome, candidateExtensions)

    blastCache = "extendedBlasts/" + name + ".blastp.xml"
    blasted = utils.cachedBlast(blastCache, blast, database, eValue,
                                "extensions.fas", pipeline)
    os.remove("extensions.fas")
    return applyExtensions(genome, genes, blasted)