예제 #1
0
def trimmingAlignment(label, binary, parameters, out_file, logFile, replace,
                      in_file=None, compare_msa=None, force_refer_msa=None,
                      cds=None):
    '''
    Run an external alignment-trimming program and log the call.

    The command line is assembled from 'binary', the optional trimAl-style
    flags (-backtrans, -compareset, -forceselect, -in), '-out out_file' and
    the free-form 'parameters' string. It is executed through the shell with
    both stdout and stderr redirected to 'logFile'.

    Returns False, without running anything, when lookForFile() reports that
    'out_file' exists and 'replace' is not set; returns True after a
    successful run. When the program cannot be launched or finishes with a
    non-zero status, an error is written to stderr and the process exits
    with exit_codes[label].
    '''

    ## Keep an existing output file unless the caller asked to replace it
    already_there = lookForFile(out_file)
    if already_there and not replace:
        return False

    ## Optional flags, in the order they must appear on the command line.
    ## Only those holding a value are emitted; the surrounding blanks are
    ## kept so the logged command line stays exactly as before
    optional_args = (
        ("-backtrans", cds),
        ("-compareset", compare_msa),
        ("-forceselect", force_refer_msa),
        ("-in", in_file),
    )
    options = "".join([(" %s %s ") % (flag, value)
                       for flag, value in optional_args if value])
    cmd = ("%s %s -out %s %s") % (binary, options, out_file, parameters)

    ## Log where, when and what is about to be executed
    host = getfqdn()
    started = datetime.datetime.now()
    stamp = started.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\tTrimming Input MSA\t%s") % (stamp), file=logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (host, cmd), file=logFile)
    logFile.flush()

    try:
        child = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as err:
        print("ERROR: Execution failed: " + str(err), file=sys.stderr)
        sys.exit(exit_codes[label])

    exit_status = child.wait()
    if exit_status != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()),
              file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Report how long the external program took
    elapsed = datetime.datetime.now() - started
    print(("###\tTime\t%s\n###") % (format_time(elapsed)), file=logFile)
    logFile.flush()

    return True
예제 #2
0
def trimmingAlignment(label, binary, parameters, out_file, logFile, replace, \
  in_file = None, compare_msa = None, force_refer_msa = None, cds = None):
  '''
  Trim a multiple sequence alignment by calling an external program.

  Builds "<binary> [-backtrans cds] [-compareset compare_msa]
  [-forceselect force_refer_msa] [-in in_file] -out <out_file> <parameters>",
  records the command and its running time in 'logFile' and runs it through
  the shell.

  Returns False when the output file already exists - as reported by
  lookForFile() - and 'replace' is not set; True once the program has
  finished successfully. Any launch failure or non-zero exit status ends the
  whole process with exit_codes[label].
  '''

  ## An existing output file is only regenerated on request
  output_exists = lookForFile(out_file)
  if output_exists and not replace:
    return False

  ## Collect the optional flags that have been given a value. Each piece
  ## keeps a leading and a trailing blank so the final command line is the
  ## same one the log has always shown
  pieces = []
  for flag, value in [("-backtrans", cds), ("-compareset", compare_msa),
    ("-forceselect", force_refer_msa), ("-in", in_file)]:
    if value:
      pieces.append((" %s %s ") % (flag, value))

  cmd = ("%s %s -out %s %s") % (binary, "".join(pieces), out_file, parameters)

  ## Write down the host, the starting time and the exact command line
  machine = getfqdn()
  begin = datetime.datetime.now()
  when = begin.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\tTrimming Input MSA\t%s") % (when), file = logFile)
  print(("###\t[%s]\tCommand-line\t%s\n###") % (machine, cmd), file = logFile)
  logFile.flush()

  try:
    process = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
  except OSError as failure:
    print("ERROR: Execution failed: " + str(failure), file = sys.stderr)
    sys.exit(exit_codes[label])

  status = process.wait()
  if status != 0:
    print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  ## Log the time spent by the external program
  end = datetime.datetime.now()
  spent = format_time(end - begin)
  print(("###\tTime\t%s\n###") % (spent), file = logFile)
  logFile.flush()

  return True
예제 #3
0
def alignment(parameters):
  '''
  Run the Multiple Sequence Alignment step.

  Working inside parameters["out_directory"], the function:
    1. prepares the input sequences (reversed copy when "both_direction" is
       set, rare amino-acids replaced when selenocysteine/pyrrolysine are
       detected);
    2. runs every aligner listed in parameters["alignment"] for each
       direction;
    3. optionally builds a consensus alignment (parameters["consensus"]);
    4. optionally back-translates (residue_datatype "prot2codon"/"prot2nuc")
       and/or trims (parameters["trimming"]) the resulting alignment.

  The dictionary is modified in place: "both_direction" is normalised to a
  boolean, "residue_datatype" defaults to "", "replace" is switched on as
  soon as any file is (re)generated and "in_file" is set to the final
  alignment. The same dictionary is returned.

  Configuration problems end the process through sys.exit() with a message;
  too few input sequences end it with exit status 80.
  '''

  backtrans_modes = ["prot2codon", "prot2nuc"]

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value
  if not "verbose" in parameters or parameters["verbose"] == 0:
    ## The handle must be opened in text mode: print() writes 'str' objects
    ## and a binary handle would raise a TypeError on the first message
    logFile = open(os.devnull, 'w')

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tMultiple Sequence Alignment\tSTART\t%s"
    + "\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which program/s will be used to align the input sequences. Check such
  ## program/s are listed among the available binaries
  if not "alignment" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the ALIGNMENT step")

  for program in parameters["alignment"]:
    if not program in parameters:
      sys.exit(("ERROR: Selected program '%s' is not available accordding to "
        "the configuration file") % (program))

  ## Check whether "readAl" is available or not. It is useful for sequences
  ## manipulation independently of the input format.
  if not "readal" in parameters:
    sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

  ## Evaluate whether input sequences will be aligned following one direction,
  ## forward - left to right - or both directions meaning forward/reverse
  if isinstance(parameters["both_direction"], str):
    parameters["both_direction"] = parameters["both_direction"].lower() =="true"

  ## In normal cases, we don't really need to define a specific datatype to
  ## build alignments but we need the key defined BEFORE the checks below,
  ## otherwise they fail with a KeyError when no datatype is configured
  if not "residue_datatype" in parameters:
    parameters["residue_datatype"] = ""

  ## Nothing below re-assigns the datatype, so evaluate the mode only once
  backtranslate = parameters["residue_datatype"] in backtrans_modes

  ## Check whether if an special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  ## If not mode is define, we will work with a datatype - normally proteins
  if "cds" in parameters and not backtranslate:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if not "cds" in parameters and backtranslate:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## Get some information such as number of input sequences and the presence of
  ## selenocysteine/pyrrolysine residues
  numSeqs, selenocys, pyrrolys = check_count_sequences(parameters["in_file"])

  ## Set the minimum number of sequences required to reconstruct an alignment
  min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
    min_seqs_analysis)

  ## Finish when there are not enough sequences to make an alignment
  if numSeqs < min_seqs:
    print(("### INFO: It is necessary, at least, %d sequences to "
      + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs), file = \
      logFile)
    sys.exit(80)

  ## Otherwise, process the input sequence, substitute rare amino-acids and
  ## reverse input sequences when neccesary

  ## Reverse input sequences if needed it
  if parameters["both_direction"]:

    ## If get an positive answer means, the reverse sequence file has been
    ## generated and therefore any downstream file should be over-written
    out_file = ("%s.seqs.reverse") % (oFile)

    if reverseSequences(parameters["readal"], parameters["in_file"], \
      out_file, parameters["replace"], logFile):
      parameters["replace"] = True

  ## Substitute rare amino-acids if needed it
  if selenocys or pyrrolys:

    out_file = ("%s.seqs.no_rare_aa") % (oFile)

    ## If the output file has been generated, over-write, if any, downstream
    ## files
    if replaceRareAminoAcids(parameters["in_file"], out_file, \
      parameters["replace"], logFile, parameters["in_letter"]):
      parameters["replace"] = True

    ## If there is a reverse file, replace also the rare amino-acids in that one
    if parameters["both_direction"]:

      in_file = ("%s.seqs.reverse") % (oFile)
      out_file = ("%s.seqs.no_rare_aa.reverse") % (oFile)

      ## Replace any downstream file is the current one is generated again
      if replaceRareAminoAcids(in_file, out_file, parameters["replace"], \
        logFile, parameters["in_letter"]):
        parameters["replace"] = True

  ## Set in which directions alignments will be reconstructed
  directions = ["forward"]
  if parameters["both_direction"]:
    directions.append("reverse")

  generated_alignments = set()
  ## Once all required sequence files has been set-up, proceed to build the
  ## alignments itself.
  for prog in parameters["alignment"]:

    ## Get binary as well as any input parameters for each aligner and the
    ## output file extension
    binary = parameters[prog]

    key = ("%s_params") % (prog)
    params = parameters[key] if key in parameters else ""

    altern_ext = ("%s%s") % (prog[:2], prog[-1])
    extension = file_extension[prog] if prog in file_extension else altern_ext

    ## Generate as many alignments as needed
    for direc in directions:

      ## Set the input file depending on the presence of rare amino-acids
      if direc == "forward":
        in_file = ("%s.seqs.no_rare_aa") % (oFile) if selenocys \
          or pyrrolys else parameters["in_file"]
      else:
        in_file = ("%s.seqs.no_rare_aa.reverse") % (oFile) if selenocys \
          or pyrrolys else ("%s.seqs.reverse") % (oFile)

      out_file = ("%s.alg.%s%s.%s") % (oFile, "no_rare_aa." if selenocys \
        or pyrrolys else "", direc, extension)

      ## Perfom alignment and check whether it has been generated or already
      ## exist
      if perfomAlignment(prog, binary, params, in_file, out_file,
        logFile, parameters["replace"]):
        parameters["replace"] = True

      ## If any Selenocysteine or Pyrrolyseine is present, generate the final
      ## alignment removing the wild cards and putting back the original amino-
      ## acids
      if selenocys or pyrrolys:
        ## Get real output filename
        alt_file = ("%s.alg.%s.%s") % (oFile, direc, extension)

        ## Make the change and record whether files has been generated de-novo
        if replaceRareAminoAcids(out_file, alt_file, parameters["replace"], \
          logFile, parameters["in_letter"], back = True):
          parameters["replace"] = True

        ## We over-write out_file variable with the current outfile name.We will
        ## store such output file in case a meta-alignment has to be generated
        out_file = alt_file

      ## For reverse alignment, get its reverse - meaning get residues according
      ## to the initial order
      if direc == "reverse":
        in_file = ("%s.alg.reverse.%s") % (oFile, extension)
        out_file = ("%s.alg.reverse.forw.%s") % (oFile, extension)

        if reverseSequences(parameters["readal"], in_file, out_file, \
          parameters["replace"], logFile):
          parameters["replace"] = True

      ## Store all output alignments
      generated_alignments.add(out_file)

  if len(generated_alignments) > 1 and "consensus" in parameters:
    prog = parameters["consensus"][0]
    if not prog in parameters:
      sys.exit(("ERROR: Selected program '%s' is not available accordding to "
        "the configuration file") % (prog))

    ## Get binary as well as any input parameters for each aligner and the
    ## output file extension
    binary = parameters[prog]
    prog_params = ("%s_params") % (prog)

    params = parameters[prog_params] if prog_params in parameters else ""
    params = ("%s -aln %s") % (params, " ".join(generated_alignments))

    out_file = ("%s.alg.metalig") % (oFile)
    if perfomAlignment(prog, binary, params, parameters["in_file"], out_file,
      logFile, parameters["replace"]):
      parameters["replace"] = True

    ## Make such untrimmed alignment it is in phylip format
    convertInputFile_Format("readal", parameters["readal"], out_file,out_file,
      "phylip", logFile, parameters["replace"])

  ## Set the current output alignment as the one generated at a previous step
  else:
    ## NOTE(review): pop() also removes the chosen alignment from the set, so
    ## the "more than one alignment" test done before trimming sees one entry
    ## fewer on this path - confirm this is the intended behaviour
    out_file = generated_alignments.pop()

    ## Make such untrimmed alignment it is in phylip format
    convertInputFile_Format("readal", parameters["readal"], out_file,out_file,
      "phylip", logFile, parameters["replace"])

  ## Either we have to trim the final alignment or we have to backtranslate to
  ## codons/nucleotides, we will need to check for a program - hopefully
  ## trimAl - to make the job
  if backtranslate or "trimming" in parameters:

    ## The back-translation relies on the trimming program as well: report a
    ## missing definition properly instead of failing with a KeyError
    if not "trimming" in parameters:
      sys.exit("ERROR: Check your configuration file. There is no definition "
        + "for the TRIMMING step, which is needed to back-translate the "
        + "alignment")

    prog = parameters["trimming"][0]
    if not prog in parameters:
      sys.exit(("ERROR: Selected program '%s' is not available accordding to "
        "the configuration file") % (prog))

    ## Get binary as well as any input parameters for each aligner and the
    ## output file extension
    binary = parameters[prog]

  ## If the modes "prot2codon" or "prot2nuc" are selected - backtranslated the
  ## untrimmed/final alignment
  if backtranslate:

    prog_params = ("%s_cds") % (prog)
    params = parameters[prog_params] if prog_params in parameters else ""

    if (trimmingAlignment(prog, binary, params, out_file + "_cds", logFile,
      parameters["replace"], in_file = out_file, cds = parameters["cds"])):
      parameters["replace"] = True

    ## Make such untrimmed alignment it is in phylip format
    convertInputFile_Format("readal", parameters["readal"], out_file + "_cds",
      out_file + "_cds" ,"phylip", logFile, parameters["replace"])

  ## If set, trim resulting alignment
  if "trimming" in parameters:
    prog = parameters["trimming"][0]
    if not prog in parameters:
      sys.exit(("ERROR: Selected program '%s' is not available accordding to "
        "the configuration file") % (prog))

    ## Get binary as well as any input parameters for each aligner and the
    ## output file extension
    prog_params = ("%s_params") % (prog)
    params = parameters[prog_params] if prog_params in parameters else ""

    clean_file = ("%s.alg.clean") % (oFile)

    prog_params = ("%s_compare") % (prog)
    if len(generated_alignments) > 1:
      if prog_params in parameters:
        params = ("%s %s") % (params, parameters[prog_params])

      ## The list of alignments has to be completely written - and the file
      ## closed - before the trimming program reads it
      path_file = ("%s.alg.paths") % (oFile)
      with open(path_file, "w") as paths_handle:
        print("\n".join(generated_alignments), file = paths_handle)

      trimmingAlignment(prog, binary, params, clean_file, logFile,
        parameters["replace"], compare_msa = path_file, force_refer_msa = \
        out_file)

      ## If the backtranslation to codon/nucleotides is required, do it
      if backtranslate:
        prog_params = ("%s_cds") % (prog)
        if prog_params in parameters:
          params = ("%s %s") % (params, parameters[prog_params])

        trimmingAlignment(prog, binary, params, clean_file + "_cds", logFile,
          parameters["replace"], compare_msa = path_file, force_refer_msa = \
          out_file, cds = parameters["cds"])

    else:
      trimmingAlignment(prog, binary, params, clean_file, logFile,
        parameters["replace"], in_file = out_file)

      ## If the backtranslation to codon/nucleotides is required, do it
      if backtranslate:
        prog_params = ("%s_cds") % (prog)
        if prog_params in parameters:
          params = ("%s %s") % (params, parameters[prog_params])

        trimmingAlignment(prog, binary, params, clean_file + "_cds", logFile,
          parameters["replace"], in_file = out_file, cds = parameters["cds"])

    ## After the trimming, set the final output file as the trimmed file
    out_file = clean_file + ("_cds" if backtranslate else "")

  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tMultipple Sequence Alignment\tEND\t"
    + "%s") % (date), file = logFile)

  ## Log the total time spent on this step
  total = format_time(final - start)
  print(("###\tTOTAL Time\tMultiple Sequence Alignment\t%s"
    + "\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    ## NOTE(review): '/^M/d' drops every line starting with a capital "M"; it
    ## looks like a carriage return (Ctrl-M) was meant - confirm before
    ## changing it
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file = sys.stderr)

  ## The null-device handle is ours as well: release it. sys.stderr is left
  ## untouched
  elif logFile is not sys.stderr:
    logFile.close()

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = out_file

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
예제 #4
0
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \
  replace):
  '''
  Build the command line of the aligner identified by 'label', run it through
  the shell and validate the result.

  Supported labels: muscle, kalign, clustalw, clustal_omega, mafft, prank,
  dialign_tx, t_coffee and m_coffee. Any other label ends the process with
  exit_codes["generic"].

  Returns False, without running anything, when lookForFile() reports that
  'out_file' exists and 'replace' is not set; returns True when the program
  finished and checkAlignment() accepted the output. A launch failure, a
  non-zero exit status or a failed check end the process with
  exit_codes[label].
  '''

  ## An existing output file is only regenerated on request
  output_exists = lookForFile(out_file)
  if output_exists and not replace:
    return False

  ## Aligners taking: binary, parameters, input file and output file - in
  ## that order. Only the way of introducing each file changes
  standard_args = (binary, parameters, in_file, out_file)
  command_formats = {
    "muscle":        "%s %s -in %s -out %s",
    "kalign":        "%s %s -in %s -out %s",
    "clustalw":      "%s %s -INFILE=%s -OUTFILE=%s",
    "clustal_omega": "%s %s --in %s --out %s",
    "mafft":         "%s %s %s > %s",
    "prank":         "%s %s -d=%s -o=%s",
    "dialign_tx":    "%s %s %s %s",
  }

  if label in command_formats:
    cmd = command_formats[label] % standard_args

  ## T-Coffee/M-Coffee: a per-user working directory is created and exported
  ## through the environment before the call. The input file goes before the
  ## parameters for these programs
  elif label in ("t_coffee", "m_coffee"):
    sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell = True)
    drc = ("/tmp/tcoffee/%s") % (getuser())
    sp.call(("mkdir -p -m0777 %s") % (drc), shell = True)
    for variable in ("LOCKDIR_4_TCOFFEE", "TMP_4_TCOFFEE"):
      os.putenv(variable, drc)

    cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters, out_file)

  ## Unknown program: finish with the generic error code
  else:
    sys.exit(exit_codes["generic"])

  ## Write down the host, the starting time and the exact command line
  machine = getfqdn()
  begin = datetime.datetime.now()
  when = begin.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\t%s - Alignment\t%s") % (label.upper(), when), file = \
    logFile)
  print(("###\t[%s]\tCommand-line\t%s\n###") % (machine, cmd), file = logFile)
  logFile.flush()

  try:
    process = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile)
  except OSError as failure:
    print("ERROR: Execution failed: " + str(failure), file = sys.stderr)
    sys.exit(exit_codes[label])

  status = process.wait()
  if status != 0:
    print(("ERROR: Execution failed: %s [exit code != -1]") \
      % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  ## Log the time spent by the aligner
  end = datetime.datetime.now()
  spent = format_time(end - begin)
  print(("###\tTime\t%s\n###") % (spent), file = logFile)
  logFile.flush()

  ## PRANK appends ".best.<suffix>" to the requested output name, the suffix
  ## depending on the "-f=" option. Move that file to the expected name
  if label == "prank":
    if "-f=" not in parameters:
      suffix = "fas"
    elif "-f=nexus" in parameters:
      suffix = "nex"
    else:
      suffix = "phy"

    if lookForFile(out_file + ".best." + suffix):
      sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file), shell = True)

  ## T-Coffee/M-Coffee: remove the guide tree (.dnd), named after the input
  ## file without its last extension, from the working directory
  if label in ("t_coffee", "m_coffee"):
    input_name = os.path.split(in_file)[1]
    guide_tree = ".".join(input_name.split(".")[:-1])
    sp.call(("rm -f %s.dnd") % (guide_tree), shell = True)

  ## Everything is fine when the output alignment passes the check against
  ## the input sequences
  if checkAlignment(in_file, out_file):
    return True

  ## Otherwise report it and finish. The faulty output file is left in place
  print(("ERROR: Check input '%s' and output '%s' alignments") % (in_file, \
    out_file), file = sys.stderr)
  print(("ERROR: Execution failed: %s [file check]") % \
    (label.upper()), file = sys.stderr)
  sys.exit(exit_codes[label])
예제 #5
0
  ## Reconstruct the Multiple Sequence Alignment for the selected sequences.
  ## alignment() hands back the parameters dictionary, which is merged into
  ## the current one
  parameters.update(alignment(parameters))

  ## Assign which step is being executed. It is useful to know whether the log
  ## file should be replaced or not - even when the flag "replace" is set
  parameters["step"] = 2

  ## Reconstruct the phylogenetic trees using the parameters updated by the
  ## alignment step
  phylogenetic_trees(parameters)

  ## Get final time
  final = datetime.datetime.now()

  ## Build the list of executed steps and the formatted elapsed time.
  ## NOTE(review): 'args' and 'start' are defined outside this fragment -
  ## presumably the parsed command-line options and the starting timestamp
  steps = "', '".join(args.steps)
  total = format_time(final - start if start else 0)

  ## Dump into stderr - when requested all verbose info or just stderr
  if parameters["verbose"] > 0:
    print(("\n###\tTOTAL Time\t[ '%s' ]\t%s\n###") % (steps, total), file = \
      sys.stderr)

  ## Dump into logfile - when requested all verbose info or just logfile
  if parameters["verbose"] == 1:
    ## Get output folder/generic filename - Set output filename and log file.
    ## The log is opened in append mode, so previous content is kept
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])
    logFile = open(oFile + ".log", "a+")
    print(("\n###\tTOTAL Time\t[ '%s' ]\t%s\n###") % (steps, total), file = \
      logFile)
    logFile.close()
예제 #6
0
def phylogenetic_trees(parameters):
    ''' Phylogenetic trees are reconstructed according to the input parameters.
      Once the different files have been generated, the function moves those
      files into a pre-established filename schema
  '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will be
    ## generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, 'wb')

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters[
            "step"] == 0 else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t" +
           "%s\n###") % (date),
          file=logFile)
    logFile.flush()

    ## Get which program will be used to reconstruct phylogenetic trees. Check
    ## such program is listed among the available binaries
    if not "tree" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is no definition for "
            + "the Phylogenetic TREE reconstruction step")

    prog = parameters["tree"][0]
    if not prog in parameters:
        sys.exit(
            ("ERROR: Selected program '%s' is not available accordding to the "
             "the configuration file") % (prog))

    ## Get binary as well as any default parameters for the selected program
    binary = parameters[prog]
    key = ("%s_params") % (prog)
    progr_params = parameters[key] if key in parameters else ""

    if not "evol_models" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is no definition for "
            + "the <evol_models> parameter")

    ## If the evolutionary model list is not appropiately formated, do it
    if isinstance(parameters["evol_models"], str):
        parameters["evol_models"] = list(
            map(strip, parameters["evol_models"].split()))

    ## Check if <numb_models parameters is defined and how many models are
    ## requested to be evaluated
    if not "numb_models" in parameters or parameters["numb_models"].lower() \
      == "all":
        parameters["numb_models"] = len(parameters["evol_models"])
    parameters["numb_models"] = int(parameters["numb_models"])

    if not parameters["numb_models"] in range(
            1,
            len(parameters["evol_models"]) + 1):
        sys.exit(
            ("ERROR: Check how many evolutionary models has been asked to re" +
             "construct '%d'") % (parameters["numb_models"]))

    ## Check whether "readAl" is available or not. It is useful for sequences
    ## manipulation independently of the input format.
    if not "readal" in parameters:
        sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

    ## Create a temporary FASTA file which will be used to detect the sequence
    ## number on the input alignment and the presence of rare amino-acids
    TEMPFILE = tempfile.NamedTemporaryFile()
    convertInputFile_Format("readal", parameters["readal"],
                            parameters["in_file"], TEMPFILE.name, "fasta",
                            logFile, parameters["replace"])
    TEMPFILE.flush()

    numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

    ## Set the minimum number of sequences required to reconstruct an alignment
    min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
      min_seqs_analysis)

    ## Finish when there are not enough sequences to make an alignment
    if numSeqs < min_seqs:
        print(("### INFO: It is necessary, at least, %d sequences to " +
               "to reconstruct an alignment (%d)") % (min_seqs, numSeqs),
              file=logFile)
        sys.exit(80)

    ## Check which approaches should be used for the phylogenetic reconstruction
    ## and whether there are specific program's parameters for them
    if not "tree_approach" in parameters:
        parameters["tree_approach"] = ["ml"]

    ## Remove potential duplicates and lowercase all approaches for the tree
    ## reconstruction
    parameters["tree_approach"] = set([p.lower() for p in \
      parameters["tree_approach"]])

    ## We will first loot for Neighbour Joining tree reconstruction, then for
    ## Maximum likelihood and then for any other approach defined in the config
    ## file
    tree_approaches = []
    if "nj" in parameters["tree_approach"]:
        tree_approaches.append("nj")
    if "ml" in parameters["tree_approach"]:
        tree_approaches.append("ml")
    others = parameters["tree_approach"] - set(["nj", "ml"])
    if others != set():
        tree_approaches += sorted(others)

    ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines are
    ## present in the input alignment
    if prog in ["raxml"]:
        ## If Selenocysteines or Pyrrolysines are present, substitute them by "X"
        if selenocys or pyrrolys:
            out_file = ("%s.no_rare_aa") % (parameters["in_file"])

            if replaceRareAminoAcids(TEMPFILE.name, out_file,
                                     parameters["replace"], logFile,
                                     "U:X O:X"):
                parameters["replace"] = True
            parameters["in_file"] = out_file
        TEMPFILE.close()

    ## When using FastTree force the conversion of input alignment to FASTA format
    ## since it may crash reading standard interleave PHYLIP format files
    if prog in ["fasttree"]:

        in_file_format, aligned = getFileFormat("readal", parameters["readal"], \
          parameters["in_file"], logFile)

        if in_file_format != "fasta":
            out_file = ("%s.fa") % (parameters["in_file"])
            if (convertInputFile_Format("readal", parameters["readal"], \
              parameters["in_file"], out_file, "fasta", logFile,
              parameters["replace"])):
                parameters["replace"] = True
            parameters["in_file"] = out_file

    replace = parameters["replace"]
    selected_models = parameters["evol_models"]
    ## Reconstruct trees for each approach considering evolutionary models order
    ## according their likelihood values
    for approach in tree_approaches:

        ## Save results - we will use such data for selecting the best -if required-
        ## models fitting to the input data
        results = {}

        ## Format the choosen program's parameters according to the default ones and
        ## the specific ones for the current approach
        params = ("%s ") % (progr_params)
        params += parameters[approach] if approach in parameters else ""

        for model in selected_models:
            out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
            stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach,
                                                    model)

            if prog in ["phyml"]:
                exec_params = ("%s -m %s") % (params, model)

            ## Get additional model -if any- for codons
            elif prog in ["codonphyml"]:
                exec_params = ("%s -m %s") % (params, model)

                add_model = [p.split()[1] for p in map(strip, exec_params.split("-")) \
                  if p.startswith("fmodel")]

                if len(add_model) == 1:
                    add_model = add_model.pop()
                    model = ("%s_%s") % (model, add_model)
                    out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog,
                                                          approach, model)
                    stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog,
                                                            approach, model)

            elif prog in ["fasttree"]:
                ## On FastTree is selected by default JTT model for AAs - so we don't
                ## set-up that model
                exec_params = ("%s -%s") % (params, model) if model.lower() != "jtt" \
                  and model.lower() != "jc" else params
                model = model.upper()

            ## In the case of RAxML, we would concatenate the model to an specific
            ## input parameter
            elif prog in ["raxml"]:
                final_model = model
                ## It is possible to add some suffixes to the evolutionary models
                ## in RAxML - There is not better/easy way to code this option
                if "raxml_model_suffix" in parameters:
                    final_model += parameters["raxml_model_suffix"]
                exec_params = " ".join([
                    ("-%s%s") % (p, final_model if p.startswith("m ") else "")
                    for p in map(strip, params.split("-")) if p
                ])

            ## Build the phylogenetic tree using any of the available methods and
            ## register if any downstream file should be redone.
            if perform_tree(prog, binary, exec_params, parameters["in_file"],
                            out_file, stats_file, logFile,
                            parameters["replace"]):
                replace = True

            ## Get the likelihood for each of the reconstructed models
            log_lk = get_likelihood(prog, stats_file)

            if not log_lk:
                print(("ERROR: Impossible to the Log likelihood values "
                  + "for '%s' model using this program '%s'") % (model, prog), file = \
                  sys.stderr)
                sys.exit(exit_codes[prog])

            results.setdefault(model, log_lk)

        ## Get the models sorted by their likelihood values
        records = sorted(iter(results.items()),
                         key=itemgetter(1),
                         reverse=True)

        ## Set the filename which stores the ranking
        rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

        update = False
        ## Check the content of the rankings file - if any.
        ## Marked the file as updatable if there is any discrepancy
        if not replace and lookForFile(rank_file):

            old_content = "\n".join([
                "\t".join(list(map(strip, line.split("\t"))))
                for line in open(rank_file, "rU")
            ])

            newly_generated = "\n".join([("%s\t%s") % (r[0], r[1])
                                         for r in records])

            ## Decide whether ranking file should be updated after comparing current
            ## content with newly generated content
            update = old_content != newly_generated

        ## If the file containing the ranking doesn't exist, generate it.
        ## Update the file content if the replace flag is set to true or the content
        ## has changed - since the phylogenetic tree reconstruction step is the most
        ## expensive one - in terms of time/memory consumption - we are not setting
        ## replace flag to True even when this file is generated/updated. On this
        ## way, we can take adventage of any tree generated in any downstream step.
        if not lookForFile(rank_file) or replace or update:

            out_file = open(rank_file, "w")
            print("\n".join([("%s\t%s") % (r[0], r[1]) for r in records]), file = \
              out_file)
            out_file.close()

            ## We could set the replace flag to True. However, if any tree has been
            ## generated 'de novo' during this iteration, then the flag is already set
            ## to True.
            #~ parameters["replace"] = True

        ## Select a given number of models for the next iteration - if any
        selected_models = [
            pair[0] for pair in records[:parameters["numb_models"]]
        ]

        ## Remove the Codon Frequency model from potential new iterations
        if prog in ["codonphyml"] and add_model:
            selected_models = [
                m.replace("_" + add_model, "") for m in selected_models
                if m.endswith(add_model)
            ]

    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t" + "%s") %
          (date),
          file=logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s" + "\n###") %
          (total),
          file=logFile)
    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
              % (oFile), file=sys.stderr)

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
예제 #7
0
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
  logFile, replace):
    '''
  Function to format the command-line of different phylogenetic tree reconstruc-
  tion programs and execute such command lines.

  Arguments:
    label       program identifier. Supported values are "phyml", "codonphyml",
                "fasttree" and "raxml"
    binary      executable used to build the command-line
    parameters  string with additional options appended to the command-line
    in_file     input alignment
    out_file    file where the resulting tree is finally stored
    stats_file  file where the program statistics are finally stored
    logFile     open file object which receives the log lines as well as the
                stdout/stderr of the executed program
    replace     if True, the tree is built even when out_file already exists

  Returns False, without executing anything, when out_file already exists and
  replace is not set. Returns True once the program has been executed and its
  output files renamed. The execution is finished through sys.exit when the
  label is not supported, the program fails or its output files cannot be
  renamed.
  '''
    ## Local import: only this function needs it
    import shutil

    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    if label in ["phyml", "codonphyml"]:
        cmd = ("%s -i %s %s") % (binary, in_file, parameters)

    elif label in ["fasttree"]:
        cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file, \
          out_file, in_file)

    elif label in ["raxml"]:
        ## The random number is used both as the RAxML seed (-p) and as part of
        ## the run name (-n), which is needed later on to locate output files
        random_seed = randint(1, 10000)
        suffix = ("%s_%d") % (label, random_seed)

        cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, in_file, \
          parameters)

    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ', \
      file = logFile)
    print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd), file = \
      logFile)
    logFile.flush()

    try:
        ## stdin is set as a pipe to be able to send some input to the program
        proc = sp.Popen(cmd,
                        shell=True,
                        stderr=logFile,
                        stdout=logFile,
                        stdin=sp.PIPE)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Send a fixed input - presumably answers to interactive prompts (TODO
    ## confirm) - and wait until the program finishes
    proc.communicate(b'\n\nY\n')

    if proc.wait() != 0:
        print(("ERROR: Execution failed: %s") % (label.upper()),
              file=sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()
    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTime\t%s\n###") % (total), file=logFile)
    logFile.flush()

    ## Process program's output and rename output files according to our own
    ## scheme
    if label in ["phyml", "codonphyml"]:

        ## Since resulting tree/stats file have slightly changed between version,
        ## we have to control for that.
        tree_file = ("%s_%s_tree.txt") % (in_file, label)
        sts_file = ("%s_%s_stats.txt") % (in_file, label)
        if not lookForFile(tree_file, attempts=2):
            tree_file = ("%s_%s_tree") % (in_file, label)
            sts_file = ("%s_%s_stats") % (in_file, label)

        ## shutil.move raises an OSError when a file cannot be renamed. A shell
        ## 'mv' would only return a non-zero code, so the error handling below
        ## would never be reached
        try:
            shutil.move(tree_file, out_file)
            shutil.move(sts_file, stats_file)
        except OSError:
            print(("ERROR: Impossible to rename '%s' output files") \
              % (label.upper()), file=sys.stderr)
            sys.exit(exit_codes[label])

    elif label in ["raxml"]:
        try:
            shutil.move(("RAxML_bestTree.%s") % (suffix), out_file)
            shutil.move(("RAxML_info.%s") % (suffix), stats_file)
        except OSError:
            print(("ERROR: Impossible to rename RAxML output files"), file = \
              sys.stderr)
            sys.exit(exit_codes[label])

        ## Append any other file returned by listDirectory for the current run
        ## name to the stats file, under a header with its name, and remove it
        with open(stats_file, "a+") as oFile:
            for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
                fileName = os.path.split(oth_file)[1]
                hz_line = "#" * (len(fileName) + 4)
                print(("%s\n%s\n%s") % (hz_line, fileName, hz_line), file=oFile)
                ## Plain "r" mode: the "U" flag is not accepted anymore by recent
                ## Python versions
                with open(oth_file, "r") as oth_handle:
                    print(("%s") % (oth_handle.read()), file=oFile)
                sp.call(("rm -f %s") % (oth_file), shell=True)

    return True
예제 #8
0
def alignment(parameters):
    '''
  Function to drive the Multiple Sequence Alignment step: sequences in
  parameters["in_file"] are aligned with every program listed in
  parameters["alignment"], in forward and - optionally - reverse direction.
  Resulting alignments may be combined by a consensus program, back-translated
  using a CDS file and/or trimmed.

  Keys read from the 'parameters' dictionary:
    out_directory, prefix   used to build every output filename
    verbose                 0/unset: no log, 1: '<prefix>.log' file, 2: stderr
    replace, step           control whether existing files are regenerated
    alignment, consensus, trimming
                            lists of program names for each sub-step
    <program>, <program>_params, <program>_cds, <program>_compare
                            binary and specific options for a given program
    readal                  binary used to reverse/convert sequence files
    both_direction          whether reverse alignments are also generated
    residue_datatype, cds   back-translation mode and its input CDS file
    in_file, in_letter, min_seqs

  Returns the same dictionary with "in_file" pointing to the final alignment.
  "replace" is set to True as soon as any intermediate file is generated. It
  also sets "residue_datatype" to "" when it is not defined.

  The execution is finished through sys.exit if the configuration is not
  consistent or there are fewer input sequences than the required minimum.
  '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will be
    ## generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value
    if not "verbose" in parameters or parameters["verbose"] == 0:
        ## Text mode is needed since log lines are written using print(), which
        ## fails on files opened in binary mode
        logFile = open(os.devnull, 'w')

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters[
            "step"] == 0 else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(
        ("###\n###\tSTEP\tMultiple Sequence Alignment\tSTART\t%s" + "\n###") %
        (date),
        file=logFile)
    logFile.flush()

    ## Get which program/s will be used to align the input sequences. Check such
    ## program/s are listed among the available binaries
    if not "alignment" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is no definition for "
            + "the ALIGNMENT step")

    for program in parameters["alignment"]:
        if not program in parameters:
            sys.exit(
                ("ERROR: Selected program '%s' is not available accordding to "
                 "the configuration file") % (program))

    ## Check whether "readAl" is available or not. It is useful for sequences
    ## manipulation independently of the input format.
    if not "readal" in parameters:
        sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

    ## Evaluate whether input sequences will be aligned following one direction,
    ## forward - left to right - or both directions meaning forward/reverse
    if isinstance(parameters["both_direction"], str):
        parameters["both_direction"] = parameters["both_direction"].lower(
        ) == "true"

    ## In normal cases, we don't really need to define a specific datatype to
    ## build alignments but we need the variable defined to avoid crashes in the
    ## checks below - that is why the default value is set before them
    if not "residue_datatype" in parameters:
        parameters["residue_datatype"] = ""

    ## Check whether if an special mode has been selected - for instance
    ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
    ## If not mode is define, we will work with a datatype - normally proteins
    if "cds" in parameters and not parameters["residue_datatype"] in \
      ["prot2codon", "prot2nuc"]:
        sys.exit(
            "ERROR: To use an additional CDS file, you should set the <parame"
            + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

    if not "cds" in parameters and parameters["residue_datatype"] in \
      ["prot2codon", "prot2nuc"]:
        sys.exit(
            "ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
            + "'prot2nuc', an input CDS file is needed")

    ## Get some information such as number of input sequences and the presence of
    ## selenocysteine/pyrrolysine residues
    numSeqs, selenocys, pyrrolys = check_count_sequences(parameters["in_file"])

    ## Set the minimum number of sequences required to reconstruct an alignment
    min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
      min_seqs_analysis)

    ## Finish when there are not enough sequences to make an alignment
    if numSeqs < min_seqs:
        print(("### INFO: It is necessary, at least, %d sequences to "
          + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs), file = \
          logFile)
        sys.exit(80)

    ## Otherwise, process the input sequence, substitute rare amino-acids and
    ## reverse input sequences when neccesary

    ## Reverse input sequences if needed it
    if parameters["both_direction"]:

        ## If get an positive answer means, the reverse sequence file has been
        ## generated and therefore any downstream file should be over-written
        out_file = ("%s.seqs.reverse") % (oFile)

        if reverseSequences(parameters["readal"], parameters["in_file"], \
          out_file, parameters["replace"], logFile):
            parameters["replace"] = True

    ## Substitute rare amino-acids if needed it
    if selenocys or pyrrolys:

        out_file = ("%s.seqs.no_rare_aa") % (oFile)

        ## If the output file has been generated, over-write, if any, downstream
        ## files
        if replaceRareAminoAcids(parameters["in_file"], out_file, \
          parameters["replace"], logFile, parameters["in_letter"]):
            parameters["replace"] = True

        ## If there is a reverse file, replace also the rare amino-acids in that one
        if parameters["both_direction"]:

            in_file = ("%s.seqs.reverse") % (oFile)
            out_file = ("%s.seqs.no_rare_aa.reverse") % (oFile)

            ## Replace any downstream file is the current one is generated again
            if replaceRareAminoAcids(in_file, out_file, parameters["replace"], \
              logFile, parameters["in_letter"]):
                parameters["replace"] = True

    ## Set in which directions alignments will be reconstructed
    directions = ["forward"]
    if parameters["both_direction"]:
        directions.append("reverse")

    generated_alignments = set()
    ## Once all required sequence files has been set-up, proceed to build the
    ## alignments itself.
    for prog in parameters["alignment"]:

        ## Get binary as well as any input parameters for each aligner and the
        ## output file extension
        binary = parameters[prog]

        key = ("%s_params") % (prog)
        params = parameters[key] if key in parameters else ""

        ## When there is no predefined extension for the program, build one
        ## using its two first letters and its last one
        altern_ext = ("%s%s") % (prog[:2], prog[-1])
        extension = file_extension[
            prog] if prog in file_extension else altern_ext

        ## Generate as many alignments as needed
        for direc in directions:

            ## Set the input file depending on the presence of rare amino-acids
            if direc == "forward":
                in_file = ("%s.seqs.no_rare_aa") % (oFile) if selenocys \
                  or pyrrolys else parameters["in_file"]
            else:
                in_file = ("%s.seqs.no_rare_aa.reverse") % (oFile) if selenocys \
                  or pyrrolys else ("%s.seqs.reverse") % (oFile)

            out_file = ("%s.alg.%s%s.%s") % (oFile, "no_rare_aa." if selenocys \
              or pyrrolys else "", direc, extension)

            ## Perfom alignment and check whether it has been generated or already
            ## exist
            if perfomAlignment(prog, binary, params, in_file, out_file,
                               logFile, parameters["replace"]):
                parameters["replace"] = True

            ## If any Selenocysteine or Pyrrolyseine is present, generate the final
            ## alignment removing the wild cards and putting back the original amino-
            ## acids
            if selenocys or pyrrolys:
                ## Get real output filename
                alt_file = ("%s.alg.%s.%s") % (oFile, direc, extension)

                ## Make the change and record whether files has been generated de-novo
                if replaceRareAminoAcids(out_file, alt_file, parameters["replace"], \
                  logFile, parameters["in_letter"], back = True):
                    parameters["replace"] = True

                ## We over-write out_file variable with the current outfile name.We will
                ## store such output file in case a meta-alignment has to be generated
                out_file = alt_file

            ## For reverse alignment, get its reverse - meaning get residues according
            ## to the initial order
            if direc == "reverse":
                in_file = ("%s.alg.reverse.%s") % (oFile, extension)
                out_file = ("%s.alg.reverse.forw.%s") % (oFile, extension)

                if reverseSequences(parameters["readal"], in_file, out_file, \
                  parameters["replace"], logFile):
                    parameters["replace"] = True

            ## Store all output alignments
            generated_alignments.add(out_file)

    if len(generated_alignments) > 1 and "consensus" in parameters:
        prog = parameters["consensus"][0]
        if not prog in parameters:
            sys.exit(
                ("ERROR: Selected program '%s' is not available accordding to "
                 "the configuration file") % (prog))

        ## Get binary as well as any input parameters for each aligner and the
        ## output file extension
        binary = parameters[prog]
        prog_params = ("%s_params") % (prog)

        params = parameters[prog_params] if prog_params in parameters else ""
        params = ("%s -aln %s") % (params, " ".join(generated_alignments))

        out_file = ("%s.alg.metalig") % (oFile)
        if perfomAlignment(prog, binary, params, parameters["in_file"],
                           out_file, logFile, parameters["replace"]):
            parameters["replace"] = True

        ## Make such untrimmed alignment it is in phylip format
        convertInputFile_Format("readal", parameters["readal"], out_file,
                                out_file, "phylip", logFile,
                                parameters["replace"])

    ## Set the current output alignment as the one generated at a previous step
    else:
        ## NOTE(review): pop() removes the selected alignment from the set, so
        ## the 'len(generated_alignments) > 1' check at the trimming step sees
        ## one element less. Besides, which element is returned is arbitrary
        ## when there are several - confirm this is intended.
        out_file = generated_alignments.pop()

        ## Make such untrimmed alignment it is in phylip format
        convertInputFile_Format("readal", parameters["readal"], out_file,
                                out_file, "phylip", logFile,
                                parameters["replace"])

    ## Either we have to trim the final alignment or we have to backtranslate to
    ## codons/nucleotides, we will need to check for a program - hopefully
    ## trimAl - to make the job
    if parameters["residue_datatype"] in ["prot2codon","prot2nuc"] or "trimming" \
      in parameters:

        ## The back-translation relies on the program set for the trimming step.
        ## Finish with an explicit message rather than with a KeyError
        if not "trimming" in parameters:
            sys.exit(
                "ERROR: Check your configuration file. There is no definition "
                + "for the TRIMMING step which is needed to back-translate the "
                + "alignment")

        prog = parameters["trimming"][0]
        if not prog in parameters:
            sys.exit(
                ("ERROR: Selected program '%s' is not available accordding to "
                 "the configuration file") % (prog))

        ## Get binary as well as any input parameters for each aligner and the
        ## output file extension
        binary = parameters[prog]

    ## If the modes "prot2codon" or "prot2nuc" are selected - backtranslated the
    ## untrimmed/final alignment
    if parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]:

        prog_params = ("%s_cds") % (prog)
        params = parameters[prog_params] if prog_params in parameters else ""

        if (trimmingAlignment(prog,
                              binary,
                              params,
                              out_file + "_cds",
                              logFile,
                              parameters["replace"],
                              in_file=out_file,
                              cds=parameters["cds"])):
            parameters["replace"] = True

        ## Make such untrimmed alignment it is in phylip format
        convertInputFile_Format("readal", parameters["readal"],
                                out_file + "_cds", out_file + "_cds", "phylip",
                                logFile, parameters["replace"])

    ## If set, trim resulting alignment
    if "trimming" in parameters:
        prog = parameters["trimming"][0]
        if not prog in parameters:
            sys.exit(
                ("ERROR: Selected program '%s' is not available accordding to "
                 "the configuration file") % (prog))

        ## Get binary as well as any input parameters for each aligner and the
        ## output file extension
        prog_params = ("%s_params") % (prog)
        params = parameters[prog_params] if prog_params in parameters else ""

        clean_file = ("%s.alg.clean") % (oFile)

        prog_params = ("%s_compare") % (prog)
        if len(generated_alignments) > 1:
            if prog_params in parameters:
                params = ("%s %s") % (params, parameters[prog_params])

            ## Write the list of alignments to be compared. The file is closed
            ## before the trimming program is called to ensure its content is
            ## on disk
            path_file = ("%s.alg.paths") % (oFile)
            with open(path_file, "w") as path_handle:
                print("\n".join(generated_alignments), file=path_handle)

            trimmingAlignment(prog, binary, params, clean_file, logFile,
              parameters["replace"], compare_msa = path_file, force_refer_msa = \
              out_file)

            ## If the backtranslation to codon/nucleotides is required, do it
            if parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]:
                prog_params = ("%s_cds") % (prog)
                if prog_params in parameters:
                    params = ("%s %s") % (params, parameters[prog_params])

                trimmingAlignment(prog, binary, params, clean_file + "_cds", logFile,
                  parameters["replace"], compare_msa = path_file, force_refer_msa = \
                  out_file, cds = parameters["cds"])

        else:
            trimmingAlignment(prog,
                              binary,
                              params,
                              clean_file,
                              logFile,
                              parameters["replace"],
                              in_file=out_file)

            ## If the backtranslation to codon/nucleotides is required, do it
            if parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]:
                prog_params = ("%s_cds") % (prog)
                if prog_params in parameters:
                    params = ("%s %s") % (params, parameters[prog_params])

                trimmingAlignment(prog,
                                  binary,
                                  params,
                                  clean_file + "_cds",
                                  logFile,
                                  parameters["replace"],
                                  in_file=out_file,
                                  cds=parameters["cds"])

        ## After the trimming, set the final output file as the trimmed file
        out_file = clean_file + ("_cds" if parameters["residue_datatype"] in \
          ["prot2codon", "prot2nuc"] else "")

    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tMultipple Sequence Alignment\tEND\t" + "%s") %
          (date),
          file=logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tMultiple Sequence Alignment\t%s" + "\n###") %
          (total),
          file=logFile)

    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        ## NOTE(review): as written, '/^M/d' removes lines starting with the
        ## letter 'M'. If carriage returns were the target, the last call
        ## already covers them - confirm the intention.
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
              % (oFile), file = sys.stderr)

    ## Update the input file parameter and return the dictionary containing all
    ## parameters. Those parameters may be used in other steps
    parameters["in_file"] = out_file

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
예제 #9
0
def perfomAlignment(label, binary, parameters, in_file, out_file, logFile, \
  replace):
    '''
  Function to format the command-line of different multiple sequence alignment
  programs and execute such command lines.

  Arguments:
    label       program identifier. Supported values are "muscle", "kalign",
                "clustalw", "clustal_omega", "mafft", "prank", "dialign_tx",
                "t_coffee" and "m_coffee"
    binary      executable used to build the command-line
    parameters  string with additional options included in the command-line
    in_file     file containing the input sequences
    out_file    file where the alignment is finally stored
    logFile     open file object which receives the log lines as well as the
                stdout/stderr of the executed program
    replace     if True, the alignment is built even when out_file already
                exists

  Returns False, without executing anything, when out_file already exists and
  replace is not set. Returns True once the alignment has been generated and
  checked. The execution is finished through sys.exit when the label is not
  supported, the program fails or checkAlignment rejects the output.
  '''

    ## Check whether the output file already exists. If it is not set to replace
    ## it, just return to the calling function
    if lookForFile(out_file) and not replace:
        return False

    if label in ["muscle", "kalign"]:
        cmd = ("%s %s -in %s -out %s") % (binary, parameters, in_file,
                                          out_file)

    elif label in ["clustalw"]:
        cmd = ("%s %s -INFILE=%s -OUTFILE=%s") % (binary, parameters, in_file, \
          out_file)

    elif label in ["clustal_omega"]:
        cmd = ("%s %s --in %s --out %s") % (binary, parameters, in_file,
                                            out_file)

    ## The alignment is captured from the standard output through a shell
    ## redirection
    elif label in ["mafft"]:
        cmd = ("%s %s %s > %s") % (binary, parameters, in_file, out_file)

    elif label in ["prank"]:
        cmd = ("%s %s -d=%s -o=%s") % (binary, parameters, in_file, out_file)

    ## Starting for newer DiAlign-TX versions
    elif label in ["dialign_tx"]:
        cmd = ("%s %s %s %s") % (binary, parameters, in_file, out_file)

    ## On t-coffee case, we need to set-up some ENV variables to be able to run
    ## smoothly the program
    elif label in ["t_coffee", "m_coffee"]:

        sp.call(("mkdir -p -m0777 /tmp/tcoffee"), shell=True)
        drc = ("/tmp/tcoffee/%s") % (getuser())
        sp.call(("mkdir -p -m0777 %s") % (drc), shell=True)
        ## Assign through os.environ: it updates the process environment and,
        ## unlike os.putenv, it keeps the os.environ mapping up-to-date
        os.environ["LOCKDIR_4_TCOFFEE"] = drc
        os.environ["TMP_4_TCOFFEE"] = drc

        cmd = ("%s %s %s -outfile %s") % (binary, in_file, parameters,
                                          out_file)

    ## In any other case, finish with a generic error
    else:
        sys.exit(exit_codes["generic"])

    ## Record the time and precise command-line
    name = getfqdn()
    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")

    print(("###\n###\t%s - Alignment\t%s") % (label.upper(), date), file = \
      logFile)
    print(("###\t[%s]\tCommand-line\t%s\n###") % (name, cmd), file=logFile)
    logFile.flush()

    try:
        proc = sp.Popen(cmd, shell=True, stderr=logFile, stdout=logFile)
    except OSError as e:
        print("ERROR: Execution failed: " + str(e), file=sys.stderr)
        sys.exit(exit_codes[label])

    ## Any exit code other than 0 is considered a failure
    if proc.wait() != 0:
        print(("ERROR: Execution failed: %s [exit code != 0]") \
          % (label.upper()), file = sys.stderr)
        sys.exit(exit_codes[label])

    final = datetime.datetime.now()
    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTime\t%s\n###") % (total), file=logFile)
    logFile.flush()

    ## If we are working with PRANK, move output file - which should have a suffix
    ## depending on the output format
    if label in ["prank"]:
        if "-f=" not in parameters:
            suffix = "fas"
        elif "-f=nexus" in parameters:
            suffix = "nex"
        else:
            suffix = "phy"

        if lookForFile(out_file + ".best." + suffix):
            sp.call(("mv %s.best.%s %s") % (out_file, suffix, out_file),
                    shell=True)

    ## If any mode of t_coffee is used: t_coffee or m_coffee, we should remove the
    ## guide tree generate during the program execution. Its name is built using
    ## the input filename, without its folder and last extension, plus ".dnd"
    if label in ["t_coffee", "m_coffee"]:
        guide_tree = ".".join(os.path.split(in_file)[1].split(".")[:-1])
        sp.call(("rm -f %s.dnd") % (guide_tree), shell=True)

    ## Check the generated alignment against the input sequences. In case
    ## something goes wrong, report it and finish the current execution - the
    ## output file is kept
    if not checkAlignment(in_file, out_file):
        print(("ERROR: Check input '%s' and output '%s' alignments") % (in_file, \
          out_file), file = sys.stderr)
        print(("ERROR: Execution failed: %s [file check]") % \
          (label.upper()), file = sys.stderr)
        sys.exit(exit_codes[label])

    return True
예제 #10
0
def homology(parameters):
  ''' Run the homology search step and write the selected sequences to disk.

      The function works inside parameters["out_directory"]. It calls blast()
      or hmmer() depending on parameters["homology"][0], checks that the search
      produced any hit, filters the hits with filter_results() and then writes
      '<prefix>.seqs.md5', '<prefix>.seqs' and - only for the 'prot2codon' and
      'prot2nuc' datatypes - '<prefix>.seqs_cds'.

      parameters: dictionary holding the pipeline configuration. Keys read here
        are "out_directory", "prefix", "verbose", "replace", "step", "homology",
        "residue_datatype", "cds" and "db_file".

      Returns the same dictionary: "in_file" points to the '.seqs' file, "cds"
      points to the '.seqs_cds' file when one applies, and "replace" is set to
      True as soon as any output file has been (re)generated.

      The execution is finished through sys.exit() when the configuration is
      inconsistent, the BLAST database files are not found, no homologs are
      detected (exit code 80) or any selected sequence is missing in the input
      CDS file.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value.
  ## The null device is opened in text mode: print() writes strings and a
  ## binary stream would reject them
  if not "verbose" in parameters or parameters["verbose"] == 0:
    logFile = open(os.devnull, 'w')

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file = logFile)
  logFile.flush()

  ## Get which tool will be used to perform the homology search. Check such tool
  ## is listed among the available binaries
  if not "homology" in parameters:
    sys.exit("ERROR: Check your configuration file. There is not tool set for "
      + "the homology search")

  tool = parameters["homology"][0]
  if not tool in parameters:
    ## Interpolate the tool name - otherwise a raw '%s' reaches the user
    sys.exit(("ERROR: Check your configuration file. This tool '%s' is not among"
      + " available methods") % (tool))

  ## Check whether if an special mode has been selected - for instance
  ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
  ## If not mode is define, we will work with a datatype - normally proteins
  uses_cds = parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]

  if "cds" in parameters and not uses_cds:
    sys.exit("ERROR: To use an additional CDS file, you should set the <parame"
      + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

  if not "cds" in parameters and uses_cds:
    sys.exit("ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
      + "'prot2nuc', an input CDS file is needed")

  ## If the homology search will use any program from the BLAST package, check
  ## whether the TARGET SEQUENCES file has been already formatted.
  if tool in ["legacy_blast", "blast+"]:

    ## Get database sequence type - p: protein or n:nucleotide
    dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

    ## Check if BLAST DB associated files already exist or not
    for extension in ["hr", "in", "sq"]:
      filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)

      ## If the input file doesn't exist check whether input database has been
      ## split into different volumes
      if not lookForFile(filename):
        alternative = ("%s.00.%s%s") % (parameters["db_file"], dt, extension)
        if not lookForFile(alternative):
          db_file = parameters["db_file"]
          sys.exit(("ERROR: Check your input TARGET SEQUENCES file '%s' has "
            + "been formated using 'formatdb'/'makeblastdb'") % (db_file))

    ## If the homology search step should be perfomed using BLAST, call the
    ## appropiate function
    blast(parameters, logFile)
    tag = "blast"

  elif tool in ["phmmer", "jackhmmer", "hmmer_search"]:
    hmmer(parameters, logFile)
    ## Set the tag for the output files
    tag = "hmmer"

  ## Any other tool would leave 'tag' undefined - stop with a clear message
  else:
    sys.exit(("ERROR: Check your configuration file. This tool '%s' is not "
      + "supported for the homology search") % (tool))

  ## Check whether the output file contains any result. Lines which are empty or
  ## start by "#" are not counted
  homologs = 0
  inFile = ("%s.homology.%s.out") % (oFile, tag)
  ## "r" already provides universal newlines; the "U" flag is rejected by
  ## recent Python versions
  with open(inFile, "r") as handle:
    for line in handle:
      if not line.strip() or line.startswith("#"):
        continue
      homologs += 1
  if not homologs:
    print(("INFO: NO Homologous sequences found for '%s'") % \
      parameters["prefix"], file = sys.stderr)
    sys.exit(80)

  ## Filter homology search data. A dictionary containing selected sequences,
  ## including the sequences themselves
  selected_sequences = filter_results(parameters, logFile)

  ## Generate a MD5 file containing selected sequences for the current run.
  ## MD5s are used to recompute the same phylogenetic tree starting from other
  ## seqs - with identical similarity search results - in the set of homologs
  outFile = ("%s.seqs.md5") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    ## NOTE(review): assumes 'md5' is hashlib.md5, which only accepts bytes
    identifiers = "".join(sorted(selected_sequences.keys()))
    seqs_md5 = md5(identifiers.encode("utf-8")).hexdigest()
    with open(outFile, "w") as output_file:
      print(("%s\t%s") % (parameters["prefix"], seqs_md5), file = output_file)

  ## Generate a file containing the selected sequences after performing the
  ## homology search and filtering its output according to a set of parameters.
  outFile = ("%s.seqs") % (oFile)

  ## Check whether the file already exists or not.
  if not lookForFile(outFile) or parameters["replace"]:
    parameters["replace"] = True

    with open(outFile, "w") as output_file:
      for seqId in sorted(selected_sequences):
        print((">%s\n%s") % (seqId, selected_sequences[seqId][1]), file = \
          output_file)

  ## If a CDS input file is set, use it to associate to homologous protein
  ## sequences their corresponding CDS
  if uses_cds:
    cdsFile = ("%s.seqs_cds") % (oFile)

    ## Check whether the file already exists or not.
    if not lookForFile(cdsFile) or parameters["replace"]:
      parameters["replace"] = True

      found = set()
      with open(cdsFile, "w") as output_file:
        for record in SeqIO.parse(parameters["cds"], "fasta"):
          if not record.id in selected_sequences:
            continue
          seq = splitSequence(str(record.seq))
          print((">%s\n%s") % (record.id, seq), file = output_file)
          found.add(record.id)

      ## Every selected sequence must have its CDS counterpart
      missing = set(selected_sequences.keys()) - found
      if missing:
        missed = ",".join(sorted(missing))
        sys.exit(("ERROR: Check your input CDS file '%s'. Impossible to find "
          "homologs sequences [missing:'%s']") % (parameters["cds"], missed))

  ## Print how much time was needed to perform the whole homology search step
  final = datetime.datetime.now()
  date  = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file = logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file = logFile)

  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file = sys.stderr)

  ## Update the input file parameter and return the dictionary containing all
  ## parameters. Those parameters may be used in other steps
  parameters["in_file"] = outFile

  ## Update the associate CDS file with the resulting cds file. It will be used
  ## to make the back-translation in a hypothetical MSA step
  if uses_cds:
    parameters["cds"] = ("%s.seqs_cds") % (oFile)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
예제 #11
0
def homology(parameters):
    '''
    Run the homology search step and write the selected sequences to disk.

    The function works inside parameters["out_directory"]. It calls blast() or
    hmmer() depending on parameters["homology"][0], checks that the search
    produced any hit, filters the hits with filter_results() and then writes
    '<prefix>.seqs.md5', '<prefix>.seqs' and - only for the 'prot2codon' and
    'prot2nuc' datatypes - '<prefix>.seqs_cds'.

    parameters: dictionary holding the pipeline configuration. Keys read here
      are "out_directory", "prefix", "verbose", "replace", "step", "homology",
      "residue_datatype", "cds" and "db_file".

    Returns the same dictionary: "in_file" points to the '.seqs' file, "cds"
    points to the '.seqs_cds' file when one applies, and "replace" is set to
    True as soon as any output file has been (re)generated.

    The execution is finished through sys.exit() when the configuration is
    inconsistent, the BLAST database files are not found, no homologs are
    detected (exit code 80) or any selected sequence is missing in the input
    CDS file.
    '''

    ## Get output folder/generic filename
    oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

    current_directory = os.getcwd()
    ## Change current directory to the output folder. Any temporary file will be
    ## generated therefore in this folder
    os.chdir(parameters["out_directory"])

    ## Depending on the verbosity level - set the appropriate logfile value.
    ## The null device is opened in text mode: print() writes strings and a
    ## binary stream would reject them
    if not "verbose" in parameters or parameters["verbose"] == 0:
        logFile = open(os.devnull, 'w')

    ## ALL/logfile
    elif parameters["verbose"] == 1:
        ## Set output filename and log file
        mode = "w" if parameters["replace"] and parameters[
            "step"] == 0 else "a+"
        logFile = open(oFile + ".log", mode)

    ## ALL/Stderr
    elif parameters["verbose"] == 2:
        logFile = sys.stderr

    start = datetime.datetime.now()
    date = start.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tHomology\tSTART\t%s\n###") % (date), file=logFile)
    logFile.flush()

    ## Get which tool will be used to perform the homology search. Check such tool
    ## is listed among the available binaries
    if not "homology" in parameters:
        sys.exit(
            "ERROR: Check your configuration file. There is not tool set for "
            + "the homology search")

    tool = parameters["homology"][0]
    if not tool in parameters:
        ## Interpolate the tool name - otherwise a raw '%s' reaches the user
        sys.exit((
            "ERROR: Check your configuration file. This tool '%s' is not among"
            + " available methods") % (tool))

    ## Check whether if an special mode has been selected - for instance
    ## "prot2codon" or "prot2nuc" - and a CDS file has been defined
    ## If not mode is define, we will work with a datatype - normally proteins
    uses_cds = parameters["residue_datatype"] in ["prot2codon", "prot2nuc"]

    if "cds" in parameters and not uses_cds:
        sys.exit(
            "ERROR: To use an additional CDS file, you should set the <parame"
            + "ter> 'residue_datatype' to either 'prot2codon' or 'prot2nuc'")

    if not "cds" in parameters and uses_cds:
        sys.exit(
            "ERROR: When 'residue_datatype' is set to either 'prot2codon' or "
            + "'prot2nuc', an input CDS file is needed")

    ## If the homology search will use any program from the BLAST package, check
    ## whether the TARGET SEQUENCES file has been already formatted.
    if tool in ["legacy_blast", "blast+"]:

        ## Get database sequence type - p: protein or n:nucleotide
        dt = "p" if parameters["residue_datatype"].startswith("prot") else "n"

        ## Check if BLAST DB associated files already exist or not
        for extension in ["hr", "in", "sq"]:
            filename = ("%s.%s%s") % (parameters["db_file"], dt, extension)

            ## If the input file doesn't exist check whether input database has been
            ## split into different volumes
            if not lookForFile(filename):
                alternative = ("%s.00.%s%s") % (parameters["db_file"], dt,
                                                extension)
                if not lookForFile(alternative):
                    db_file = parameters["db_file"]
                    sys.exit((
                        "ERROR: Check your input TARGET SEQUENCES file '%s' has "
                        + "been formated using 'formatdb'/'makeblastdb'") %
                             (db_file))

        ## If the homology search step should be perfomed using BLAST, call the
        ## appropiate function
        blast(parameters, logFile)
        tag = "blast"

    elif tool in ["phmmer", "jackhmmer", "hmmer_search"]:
        hmmer(parameters, logFile)
        ## Set the tag for the output files
        tag = "hmmer"

    ## Any other tool would leave 'tag' undefined - stop with a clear message
    else:
        sys.exit((
            "ERROR: Check your configuration file. This tool '%s' is not "
            + "supported for the homology search") % (tool))

    ## Check whether the output file contains any result. Lines which are empty
    ## or start by "#" are not counted
    homologs = 0
    inFile = ("%s.homology.%s.out") % (oFile, tag)
    ## "r" already provides universal newlines; the "U" flag is rejected by
    ## recent Python versions
    with open(inFile, "r") as handle:
        for line in handle:
            if not line.strip() or line.startswith("#"):
                continue
            homologs += 1
    if not homologs:
        print(("INFO: NO Homologous sequences found for '%s'") % \
          parameters["prefix"], file = sys.stderr)
        sys.exit(80)

    ## Filter homology search data. A dictionary containing selected sequences,
    ## including the sequences themselves
    selected_sequences = filter_results(parameters, logFile)

    ## Generate a MD5 file containing selected sequences for the current run.
    ## MD5s are used to recompute the same phylogenetic tree starting from other
    ## seqs - with identical similarity search results - in the set of homologs
    outFile = ("%s.seqs.md5") % (oFile)

    ## Check whether the file already exists or not.
    if not lookForFile(outFile) or parameters["replace"]:
        parameters["replace"] = True

        ## NOTE(review): assumes 'md5' is hashlib.md5, which only accepts bytes
        identifiers = "".join(sorted(selected_sequences.keys()))
        seqs_md5 = md5(identifiers.encode("utf-8")).hexdigest()
        with open(outFile, "w") as output_file:
            print(("%s\t%s") % (parameters["prefix"], seqs_md5),
                  file=output_file)

    ## Generate a file containing the selected sequences after performing the
    ## homology search and filtering its output according to a set of parameters.
    outFile = ("%s.seqs") % (oFile)

    ## Check whether the file already exists or not.
    if not lookForFile(outFile) or parameters["replace"]:
        parameters["replace"] = True

        with open(outFile, "w") as output_file:
            for seqId in sorted(selected_sequences):
                print((">%s\n%s") % (seqId, selected_sequences[seqId][1]),
                      file=output_file)

    ## If a CDS input file is set, use it to associate to homologous protein
    ## sequences their corresponding CDS
    if uses_cds:
        cdsFile = ("%s.seqs_cds") % (oFile)

        ## Check whether the file already exists or not.
        if not lookForFile(cdsFile) or parameters["replace"]:
            parameters["replace"] = True

            found = set()
            with open(cdsFile, "w") as output_file:
                for record in SeqIO.parse(parameters["cds"], "fasta"):
                    if not record.id in selected_sequences:
                        continue
                    seq = splitSequence(str(record.seq))
                    print((">%s\n%s") % (record.id, seq), file=output_file)
                    found.add(record.id)

            ## Every selected sequence must have its CDS counterpart
            missing = set(selected_sequences.keys()) - found
            if missing:
                missed = ",".join(sorted(missing))
                sys.exit((
                    "ERROR: Check your input CDS file '%s'. Impossible to find "
                    "homologs sequences [missing:'%s']") %
                         (parameters["cds"], missed))

    ## Print how much time was needed to perform the whole homology search step
    final = datetime.datetime.now()
    date = final.strftime("%H:%M:%S %m/%d/%y")
    print(("###\n###\tSTEP\tHomology\tEND\t%s") % (date), file=logFile)

    ## We return a DELTA object comparing both timestamps
    total = format_time(final - start if start else 0)
    print(("###\tTOTAL Time\tHomology\t%s\n###") % (total), file=logFile)

    ## We just close logfile and clean it up when it is a file
    if "verbose" in parameters and parameters["verbose"] == 1:
        logFile.close()

        ## Clean-up log directory from undesirable lines
        try:
            sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell=True)
            sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell=True)
        except OSError:
            print(("ERROR: Impossible to clean-up '%s.log' log file") \
              % (oFile), file = sys.stderr)

    ## Update the input file parameter and return the dictionary containing all
    ## parameters. Those parameters may be used in other steps
    parameters["in_file"] = outFile

    ## Update the associate CDS file with the resulting cds file. It will be used
    ## to make the back-translation in a hypothetical MSA step
    if uses_cds:
        parameters["cds"] = ("%s.seqs_cds") % (oFile)

    ## Before returning to the main program, get back to the original working
    ## directory
    os.chdir(current_directory)

    return parameters
예제 #12
0
def phylogenetic_trees(parameters):
  ''' Reconstruct phylogenetic trees according to the input parameters.

      For each tree approach - "nj" first, then "ml", then any other one sorted
      alphabetically - a tree is built through perform_tree() for every selected
      evolutionary model. Models are ranked by the likelihood returned by
      get_likelihood(), the ranking is stored in
      '<prefix>.tree.<program>.rank.<approach>' and the best
      parameters["numb_models"] models are the ones evaluated by the next
      approach.

      parameters: dictionary holding the pipeline configuration. Keys read here
        include "out_directory", "prefix", "verbose", "replace", "step", "tree",
        "evol_models", "numb_models", "readal", "min_seqs", "tree_approach",
        "in_file", "raxml_model_suffix", the selected program name,
        '<program>_params' and one optional entry per approach.

      Returns the same dictionary. "evol_models", "numb_models" and
      "tree_approach" are normalized in place; "in_file" and "replace" may be
      updated when the input alignment has to be converted.

      The execution is finished through sys.exit() when the configuration is not
      valid, there are fewer sequences than required (exit code 80), the tree
      reconstruction program is not supported or its likelihood can't be
      recovered.
  '''

  ## Get output folder/generic filename
  oFile = os.path.join(parameters["out_directory"], parameters["prefix"])

  current_directory = os.getcwd()
  ## Change current directory to the output folder. Any temporary file will be
  ## generated therefore in this folder
  os.chdir(parameters["out_directory"])

  ## Depending on the verbosity level - set the appropriate logfile value.
  ## The null device is opened in text mode: print() writes strings and a
  ## binary stream would reject them
  if not "verbose" in parameters or parameters["verbose"] == 0:
    logFile = open(os.devnull, 'w')

  ## ALL/logfile
  elif parameters["verbose"] == 1:
    ## Set output filename and log file
    mode = "w" if parameters["replace"] and parameters["step"] == 0 else "a+"
    logFile = open(oFile + ".log", mode)

  ## ALL/Stderr
  elif parameters["verbose"] == 2:
    logFile = sys.stderr

  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tSTART\t"
    + "%s\n###") % (date), file=logFile)
  logFile.flush()

  ## Get which program will be used to reconstruct phylogenetic trees. Check
  ## such program is listed among the available binaries
  if not "tree" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the Phylogenetic TREE reconstruction step")

  prog = parameters["tree"][0]
  if not prog in parameters:
    sys.exit(("ERROR: Selected program '%s' is not available accordding to the "
      "the configuration file") % (prog))

  ## Get binary as well as any default parameters for the selected program
  binary = parameters[prog]
  key = ("%s_params") % (prog)
  progr_params = parameters[key] if key in parameters else ""

  if not "evol_models" in parameters:
    sys.exit("ERROR: Check your configuration file. There is no definition for "
      + "the <evol_models> parameter")

  ## If the evolutionary model list is not appropiately formated, do it
  if isinstance(parameters["evol_models"], str):
    parameters["evol_models"] = list(map(strip, parameters["evol_models"].split()))

  ## Check if <numb_models parameters is defined and how many models are
  ## requested to be evaluated. The value is casted to string before comparing
  ## it: this function stores it back as an integer, so it may already be one
  if not "numb_models" in parameters or \
    str(parameters["numb_models"]).lower() == "all":
    parameters["numb_models"] = len(parameters["evol_models"])
  parameters["numb_models"] = int(parameters["numb_models"])

  if not parameters["numb_models"] in range(1,len(parameters["evol_models"])+1):
    sys.exit(("ERROR: Check how many evolutionary models has been asked to re"
      + "construct '%d'") % (parameters["numb_models"]))

  ## Check whether "readAl" is available or not. It is useful for sequences
  ## manipulation independently of the input format.
  if not "readal" in parameters:
    sys.exit("ERROR: Check your CONFIG file. 'readAl' is not available")

  ## Create a temporary FASTA file which will be used to detect the sequence
  ## number on the input alignment and the presence of rare amino-acids
  TEMPFILE = tempfile.NamedTemporaryFile()
  convertInputFile_Format("readal", parameters["readal"], parameters["in_file"],
    TEMPFILE.name, "fasta", logFile, parameters["replace"])
  TEMPFILE.flush()

  numSeqs, selenocys, pyrrolys = check_count_sequences(TEMPFILE.name)

  ## Set the minimum number of sequences required to reconstruct an alignment
  min_seqs = int(parameters["min_seqs"] if "min_seqs" in parameters else \
    min_seqs_analysis)

  ## Finish when there are not enough sequences to make an alignment
  if numSeqs < min_seqs:
    print(("### INFO: It is necessary, at least, %d sequences to "
      + "to reconstruct an alignment (%d)") % (min_seqs, numSeqs), file=logFile)
    sys.exit(80)

  ## Check which approaches should be used for the phylogenetic reconstruction
  ## and whether there are specific program's parameters for them
  if not "tree_approach" in parameters:
    parameters["tree_approach"] = ["ml"]

  ## Remove potential duplicates and lowercase all approaches for the tree
  ## reconstruction
  parameters["tree_approach"] = set([p.lower() for p in \
    parameters["tree_approach"]])

  ## We will first loot for Neighbour Joining tree reconstruction, then for
  ## Maximum likelihood and then for any other approach defined in the config
  ## file
  tree_approaches = []
  if "nj" in parameters["tree_approach"]:
    tree_approaches.append("nj")
  if "ml" in parameters["tree_approach"]:
    tree_approaches.append("ml")
  tree_approaches += sorted(parameters["tree_approach"] - set(["nj", "ml"]))

  ## When using RAxML, it may crash when Selenocysteines or Pyrrolysines are
  ## present in the input alignment
  if prog in ["raxml"]:
    ## If Selenocysteines or Pyrrolysines are present, substitute them by "X"
    if selenocys or pyrrolys:
      out_file = ("%s.no_rare_aa") % (parameters["in_file"])

      if replaceRareAminoAcids(TEMPFILE.name, out_file, parameters["replace"],
        logFile, "U:X O:X"):
        parameters["replace"] = True
      parameters["in_file"] = out_file

  ## The temporary FASTA file is not needed anymore - release it whichever
  ## program has been selected
  TEMPFILE.close()

  ## When using FastTree force the conversion of input alignment to FASTA format
  ## since it may crash reading standard interleave PHYLIP format files
  if prog in ["fasttree"]:

    in_file_format, aligned = getFileFormat("readal", parameters["readal"], \
      parameters["in_file"], logFile)

    if in_file_format != "fasta":
      out_file = ("%s.fa") % (parameters["in_file"])
      if (convertInputFile_Format("readal", parameters["readal"], \
        parameters["in_file"], out_file, "fasta", logFile,
        parameters["replace"])):
        parameters["replace"] = True
      parameters["in_file"] = out_file

  replace = parameters["replace"]
  selected_models = parameters["evol_models"]
  ## Reconstruct trees for each approach considering evolutionary models order
  ## according their likelihood values
  for approach in tree_approaches:

    ## Save results - we will use such data for selecting the best -if required-
    ## models fitting to the input data
    results = {}

    ## Format the choosen program's parameters according to the default ones and
    ## the specific ones for the current approach
    params = ("%s ") % (progr_params)
    params += parameters[approach] if approach in parameters else ""

    for model in selected_models:
      out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
      stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

      if prog in ["phyml"]:
        exec_params = ("%s -m %s") % (params, model)

      ## Get additional model -if any- for codons
      elif prog in ["codonphyml"]:
        exec_params = ("%s -m %s") % (params, model)

        add_model = [p.split()[1] for p in map(strip, exec_params.split("-")) \
          if p.startswith("fmodel")]

        if len(add_model) == 1:
          add_model = add_model.pop()
          model = ("%s_%s") % (model, add_model)
          out_file = ("%s.tree.%s.%s.%s.nw") % (oFile, prog, approach, model)
          stats_file = ("%s.tree.%s.%s.%s.st") % (oFile, prog, approach, model)

      elif prog in ["fasttree"]:
        ## On FastTree is selected by default JTT model for AAs - so we don't
        ## set-up that model
        exec_params = ("%s -%s") % (params, model) if model.lower() != "jtt" \
          and model.lower() != "jc" else params
        model = model.upper()

      ## In the case of RAxML, we would concatenate the model to an specific
      ## input parameter
      elif prog in ["raxml"]:
        final_model = model
        ## It is possible to add some suffixes to the evolutionary models
        ## in RAxML - There is not better/easy way to code this option
        if "raxml_model_suffix" in parameters:
          final_model += parameters["raxml_model_suffix"]
        exec_params = " ".join([("-%s%s") %(p, final_model if p.startswith("m ")
          else "") for p in map(strip, params.split("-")) if p])

      ## Any other program is not supported - finish as perform_tree does
      ## instead of failing later on an undefined command-line
      else:
        sys.exit(exit_codes["generic"])

      ## Build the phylogenetic tree using any of the available methods and
      ## register if any downstream file should be redone.
      if perform_tree(prog, binary, exec_params, parameters["in_file"],
        out_file, stats_file, logFile, parameters["replace"]):
          replace = True

      ## Get the likelihood for each of the reconstructed models
      log_lk = get_likelihood(prog, stats_file)

      if not log_lk:
        print(("ERROR: Impossible to the Log likelihood values "
          + "for '%s' model using this program '%s'") % (model, prog), file = \
          sys.stderr)
        sys.exit(exit_codes[prog])

      results.setdefault(model, log_lk)

    ## Get the models sorted by their likelihood values
    records = sorted(results.items(), key = itemgetter(1), reverse = True)

    ## Ranking as it is/would be stored in the ranking file
    newly_generated = "\n".join([("%s\t%s") % (r[0], r[1]) for r in records])

    ## Set the filename which stores the ranking
    rank_file = ("%s.tree.%s.rank.%s") % (oFile, prog, approach)

    update = False
    ## Check the content of the rankings file - if any.
    ## Marked the file as updatable if there is any discrepancy
    if not replace and lookForFile(rank_file):

      ## "r" already provides universal newlines; the "U" flag is rejected by
      ## recent Python versions
      with open(rank_file, "r") as rank_handle:
        old_content = "\n".join(["\t".join(list(map(strip, line.split("\t"))))
          for line in rank_handle])

      ## Decide whether ranking file should be updated after comparing current
      ## content with newly generated content
      update = old_content != newly_generated

    ## If the file containing the ranking doesn't exist, generate it.
    ## Update the file content if the replace flag is set to true or the content
    ## has changed - since the phylogenetic tree reconstruction step is the most
    ## expensive one - in terms of time/memory consumption - we are not setting
    ## replace flag to True even when this file is generated/updated. On this
    ## way, we can take adventage of any tree generated in any downstream step.
    if not lookForFile(rank_file) or replace or update:

      with open(rank_file, "w") as rank_handle:
        print(newly_generated, file = rank_handle)

      ## We could set the replace flag to True. However, if any tree has been
      ## generated 'de novo' during this iteration, then the flag is already set
      ## to True.
      #~ parameters["replace"] = True

    ## Select a given number of models for the next iteration - if any
    selected_models = [pair[0] for pair in records[:parameters["numb_models"]]]

    ## Remove the Codon Frequency model from potential new iterations
    if prog in ["codonphyml"] and add_model:
      selected_models = [m.replace("_"+ add_model, "") for m in selected_models
        if m.endswith(add_model)]

  final = datetime.datetime.now()
  date = final.strftime("%H:%M:%S %m/%d/%y")
  print(("###\n###\tSTEP\tPhylogenetic Tree Reconstruction\tEND\t"
    + "%s") % (date), file=logFile)

  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTOTAL Time\tPhylogenetic Tree Reconstruction\t%s"
    + "\n###") % (total), file=logFile)
  ## We just close logfile and clean it up when it is a file
  if "verbose" in parameters and parameters["verbose"] == 1:
    logFile.close()

    ## Clean-up log directory from undesirable lines
    try:
      sp.call(("sed -i '/^$/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/^M/d' %s.log") % (oFile), shell = True)
      sp.call(("sed -i '/\r/d' %s.log") % (oFile), shell = True)
    except OSError:
      print(("ERROR: Impossible to clean-up '%s.log' log file") \
        % (oFile), file=sys.stderr)

  ## Before returning to the main program, get back to the original working
  ## directory
  os.chdir(current_directory)

  return parameters
예제 #13
0
def perform_tree(label, binary, parameters, in_file, out_file, stats_file, \
  logFile, replace):

  '''
  Function to format the command-line of different phylogenetic tree reconstruc-
  tion programs and execute such command lines.

  label: program identifier - "phyml", "codonphyml", "fasttree" or "raxml".
  binary: program executable placed at the beginning of the command-line.
  parameters: string with the program's arguments, appended as it is.
  in_file: input alignment.
  out_file: file where the resulting tree is stored.
  stats_file: file where the program's statistics/log are stored.
  logFile: opened stream getting the command-line, timing and program output.
  replace: when it is not set and 'out_file' is found, nothing is executed.

  Returns False when the execution is skipped because 'out_file' already
  exists, True once the program has been run and its output files renamed.

  The execution is finished through sys.exit() when 'label' is not supported,
  the program can't be launched or it returns a non-zero exit status.
  '''

  ## Check whether the output file already exists. If it is not set to replace
  ## it, just return to the calling function
  if lookForFile(out_file) and not replace:
    return False

  if label in ["phyml", "codonphyml"]:
    cmd = ("%s -i %s %s") % (binary, in_file, parameters)

  elif label in ["fasttree"]:
    cmd = ("%s %s -log %s -out %s %s") % (binary, parameters, stats_file, \
      out_file, in_file)

  elif label in ["raxml"]:
    ## The random seed is also used to tag RAxML output files
    random_seed = randint(1, 10000)
    suffix = ("%s_%d") % (label, random_seed)

    cmd = ("%s -n %s -p %d -s %s %s") % (binary, suffix, random_seed, in_file, \
      parameters)

  else:
    sys.exit(exit_codes["generic"])

  ## Record the time and precise command-line
  name = getfqdn()
  start = datetime.datetime.now()
  date = start.strftime("%H:%M:%S %m/%d/%y")

  print(("###\n###\t%s - Phylogenetic Trees\t") % (label.upper()), end = ' ', \
    file = logFile)
  print(("%s\n###\t[%s]\tCommand-line\t%s\n###") % (date, name, cmd), file = \
    logFile)
  logFile.flush()

  try:
    ## We add a small pipeline to avoid informatin written in the same line
    proc = sp.Popen(cmd, shell = True, stderr = logFile, stdout = logFile,
      stdin = sp.PIPE)
  except OSError as e:
    print("ERROR: Execution failed: " + str(e), file=sys.stderr)
    sys.exit(exit_codes[label])

  ## Feed the answers through standard input and wait for the program to finish
  proc.communicate(b'\n\nY\n')

  if proc.wait() != 0:
    print(("ERROR: Execution failed: %s") % (label.upper()), file = sys.stderr)
    sys.exit(exit_codes[label])

  final = datetime.datetime.now()
  ## We return a DELTA object comparing both timestamps
  total = format_time(final - start if start else 0)
  print(("###\tTime\t%s\n###") % (total), file=logFile)
  logFile.flush()

  ## Process program's output and rename output files according to our own
  ## scheme
  if label in ["phyml", "codonphyml"]:

    ## Since resulting tree/stats file have slightly changed between version,
    ## we have to control for that.
    tree_file = ("%s_%s_tree.txt") % (in_file, label)
    sts_file = ("%s_%s_stats.txt") % (in_file, label)
    if not lookForFile(tree_file, attempts = 2):
      tree_file = ("%s_%s_tree") % (in_file, label)
      sts_file = ("%s_%s_stats") % (in_file, label)

    try:
      sp.call(("mv %s %s") % (tree_file, out_file), shell = True)
      sp.call(("mv %s %s") % (sts_file, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename '%s' output files") \
        % (label.upper()), file=sys.stderr)
      sys.exit(exit_codes[label])

  elif label in ["raxml"]:
    try:
      sp.call(("mv RAxML_bestTree.%s %s") % (suffix, out_file), shell = True)
      sp.call(("mv RAxML_info.%s %s") % (suffix, stats_file), shell = True)
    except OSError:
      print(("ERROR: Impossible to rename RAxML output files"), file = \
        sys.stderr)
      sys.exit(exit_codes[label])

    ## Append any other file tagged with the run suffix to the stats file, under
    ## a banner with its name, and remove it afterwards
    with open(stats_file, "a+") as oFile:
      for oth_file in listDirectory(os.path.split(stats_file)[0], suffix):
        fileName = os.path.split(oth_file)[1]
        hz_line = "#" * (len(fileName) + 4)
        print(("%s\n%s\n%s") % (hz_line, fileName, hz_line), file = oFile)
        ## "r" already provides universal newlines; the "U" flag is rejected
        ## by recent Python versions
        with open(oth_file, "r") as oth_handle:
          print(oth_handle.read(), file = oFile)
        sp.call(("rm -f %s") % (oth_file), shell = True)

  return True