def mcl(inflation_list, dest, mcl_file="mcl", nm=None):

    print_col("Running mcl algorithm", GREEN, 1)

    mcl_input = join(dest, "backstage_files", "mclInput")
    mcl_output = join(dest, "backstage_files", "mclOutput_")

    for val in inflation_list:

        mcl_cmd = [mcl_file, mcl_input, "--abc", "-I", val,
                   "-o", mcl_output + val.replace(".", "")]

        if nm:
            # The subprocess.Popen handler cannot be passed directly in
            # Windows due to pickling issues. So I pass the pid of the
            # process instead.
            subp = subprocess.Popen(mcl_cmd)
            nm.subp = subp.pid
            subp.wait()
            nm.subp = None
        else:
            subprocess.Popen(mcl_cmd).wait()
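# Usage sketch (illustrative only; the run directory and inflation values
# below are hypothetical). For an inflation value of "1.5", the loop above
# spawns the equivalent of:
#
#   mcl <dest>/backstage_files/mclInput --abc -I 1.5 \
#       -o <dest>/backstage_files/mclOutput_15
#
# mcl(["1.5", "4"], "/path/to/run_dir", mcl_file="mcl")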
def mcl_groups(inflation_list, mcl_prefix, start_id, group_file, dest,
               nm=None):

    print_col("Dumping groups", GREEN, 1)

    # Create a results directory
    results_dir = join(dest, "Orthology_results")
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    mcl_output = join(dest, "backstage_files", "mclOutput_")

    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = len(inflation_list)
        nm.counter = 0

    for val in inflation_list:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1

        MclGroups.mcl_to_groups(
            mcl_prefix,
            start_id,
            mcl_output + val.replace(".", ""),
            os.path.join(results_dir, group_file + "_" + str(val) + ".txt"),
            nm=nm)
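# File-naming sketch (hypothetical values): with inflation "1.5", prefix
# "Ortholog", start id 1 and group_file "groups", the call above reads
# <dest>/backstage_files/mclOutput_15 and writes
# <dest>/Orthology_results/groups_1.5.txt, with clusters labelled
# Ortholog1, Ortholog2, ... (the exact label format is defined by
# MclGroups.mcl_to_groups, which is not shown here).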
def filter_fasta(min_len, max_stop, db, dest, nm=None):

    print_col("Filtering proteome files", GREEN, 1)

    cp_dir = join(dest, "backstage_files", "compliantFasta")

    FilterFasta.orthomcl_filter_fasta(cp_dir, min_len, max_stop, db, dest,
                                      nm)
def allvsall_usearch(goodproteins, evalue, dest, cpus, usearch_outfile,
                     usearch_bin="usearch", nm=None):

    print_col("Performing USEARCH All-vs-All (may take a while...)", GREEN, 1)

    # FNULL = open(os.devnull, "w")
    usearch_cmd = [
        usearch_bin,
        "-ublast", join(dest, "backstage_files", goodproteins),
        "-db", join(dest, "backstage_files", goodproteins),
        "-blast6out", join(dest, "backstage_files", usearch_outfile),
        "-evalue", str(evalue),
        "--maxaccepts", "0",
        "-threads", str(cpus)
    ]

    if nm:
        # The subprocess.Popen handler cannot be passed directly in Windows
        # due to pickling issues. So I pass the pid of the process instead.
        subp = subprocess.Popen(usearch_cmd)
        nm.subp = subp.pid
        subp.wait()
        nm.subp = None
    else:
        subprocess.Popen(usearch_cmd).wait()
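# Command sketch (illustrative; paths and the database file name are
# hypothetical). With the default e-value of 1e-5 and 4 CPUs, the list
# above resolves to the equivalent of:
#
#   usearch -ublast <dest>/backstage_files/goodProteins \
#       -db <dest>/backstage_files/goodProteins \
#       -blast6out <dest>/backstage_files/AllVsAll.out \
#       -evalue 1e-05 --maxaccepts 0 -threads 4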
def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            raise KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:

        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]
        code_name = "_".join(code_name.split())

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        try:
            unique_id = check_unique_field(proteome, True, nm)
        except Exception:
            print_col("The file {} could not be parsed".format(proteome),
                      YELLOW, 1)
            # TODO: Log errors on file
            continue

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, dest, nm=nm)

        proteome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
            ".fasta"
        proteome_file_name = "_".join(proteome_file_name.split())

        pfile = basename(proteome.split(".")[0] + "_mod.fas")

        shutil.move(join(dest, "backstage_files", pfile),
                    join(cf_dir, proteome_file_name))

    json_f = join(dest, "backstage_files", "header_mapping.json")
    header_f = join(dest, "backstage_files", "header_mapping.csv")
    if os.path.exists(json_f):
        with open(json_f) as fh, open(header_f, "w") as ofh:
            header_map = json.load(fh)
            for k, v in header_map.items():
                ofh.write("{}; {}\n".format(k, v))
def main_check():
    """
    Performs sanity checks on argument combinations
    """

    if arg.protein2dna and arg.infile:
        print_col("Group file operations are ignored when specifying "
                  "conversion options.", YELLOW, 3)

    # Check if protein and CDS databases are provided when required
    if arg.groups2fasta and not arg.protein_db:
        print_col("A protein database must be provided to convert group "
                  "files into sequence files using the --protein-db option. "
                  "Exiting.", RED, 3)

    if arg.protein2dna and (not arg.protein_db and not arg.dna_db):
        print_col("A CDS database and protein sequence files must be "
                  "provided to convert protein sequences into nucleotide "
                  "sequences using the --cds-db and --protein-db options, "
                  "respectively. Exiting.", RED, 3)

    # Print warnings when trying to execute options that are not available
    # to multiple input group files
    if len(arg.infile) > 1:
        if arg.groups2fasta or arg.protein2dna:
            print_col("Conversion options are only available for single "
                      "group file input.", YELLOW, 3)

    if arg.groups2fasta and not arg.gn_threshold and not arg.sp_threshold:
        print_col("No filters have been specified for the conversion of "
                  "group files into protein sequences. This may result in a "
                  "very large number of output files.", YELLOW, 3)
def blast_parser(usearch_output, dest, db_dir, nm):

    print_col("Parsing BLAST output", GREEN, 1)

    BlastParser.orthomcl_blast_parser(
        join(dest, "backstage_files", usearch_output),
        join(dest, "backstage_files", "compliantFasta"),
        db_dir,
        nm)
def dump_pairs(db_dir, dest, nm=None):

    print_col("Dumping files from the database produced by the "
              "orthomclPairs program", GREEN, 1)

    dump_pairs_sqlite.execute(db_dir, dest, nm=nm)
def install_schema(db_dir):
    """
    Installs the schema for the sqlite database

    :param db_dir: string, directory for the sqlite database
    """

    print_col("Creating sqlite database", GREEN, 1)

    install_sqlite.execute(db_dir)
def prep_fasta(proteome_file, code, unique_id, dest, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Get json with header mappings, if it exists
    json_f = join(dest, "backstage_files", "header_mapping.json")
    if os.path.exists(json_f):
        with open(json_f) as fh:
            header_mapping = json.load(fh)
    else:
        header_mapping = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    pfile = basename(proteome_file.split(".")[0] + "_mod.fas")
    file_out_path = join(dest, "backstage_files", pfile)
    file_out = open(file_out_path, "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                header_mapping["%s|%s" % (code, unique_str)] = line.strip()
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            file_out.write(line)

    # Close file handles
    file_in.close()
    file_out.close()

    with open(json_f, "w") as fh:
        json.dump(header_mapping, fh)
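# Header-rewriting sketch (hypothetical input): with code "HoSap" and
# unique_id 1, a header such as
#
#   >gi|12345|ref|NP_000001.1| some protein
#
# is written to the *_mod.fas file as
#
#   >HoSap|12345
#
# and header_mapping.json gains the entry
# {"HoSap|12345": ">gi|12345|ref|NP_000001.1| some protein"}.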
def check_unique_field(proteome_file, verbose=False, nm=None):
    """
    Checks the original proteome file for a field in the fasta header
    that is unique to all sequences
    """

    # Some files may have utf8 encoding problems, so codecs is used here
    file_handle = codecs.open(proteome_file, "r", "cp1252")

    header_list = []
    header = ""

    for line in file_handle:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            header = line[1:].strip()
            # Store header in list format
            header_list.append(header.split("|"))

    # Get the size of the header fields
    header_field_size = len(header.split("|"))

    for i in range(header_field_size):

        if nm:
            if nm.stop:
                raise KillByUser("")

        temp_list = []
        for header in header_list:
            temp_list.append(header[i])

        if len(temp_list) == len(set(temp_list)) and \
                len(set(temp_list)) == len(header_list):

            # Note that the returned index is 0-based, unlike the 1-based
            # field numbers used by the original orthoMCL tools
            if verbose:
                print_col("\t Using unique header field {}".format(i),
                          GREEN, 1)
            return i

    # Ideally, a unique field should be found before this point. If not,
    # raise an exception
    raise NoUniqueField("The proteome file {} has no unique field".format(
        os.path.basename(proteome_file)))
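# Field-selection sketch (hypothetical headers): for a proteome containing
#
#   >sp|P12345|ALBU_HUMAN
#   >sp|P67890|INS_HUMAN
#
# field 0 ("sp") repeats across sequences, while field 1 ("P12345",
# "P67890") is unique to every sequence, so the function returns 1.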
def check_bin_path(bin_path, program):

    prog = {"usearch": b"usearch", "mcl": b"mcl"}

    try:
        res, _ = subprocess.Popen([bin_path, "--version"],
                                  stdout=subprocess.PIPE).communicate()
        if not res.startswith(prog[program]):
            print_col("The {} executable file could not be found".format(
                program), RED, 1)
    except OSError:
        print_col("The {} executable file could not be found".format(
            program), RED, 1)
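# Usage sketch (hypothetical paths): both calls below only print an error
# message when the "--version" output does not start with the expected
# program name.
#
# check_bin_path("/opt/usearch/usearch", "usearch")
# check_bin_path("mcl", "mcl")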
def export_filtered_groups(inflation_list, group_prefix, gene_t, sp_t, sqldb,
                           db, tmp_dir, dest, nm=None):

    print_col("Exporting filtered groups to protein sequence files", GREEN, 1)

    stats_storage = {}
    groups_obj = OT.MultiGroupsLight(tmp_dir)

    if nm:
        if nm.stop:
            raise KillByUser("")

    for val in inflation_list:
        # Create a directory that will store the results for the current
        # inflation value
        inflation_dir = join(dest, "Orthology_results", "Inflation%s" % val)
        if not os.path.exists(inflation_dir):
            os.makedirs(inflation_dir)

        group_file = join(dest, "Orthology_results",
                          group_prefix + "_%s.txt" % val)

        # Create Group object
        group_obj = OT.GroupLight(group_file, gene_t, sp_t)
        # Add group to the MultiGroups object
        groups_obj.add_group(group_obj)

        # Export filtered groups and return stats to present in the app
        stats = group_obj.basic_group_statistics()

        # Retrieve fasta sequences from the filtered groups
        group_obj.retrieve_sequences(sqldb, db,
                                     dest=join(inflation_dir, "Orthologs"),
                                     shared_namespace=nm)
        # os.remove(sqldb)

        stats_storage[val] = stats

    return stats_storage, groups_obj
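# Output-layout sketch (hypothetical values): for inflation values
# ["2", "4"] and group_prefix "groups", the function reads
# <dest>/Orthology_results/groups_2.txt and groups_4.txt and writes the
# filtered protein sequences to
#
#   <dest>/Orthology_results/Inflation2/Orthologs
#   <dest>/Orthology_results/Inflation4/Orthologs
#
# returning the per-inflation statistics keyed by inflation value.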
def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            raise KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:

        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        unique_id = check_unique_field(proteome, True, nm)

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, nm=nm)

        proteome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
            ".fasta"

        shutil.move(proteome.split(".")[0] + "_mod.fas",
                    join(cf_dir, proteome_file_name))
def prep_fasta(proteome_file, code, unique_id, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []
    # Storing dictionary with header and sequence for later use
    seq_storage = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    file_out = open(proteome_file.split(".")[0] + "_mod.fas", "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                seq_storage["%s|%s" % (code, unique_str)] = ""
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            seq_storage["%s|%s" % (code, unique_str)] += line.strip()
            file_out.write(line)

    # Close file handles
    file_in.close()
    file_out.close()

    return seq_storage
def post_aln_checks(arg, aln_obj):

    if arg.consensus == ["IUPAC"] and "Protein" in aln_obj.sequence_code:
        print_col("'IUPAC' option of the consensus operation can "
                  "only be performed on nucleotide alignments.", RED)
    if arg.codon_filter and "Protein" in aln_obj.sequence_code:
        print_col("The codon filter option (--codon-filter) can only be"
                  " performed on nucleotide alignments.", RED)
    invalid_formats = [x for x in arg.output_format
                       if x in ["gphocs", "ima2", "snapp"]]
    if len(aln_obj.sequence_code) > 1 and invalid_formats:
        print_col("The following selected output formats can only be used"
                  " with nucleotide sequences: {}".format(
                      ", ".join(invalid_formats)), RED)
    if aln_obj.bad_alignments:
        print_col("The following input files could not be read or are empty"
                  ": {}".format(" ".join(aln_obj.bad_alignments)), YELLOW)
    if aln_obj.non_alignments:
        print_col("The following input files have alignments of unequal "
                  "length: {}".format(" ".join(aln_obj.non_alignments)),
                  YELLOW)
    else:
        return 0
def post_aln_checks(arg, aln_obj):

    if arg.consensus == ["IUPAC"] and aln_obj.sequence_code[0] != "DNA":
        print_col("'IUPAC' option of the consensus operation can "
                  "only be performed on nucleotide alignments.", RED)
    if arg.codon_filter and aln_obj.sequence_code[0] != "DNA":
        print_col("The codon filter option (--codon-filter) can only be"
                  " performed on nucleotide alignments.", RED)
    if aln_obj.bad_alignments:
        print_col("The following input files could not be read or are empty"
                  ": {}".format(" ".join(aln_obj.bad_alignments)), YELLOW)
    if aln_obj.non_alignments:
        print_col("The following input files have alignments of unequal "
                  "length: {}".format(" ".join(aln_obj.non_alignments)),
                  YELLOW)
    else:
        return 0
def main():

    print_col("Executing TriOrtho module at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 3)

    # Create tmp dir
    if not os.path.exists(".tmp"):
        os.makedirs(".tmp")

    # Arguments
    groups_file = arg.infile
    output_dir = arg.output_dir

    # Create output directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if arg.protein2dna:

        print_col("Converting protein sequences into nucleotide sequences",
                  GREEN, 3)

        # Create database
        print_col("Creating database", GREEN, 3)
        id_db = protein2dna.create_db(arg.dna_db, ".tmp")

        # Create query for USEARCH
        print_col("Creating query", GREEN, 3)
        query_db = protein2dna.create_query(arg.protein_db, ".tmp")

        # Execute search
        print_col("Executing search", GREEN, 3)
        protein2dna.pair_search(".tmp")
        pair_db = protein2dna.get_pairs(".tmp")

        # Convert files
        print_col("Converting files", GREEN, 3)
        protein2dna.convert_protein_file(pair_db, query_db, id_db,
                                         output_dir)

        print_col("Protein to nucleotide conversion complete", GREEN, 3)
        return

    gene_threshold = arg.gn_threshold
    species_threshold = arg.sp_threshold
    protein_db = arg.protein_db

    if len(groups_file) == 1:

        print_col("Parsing group file", GREEN, 3)
        group_file = groups_file[0]
        group_object = OT.GroupLight(group_file, gene_threshold,
                                     species_threshold)

        # Check for plotting options
        if arg.plots:

            plt_methods = {
                "2": [group_object.bar_species_distribution,
                      "Species distribution"],
                "3": [group_object.bar_species_coverage,
                      "Species data coverage"],
                "4": [group_object.bar_genecopy_per_species,
                      "Gene copies per species"],
                "5": [group_object.bar_genecopy_distribution,
                      "Gene copy distribution"]
            }

            for i in arg.plots:
                if i == "1":
                    print_col("Plotting option 1 requires multiple group "
                              "files. Skipping.", YELLOW, 3)
                    continue
                # Generate plot data and file
                print_col("Generating plot for %s" % plt_methods[i][1],
                          GREEN, 3)
                plot_obj, _, table = plt_methods[i][0](dest=output_dir)

        # Export filtered group file
        if arg.export:
            print_col("Exporting filtered group file using %s maximum gene "
                      "copies and %s minimum taxa representation" %
                      (gene_threshold, species_threshold), GREEN, 3)
            group_object.export_filtered_group(dest=output_dir)
            print_col("Filtering complete.\nTotal orthologs: %s;\nAfter gene "
                      "filter: %s;\nAfter species filter: %s;\nAfter both "
                      "filters: %s" %
                      (len(group_object.species_frequency),
                       group_object.num_gene_compliant,
                       group_object.num_species_compliant,
                       group_object.all_compliant), GREEN, 3)

        if arg.groups2fasta:
            print_col("Exporting group file as protein sequence files",
                      GREEN, 3)
            # Set sqlite file
            sqldb = join(".tmp", "group2protein.db")
            group_object.retrieve_sequences(sqldb, protein_db, output_dir)

    else:

        print_col("Parsing %s group files" % len(groups_file), GREEN, 3)
        multiple_groups_object = OT.MultiGroupsLight(".tmp", groups_file,
                                                     gene_threshold,
                                                     species_threshold)

        if arg.plots:
            for i in arg.plots:
                if i != "1":
                    print_col("Plotting option %s requires a single group "
                              "file as input. Skipping." % str(i), YELLOW, 3)
                    continue
                print_col("Generating plot for Multiple group comparison",
                          GREEN, 3)
                multiple_groups_object.update_filters(gene_threshold,
                                                      species_threshold)
                multiple_groups_object.bar_orthologs(dest=output_dir)

        if arg.export:
            for gname, gobj in multiple_groups_object:
                gname = os.path.basename(gname)
                output_file = os.path.splitext(gname)[0] + "_filtered.txt"
                print_col("Exporting group file %s using %s maximum gene "
                          "copies and %s minimum taxa representation" %
                          (gname, gene_threshold, species_threshold),
                          GREEN, 3)
                gobj.export_filtered_group(output_file_name=output_file,
                                           dest=output_dir)
                print_col("Filtering complete for group file %s.\nTotal "
                          "orthologs: %s;\nAfter gene filter: %s;\nAfter "
                          "species filter: %s;\nAfter both filters: %s" %
                          (gname, len(gobj.species_frequency),
                           gobj.num_gene_compliant,
                           gobj.num_species_compliant,
                           gobj.all_compliant), GREEN, 3)
def main():

    # The inclusion of the argument definitions in main makes it possible
    # to import this file as a module without triggering argparse. The
    # alternative of using an "if __name__ == '__main__'" statement does
    # not work well with the entry_points parameter of setup.py, since the
    # entry points call the main function but do nothing inside said
    # statement.
    parser = argparse.ArgumentParser(description="Command line interface "
                                     "for TriFusion Orthology search module")

    parser.add_argument("-in", dest="infile", type=str, required=True,
                        help="Provide the path to the directory containing "
                             "the proteome files")

    # Execution modes
    exec_modes = parser.add_argument_group("Execution modes")
    exec_modes.add_argument("-n", action="store_const", const=True,
                            dest="normal",
                            help="Complete run of the pipeline")
    exec_modes.add_argument("-a", action="store_const", const=True,
                            dest="adjust",
                            help="Only adjust proteome fasta files")
    exec_modes.add_argument("-na", action="store_const", const=True,
                            dest="no_adjust",
                            help="Complete run of the pipeline without "
                                 "adjusting fasta files")

    # Input formatting
    input_format = parser.add_argument_group("Input formatting")
    input_format.add_argument("-d", action="store_const", const=True,
                              dest="code",
                              help="Do not convert input proteome file "
                                   "names because the file names are "
                                   "already in code (e.g. Homo_sapiens.fas "
                                   "-> HoSap.fas)")
    input_format.add_argument("-sep", dest="separator",
                              help="Specify the separator in the input "
                                   "files (e.g. '_' is the separator in "
                                   "'Homo_sapiens.fas'). This parameter is "
                                   "ignored if the '-d' option is set")

    # Search options
    search_opts = parser.add_argument_group("Ortholog search options")
    search_opts.add_argument("--usearch", dest="usearch_bin",
                             default="usearch",
                             help="Provide the path to the USEARCH "
                                  "executable. If the executable is already "
                                  "in your PATH environment variable, "
                                  "specify only the name of the executable "
                                  "(default is '%(default)s')")
    search_opts.add_argument("--mcl", dest="mcl_bin", default="mcl",
                             help="Provide the path to the MCL executable. "
                                  "If the executable is already in your "
                                  "PATH environment variable, specify only "
                                  "the name of the executable (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--min-length", dest="min_length", type=int,
                             default=10,
                             help="Set minimum length allowed for protein "
                                  "sequences (default is '%(default)s')")
    search_opts.add_argument("--max-stop", dest="max_stop", type=int,
                             default=20,
                             help="Set maximum percentage of stop codons in "
                                  "protein sequences (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--db", dest="database", default="goodProteins",
                             help="Name of search database (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--search-out", dest="search_out",
                             default="AllVsAll.out",
                             help="Name of the search output file containing "
                                  "the All-vs-All protein comparisons")
    search_opts.add_argument("-evalue", dest="evalue", default=1E-5,
                             help="Set the e-value cut off for the search "
                                  "operation (default is '%(default)s')")
    search_opts.add_argument("-inflation", dest="inflation", nargs="+",
                             default=["3"],
                             choices=[str(x) for x in xrange(1, 6)],
                             help="Set inflation values for ortholog group "
                                  "clustering. Multiple values may be "
                                  "provided, but values are limited to the "
                                  "range [1, 5]")

    # Output options
    output_opts = parser.add_argument_group("Output options")
    output_opts.add_argument("-o", dest="output_dir", default=os.getcwd(),
                             help="Output directory")
    output_opts.add_argument("-prefix", dest="prefix", default="Ortholog",
                             help="Set the prefix name for each ortholog "
                                  "cluster (default is '%(default)s')")
    output_opts.add_argument("-id", dest="id_num", type=int, default=1,
                             help="Set the starting number for the ortholog "
                                  "clusters (default is '%(default)s')")
    output_opts.add_argument("--groups-file", dest="groups_file",
                             default="groups",
                             help="Set the name of the group files from the "
                                  "output of MCL (default is '%(default)s')")
    output_opts.add_argument("--min-species", dest="min_sp", default=1,
                             type=float,
                             help="Set the minimum number of species "
                                  "required for an ortholog cluster to be "
                                  "converted into protein sequence. This "
                                  "option will only affect the protein "
                                  "sequence files, not the group file "
                                  "output.")
    output_opts.add_argument("--max-gene-copy", dest="max_gn", default=100,
                             type=int,
                             help="Set the maximum number of gene copies "
                                  "from the same taxon for each ortholog "
                                  "cluster. This option will only affect "
                                  "the protein sequence files, not the "
                                  "group file output.")

    # Miscellaneous options
    misc_options = parser.add_argument_group("Miscellaneous options")
    misc_options.add_argument("-np", dest="cpus", default=1,
                              help="Number of CPUs to be used during the "
                                   "search operation (default is "
                                   "'%(default)s')")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    arg = parser.parse_args()

    # Create temp directory
    tmp_dir = join(os.getcwd(), ".tmp")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    print_col("Executing OrthoMCL pipeline at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 1)

    try:
        start_time = time.time()

        # Arguments
        input_dir = arg.infile
        output_dir = arg.output_dir
        # name_separator = arg.separator
        min_length = arg.min_length
        max_percent_stop = arg.max_stop
        usearch_bin = arg.usearch_bin
        mcl_bin = arg.mcl_bin
        database_name = join(os.getcwd(), output_dir, "backstage_files",
                             arg.database)
        usearch_out_name = arg.search_out
        evalue_cutoff = arg.evalue
        cpus = arg.cpus
        inflation = arg.inflation
        prefix = arg.prefix
        start_id = arg.id_num
        groups_file = arg.groups_file
        min_sp = arg.min_sp
        max_gn = arg.max_gn

        # Check USEARCH bin
        check_bin_path(usearch_bin, "usearch")
        # Check MCL bin
        check_bin_path(mcl_bin, "mcl")

        sql_path = join(tmp_dir, "sqldb.db")

        # Get proteome files
        if not os.path.exists(input_dir):
            print_col("The input directory %s does not exist. Exiting." %
                      input_dir, RED, 1)

        proteome_files = [abspath(join(input_dir, x))
                          for x in os.listdir(input_dir)]

        # Create and change working directory
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # os.chdir(output_dir)

        # Create directory that will store intermediate files during the
        # orthology search
        int_dir = join(output_dir, "backstage_files")
        if not os.path.exists(int_dir):
            os.makedirs(int_dir)

        if arg.normal:
            install_schema(tmp_dir)
            adjust_fasta(proteome_files, output_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
                             usearch_out_name, usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)
        elif arg.adjust:
            adjust_fasta(proteome_files, output_dir)
        elif arg.no_adjust:
            install_schema(tmp_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
                             usearch_out_name, usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)

        print_col("OrthoMCL pipeline execution successfully completed in %s "
                  "seconds" % (round(time.time() - start_time, 2)), GREEN, 1)

        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)

    except Exception as e:
        print(e.message)
        traceback.print_exc()
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        print_col("Program exited with errors!", RED, 1)
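# Invocation sketch (hypothetical paths and entry-point name; the flags
# match the argparse definitions above): a complete run over a directory of
# proteomes, with two inflation values and four CPUs, would look like
#
#   orthomcl_pipeline -in proteomes/ -n -o run1 \
#       --usearch /opt/usearch/usearch --mcl mcl -inflation 2 3 -np 4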
def pairs(db_dir, nm=None):

    print_col("Finding pairs for orthoMCL", GREEN, 1)

    make_pairs_sqlite.execute(db_dir, nm=nm)
def check_dirs(dir_path):

    if not os.path.exists(dir_path):
        print_col("The following path does not exist: {}".format(dir_path),
                  RED, 1)
def main():

    # The inclusion of the argument definitions in main makes it possible
    # to import this file as a module without triggering argparse. The
    # alternative of using an "if __name__ == '__main__'" statement does
    # not work well with the entry_points parameter of setup.py, since the
    # entry points call the main function but do nothing inside said
    # statement.
    parser = argparse.ArgumentParser(description="Command line interface "
                                     "for TriFusion Orthology search module")

    parser.add_argument("-in", dest="infile", type=str,
                        help="Provide the path to the directory containing "
                             "the proteome files")

    # Execution modes
    exec_modes = parser.add_argument_group("Execution modes")
    exec_modes.add_argument("-n", action="store_const", const=True,
                            dest="normal",
                            help="Complete run of the pipeline")
    exec_modes.add_argument("-a", action="store_const", const=True,
                            dest="adjust",
                            help="Only adjust proteome fasta files")
    exec_modes.add_argument("-na", action="store_const", const=True,
                            dest="no_adjust",
                            help="Complete run of the pipeline without "
                                 "adjusting fasta files")

    # Input formatting
    input_format = parser.add_argument_group("Input formatting")
    input_format.add_argument("-d", action="store_const", const=True,
                              dest="code",
                              help="Do not convert input proteome file "
                                   "names because the file names are "
                                   "already in code (e.g. Homo_sapiens.fas "
                                   "-> HoSap.fas)")
    input_format.add_argument("-sep", dest="separator",
                              help="Specify the separator in the input "
                                   "files (e.g. '_' is the separator in "
                                   "'Homo_sapiens.fas'). This parameter is "
                                   "ignored if the '-d' option is set")

    # Search options
    search_opts = parser.add_argument_group("Ortholog search options")
    search_opts.add_argument("--usearch", dest="usearch_bin",
                             default="usearch",
                             help="Provide the path to the USEARCH "
                                  "executable. If the executable is already "
                                  "in your PATH environment variable, "
                                  "specify only the name of the executable "
                                  "(default is '%(default)s')")
    search_opts.add_argument("--mcl", dest="mcl_bin", default="mcl",
                             help="Provide the path to the MCL executable. "
                                  "If the executable is already in your "
                                  "PATH environment variable, specify only "
                                  "the name of the executable (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--min-length", dest="min_length", type=int,
                             default=10,
                             help="Set minimum length allowed for protein "
                                  "sequences (default is '%(default)s')")
    search_opts.add_argument("--max-stop", dest="max_stop", type=int,
                             default=20,
                             help="Set maximum percentage of stop codons in "
                                  "protein sequences (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--db", dest="database", default="goodProteins",
                             help="Name of search database (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--search-out", dest="search_out",
                             default="AllVsAll.out",
                             help="Name of the search output file containing "
                                  "the All-vs-All protein comparisons")
    search_opts.add_argument("-evalue", dest="evalue", default=1E-5,
                             help="Set the e-value cut off for the search "
                                  "operation (default is '%(default)s')")
    search_opts.add_argument("-inflation", dest="inflation", nargs="+",
                             default=["3"],
                             choices=[str(x) for x in xrange(1, 6)],
                             help="Set inflation values for ortholog group "
                                  "clustering. Multiple values may be "
                                  "provided, but values are limited to the "
                                  "range [1, 5]")

    # Output options
    output_opts = parser.add_argument_group("Output options")
    output_opts.add_argument("-o", dest="output_dir", default=os.getcwd(),
                             help="Output directory")
    output_opts.add_argument("-prefix", dest="prefix", default="Ortholog",
                             help="Set the prefix name for each ortholog "
                                  "cluster (default is '%(default)s')")
    output_opts.add_argument("-id", dest="id_num", type=int, default=1,
                             help="Set the starting number for the ortholog "
                                  "clusters (default is '%(default)s')")
    output_opts.add_argument("--groups-file", dest="groups_file",
                             default="groups",
                             help="Set the name of the group files from the "
                                  "output of MCL (default is '%(default)s')")
    output_opts.add_argument("--min-species", dest="min_sp", default=1,
                             type=float,
                             help="Set the minimum number of species "
                                  "required for an ortholog cluster to be "
                                  "converted into protein sequence. This "
                                  "option will only affect the protein "
                                  "sequence files, not the group file "
                                  "output.")
    output_opts.add_argument("--max-gene-copy", dest="max_gn", default=100,
                             type=int,
                             help="Set the maximum number of gene copies "
                                  "from the same taxon for each ortholog "
                                  "cluster. This option will only affect "
                                  "the protein sequence files, not the "
                                  "group file output.")

    # Miscellaneous options
    misc_options = parser.add_argument_group("Miscellaneous options")
    misc_options.add_argument("-np", dest="cpus", default=1,
                              help="Number of CPUs to be used during the "
                                   "search operation (default is "
                                   "'%(default)s')")
    misc_options.add_argument("-v", "--version", dest="version",
                              action="store_const", const=True,
                              help="Displays software version")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    arg = parser.parse_args()

    if arg.version:
        print(__version__)
        sys.exit(1)

    # Create temp directory
    tmp_dir = join(os.getcwd(), ".tmp")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    print_col("Executing OrthoMCL pipeline at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 1)

    try:
        start_time = time.time()

        # Arguments
        input_dir = os.path.abspath(arg.infile)
        check_dirs(input_dir)
        output_dir = os.path.abspath(arg.output_dir)
        # name_separator = arg.separator
        min_length = arg.min_length
        max_percent_stop = arg.max_stop
        usearch_bin = arg.usearch_bin
        mcl_bin = arg.mcl_bin
        database_name = join(os.getcwd(), output_dir, "backstage_files",
                             arg.database)
        usearch_out_name = arg.search_out
        evalue_cutoff = arg.evalue
        cpus = arg.cpus
        inflation = arg.inflation
        prefix = arg.prefix
        start_id = arg.id_num
        groups_file = arg.groups_file
        min_sp = arg.min_sp
        max_gn = arg.max_gn

        # Check USEARCH bin
        check_bin_path(usearch_bin, "usearch")
        # Check MCL bin
        check_bin_path(mcl_bin, "mcl")

        sql_path = join(tmp_dir, "sqldb.db")

        # Get proteome files
        if not os.path.exists(input_dir):
            print_col("The input directory %s does not exist. Exiting." %
                      input_dir, RED, 1)

        proteome_files = [abspath(join(input_dir, x))
                          for x in os.listdir(input_dir)]

        # Create and change working directory
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # os.chdir(output_dir)

        # Create directory that will store intermediate files during the
        # orthology search
        int_dir = join(output_dir, "backstage_files")
        if not os.path.exists(int_dir):
            os.makedirs(int_dir)

        if arg.normal:
            install_schema(tmp_dir)
            adjust_fasta(proteome_files, output_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
                             usearch_out_name, usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)
        elif arg.adjust:
            adjust_fasta(proteome_files, output_dir)
        elif arg.no_adjust:
            install_schema(tmp_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
                             usearch_out_name, usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)

        print_col("OrthoMCL pipeline execution successfully completed in %s "
                  "seconds" % (round(time.time() - start_time, 2)), GREEN, 1)

        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)

    except Exception as e:
        print(e.message)
        traceback.print_exc()
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        print_col("Program exited with errors!", RED, 1)
def main_checks(arg):

    if not arg.infile and not arg.generate_cfg:
        print_col("Must provide input data using the '-in' option", RED, 2)
def stats_main(args):

    print_col("Executing TriStats module at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 2)

    if args.generate_cfg:
        print_col("Generating configuration template file", GREEN, 2)
        return generate_cfg_template()

    # Create temporary directory
    tmp_dir = ".trifusion-temp"
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Set path to temporary sqlite database
    sql_db = os.path.join(tmp_dir, "trifusion.db")

    # Arguments
    input_files = args.infile
    output_dir = args.project_name
    config_file = args.config_file

    # Read configuration file
    print_col("Reading configuration file", GREEN, 2)
    settings = configparser.ConfigParser()
    settings.read(config_file)

    # Parse alignments
    # Support wildcards as arguments for windows
    fl = []
    if sys.platform in ["win32", "cygwin"]:
        for p in input_files:
            fl += glob(p)
        input_files = fl

    print_col("Parsing %s alignments" % len(input_files), GREEN, 2)
    alignments = AlignmentList(input_files, sql_db=sql_db)

    # Create output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Variable mapping each available option to the appropriate statistics
    # and plotting methods
    func_map = {
        ("general information", "distribution_sequence_size", "species"):
            [alignments.average_seqsize_per_species,
             (box_plot, "avg_seqsize_species.png")],
        ("general information", "distribution_sequence_size", "average"):
            [alignments.average_seqsize,
             (histogram_plot, "avg_seqsize.png")],
        ("general information", "proportion_nucleotides_residues",
         "species"):
            [alignments.characters_proportion_per_species,
             (stacked_bar_plot, "char_proportions_sp.png")],
        ("general information", "proportion_nucleotides_residues",
         "average"):
            [alignments.characters_proportion,
             (bar_plot, "char_proportions.png")],
        ("general information", "distribution_taxa_frequency", "average"):
            [alignments.taxa_distribution,
             (histogram_plot, "distribution_taxa_frequency.png")],
        ("polymorphism and variation", "sequence_similarity", "species"):
            [alignments.sequence_similarity_per_species,
             (triangular_heat, "similarity_distribution_sp.png")],
        ("polymorphism and variation", "sequence_similarity", "average"):
            [alignments.sequence_similarity,
             (histogram_plot, "similarity_distribution.png")],
        ("polymorphism and variation", "sequence_similarity", "gene"):
            [alignments.sequence_similarity_gene,
             (sliding_window, "similarity_distribution_gn.png")],
        ("polymorphism and variation", "segregating_sites", "species"):
            [alignments.sequence_segregation_per_species,
             (triangular_heat, "segregating_sites_sp.png")],
        ("polymorphism and variation", "segregating_sites", "average"):
            [alignments.sequence_segregation,
             (histogram_plot, "segregating_sites.png")],
        ("polymorphism and variation", "segregating_sites", "gene"):
            [alignments.sequence_segregation_gene,
             (sliding_window, "segregating_sites_gn.png")],
        ("polymorphism and variation", "alignment_pol_correlation",
         "average"):
            [alignments.length_polymorphism_correlation,
             (scatter_plot, "length_polymorphism_correlation.png")],
        ("polymorphism and variation", "allele_frequency_spectrum",
         "average"):
            [alignments.allele_frequency_spectrum,
             (histogram_plot, "allele_frequency_spectrum.png")],
        ("polymorphism and variation", "allele_frequency_spectrum", "gene"):
            [alignments.allele_frequency_spectrum_gene,
             (histogram_plot, "allele_frequency_spectrum_gn.png")],
        ("missing data", "gene_occupancy", "average"):
            [alignments.gene_occupancy,
             (interpolation_plot, "gene_occupancy.png")],
        ("missing data", "distribution_missing_genes", "species"):
            [alignments.missing_genes_per_species,
             (bar_plot, "missing_gene_distribution.png")],
        ("missing data", "distribution_missing_genes", "average"):
            [alignments.missing_genes_average,
             (histogram_plot, "missing_gene_distribution_avg.png")],
        ("missing data", "distribution_missing_data", "species"):
            [alignments.missing_data_per_species,
             (stacked_bar_plot, "missing_data_distribution_sp.png")],
        ("missing data", "distribution_missing_data", "average"):
            [alignments.missing_data_distribution,
             (histogram_smooth, "missing_data_distribution.png")],
        ("missing data", "cumulative_distribution_missing_genes",
         "average"):
            [alignments.cumulative_missing_genes,
             (bar_plot, "cumulative_distribution_missing_genes.png")],
        ("outlier detection", "missing_data_outliers", "species"):
            [alignments.outlier_missing_data_sp,
             (outlier_densisty_dist, "Missing_data_outliers_sp.png")],
        ("outlier detection", "missing_data_outliers", "average"):
            [alignments.outlier_missing_data,
             (outlier_densisty_dist, "Missing_data_outliers.png")],
        ("outlier detection", "segregating_sites_outliers", "species"):
            [alignments.outlier_segregating_sp,
             (outlier_densisty_dist, "Segregating_sites_outliers_sp.png")],
        ("outlier detection", "segregating_sites_outliers", "average"):
            [alignments.outlier_segregating,
             (outlier_densisty_dist, "Segregating_sites_outliers.png")],
        ("outlier detection", "sequence_size_outliers", "species"):
            [alignments.outlier_sequence_size_sp,
             (outlier_densisty_dist, "Sequence_size_outliers_sp.png")],
        ("outlier detection", "sequence_size_outliers", "average"):
            [alignments.outlier_sequence_size,
             (outlier_densisty_dist, "Sequence_size_outliers.png")]
    }

    print_col("Parsing configuration file options", GREEN, 2)

    # Iterate over each individual option
    for section in settings.sections():
        for option, val in settings.items(section):
            for i in val.split():

                section = section.lower()

                # Check if current option is available or supported
                if (section, option, i) in func_map:

                    print_col("Generating plot for option: %s - %s - %s" %
                              (section, option, i), GREEN, 2)

                    # Get appropriate method list
                    funcs = func_map[(section, option, i)]

                    # Retrieve plot data using statistics method
                    plot_data = funcs[0]()

                    # Check for exceptions in plot data
                    if "exception" in plot_data:
                        if plot_data["exception"] is EmptyData:
                            print_col("Option %s - %s - %s has no data for "
                                      "plotting" % (section, option, i),
                                      YELLOW, 2)
                        if plot_data["exception"] is InvalidSequenceType:
                            print_col("Invalid sequence type for option "
                                      "%s - %s - %s (%s)" %
                                      (section, option, i,
                                       alignments.sequence_code[0]),
                                      YELLOW, 2)
                        continue

                    # Generate plot object
                    plot_obj, _, lgd = funcs[1][0](**plot_data)

                    plot_obj.tight_layout()
                    # Save plot to file, including the legend object, if
                    # available
                    if lgd:
                        plot_obj.savefig(join(output_dir, funcs[1][1]),
                                         bbox_extra_artists=(lgd,), dpi=200)
                    else:
                        plot_obj.savefig(join(output_dir, funcs[1][1]),
                                         dpi=200)

                else:
                    print_col("Invalid option: %s - %s - %s. Skipping." %
                              (section, option, i), YELLOW, 2)
def triseq_arg_check(arg):

    if arg.gcoder and "nexus" not in arg.output_format:
        print_col("Gap coding can only be performed for Nexus output "
                  "format.", RED)

    if arg.gcoder and "nexus" in arg.output_format and \
            arg.output_format != ["nexus"]:
        print_col("Gap coding can only be performed for Nexus output "
                  "format. This operation will be ignored for other output "
                  "formats.", YELLOW, quiet=arg.quiet)

    if arg.conversion and arg.reverse:
        print_col("Ignoring conversion flag (-c) when specifying reverse"
                  " concatenation (-r)", YELLOW, quiet=arg.quiet)

    if arg.outfile and arg.reverse:
        print_col("Ignoring output file option (-o) when specifying reverse"
                  " concatenation (-r)", YELLOW, quiet=arg.quiet)

    if arg.partition_file is not None and arg.outfile is None:
        print_col("An output file must be provided with option '-o'", RED)

    if "ima2" in arg.output_format and arg.ima2_params is None:
        print_col("Additional arguments must be provided with the option "
                  "--ima2-params when selecting ima2 output format", RED)

    if "ima2" in arg.output_format and len(arg.ima2_params) != 4:
        print_col("Four additional arguments must be provided with option "
                  "--ima2-params when selecting the ima2 output format. "
                  "%s were given" % (len(arg.ima2_params)), RED)

    if arg.partition_file is not None:
        return 0

    if arg.conversion is None and arg.outfile is None and \
            arg.reverse is None and arg.select is None and \
            arg.get_taxa is False:
        print_col("If you wish to concatenate, provide the output file name "
                  "using the '-o' option. If you wish to convert a file, "
                  "specify it using the '-c' option", RED)

    if len(arg.infile) == 1 and arg.conversion is None and \
            arg.reverse is None and arg.collapse is None:
        print_col("Cannot perform concatenation of a single file. Please "
                  "provide additional files to concatenate, or specify the "
                  "conversion '-c' option", RED)

    if arg.zorro is not None and len(arg.infile) == 1:
        print_col("The '-z' option cannot be invoked when only a single "
                  "input file is provided. This option is reserved for "
                  "concatenation of multiple alignment files", RED)

    if arg.consensus and arg.output_format != ["fasta"]:
        print_col("Output format must be only Fasta when using the "
                  "consensus option", RED)

    if not arg.consensus and arg.consensus_single:
        print_col("Ignoring consensus single file option "
                  "(--consensus-single-file) when the consensus operation "
                  "is not specified", YELLOW, quiet=arg.quiet)
    else:
        return 0
def main_parser(arg, alignment_list):
    """
    Function with the main operations of TriSeq
    """

    print_col("Executing TriSeq module at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN,
        quiet=arg.quiet)

    # Create temp directory
    tmp_dir = ".trifusion-temp"
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Set path to temporary sqlite database
    sql_db = os.path.join(tmp_dir, "trifusion.db")

    # If the database already exists, erase it. Make sure we start fresh.
    if os.path.exists(sql_db):
        os.remove(sql_db)

    # Defining main variables
    conversion = arg.conversion
    output_format = arg.output_format
    outfile = arg.outfile
    interleave = arg.interleave
    model_phy = arg.model_phy
    # outgroup_taxa = arg.outgroup_taxa

    # Defining output file name
    if conversion is None and arg.outfile is not None:
        outfile = "".join(arg.outfile)
    elif arg.consensus and arg.consensus_single and not arg.outfile:
        outfile = "consensus"

    # The input file at this stage is not necessary.
    # If just converting the partition file format, do this and exit
    if arg.partition_file is not None and not alignment_list:
        # Initializing Partitions instance and reading partitions file
        partition = data.Partitions()
        partition.read_from_file(arg.partition_file, no_aln_check=True)
        if partition.partition_format == "nexus":
            partition.write_to_file("raxml", outfile, model_phy)
        else:
            partition.write_to_file("nexus", outfile)
        return 0

    # Support wildcards as arguments for windows
    fl = []
    if sys.platform in ["win32", "cygwin"]:
        for p in alignment_list:
            fl += glob(p)
        alignment_list = fl

    # Check input files for directories
    alignment_list, dirs, lost = check_infile_list(alignment_list)

    if dirs:
        print_col("Ignoring input files pointing to a directory: {}".format(
            " ".join(dirs)), YELLOW)
    if lost:
        print_col("Ignoring input files that do not exist: {}".format(
            " ".join(lost)), YELLOW)

    if not alignment_list:
        print_col("No valid input files have been provided. Terminating...",
                  RED)

    # Input alignments are mandatory from now on
    if not arg.quiet:
        pbar = ProgressBar(max_value=len(alignment_list),
                           widgets=gen_wgt(""))
    else:
        pbar = None

    print_col("Parsing %s alignments" % len(alignment_list), GREEN,
              quiet=arg.quiet)
    alignments = seqset.AlignmentList(alignment_list, sql_db=sql_db,
                                      pbar=pbar)

    # If a partitions file was provided, and there is only a single input
    # file, try to associate the partitions
    if len(alignment_list) == 1 and arg.partition_file:
        er = alignments.partitions.read_from_file(arg.partition_file)
        if er:
            print_col("Invalid partitions file.", RED)

    post_aln_checks(arg, alignments)

    # ############################## Utilities ##############################

    # Return a file with taxa list and exit
    if arg.get_taxa is True:
        print_col("Writing taxa to new file", GREEN, quiet=arg.quiet)
        alignments.write_taxa_to_file()
        return 0

    # Remove taxa
    if arg.remove:
        print_col("Removing taxa", GREEN, quiet=arg.quiet)
        alignments.remove_taxa(arg.remove)

    # Grep taxa
    if arg.grep:
        print_col("Grepping taxa", GREEN, quiet=arg.quiet)
        alignments.remove_taxa(arg.grep, mode="inverse")

    # Select alignments
    if arg.select:
        print_col("Selecting alignments", GREEN, quiet=arg.quiet)

        if not os.path.exists("Taxa_selection"):
            os.makedirs("Taxa_selection")

        # Check if any of the provided taxa are absent from the alignments
        absent_taxa = [x for x in arg.select
                       if x not in alignments.taxa_names]
        if absent_taxa:
            print_col("The following taxa were not found in any alignment "
                      "and will be ignored: {}".format(
                          " ".join(absent_taxa)), YELLOW, quiet=arg.quiet)

        selected_alignments = alignments.select_by_taxa(arg.select,
                                                        mode="relaxed")
        for aln in selected_alignments:
            alignment_file = aln.path
            shutil.copy(alignment_file, "Taxa_selection")
        return

    # ############################ Main operations ##########################

    # Reverse concatenation
    if arg.reverse is not None:
        print_col("Reverse concatenating", GREEN, quiet=arg.quiet)
        if len(alignment_list) > 1:
            raise ArgumentError("Only one input file allowed for reverse "
                                "concatenation")
        if arg.reverse:
            er = alignments.partitions.read_from_file(arg.reverse)
            if er:
                print_col("Invalid partitions file.", RED)
        alignments.reverse_concatenate(pbar=pbar)

    # Filtering
    # Filter by minimum taxa
    if arg.min_taxa:
        print_col("Filtering by minimum taxa", GREEN, quiet=arg.quiet)
        alignments.filter_min_taxa(arg.min_taxa, pbar=pbar)

    # Filter by alignments that contain taxa
    if arg.contain_filter:
        print_col("Filtering alignment(s) including a taxa group", GREEN,
                  quiet=arg.quiet)
        alignments.filter_by_taxa(arg.contain_filter, "Contain", pbar=pbar)

    # Filter by alignments that exclude taxa
    if arg.exclude_filter:
        print_col("Filtering alignments excluding a taxa group", GREEN,
                  quiet=arg.quiet)
        alignments.filter_by_taxa(arg.exclude_filter, "Exclude", pbar=pbar)

    # Filter by codon position
    if arg.codon_filter:
        print_col("Filtering by codon positions", GREEN, quiet=arg.quiet)
        if alignments.sequence_code[0] == "DNA":
            codon_settings = [True if str(x) in arg.codon_filter else False
                              for x in range(1, 4)]
            alignments.filter_codon_positions(codon_settings, pbar=pbar)

    # Filter by missing data
    if arg.m_filter:
        print_col("Filtering by missing data", GREEN, quiet=arg.quiet)
        alignments.filter_missing_data(arg.m_filter[0], arg.m_filter[1],
                                       pbar=pbar, use_main_table=True)

    # Filter by variable sites
    if arg.var_filter:
        print_col("Filtering by variable sites", GREEN, quiet=arg.quiet)
        alignments.filter_segregating_sites(arg.var_filter[0],
                                            arg.var_filter[1], pbar=pbar)

    # Filter by informative sites
    if arg.inf_filter:
        print_col("Filtering by informative sites", GREEN, quiet=arg.quiet)
        alignments.filter_informative_sites(arg.inf_filter[0],
                                            arg.inf_filter[1], pbar=pbar)

    # Concatenation
    if not arg.conversion and not arg.consensus and len(alignment_list) > 1:
        print_col("Concatenating", GREEN, quiet=arg.quiet)
        alignments.concatenate(pbar=pbar)

        # Concatenate zorro files
        if arg.zorro:
            zorro = data.Zorro(alignment_list, arg.zorro)
            zorro.write_to_file(outfile)

    # Collapsing
    if arg.collapse:
        print_col("Collapsing", GREEN, quiet=arg.quiet)
        alignments.collapse(use_main_table=True, pbar=pbar,
                            haplotypes_file=outfile)

    # Gcoder
    if arg.gcoder:
        print_col("Coding gaps", GREEN, quiet=arg.quiet)
        if output_format == ["nexus"]:
            alignments.code_gaps(use_main_table=True, pbar=pbar)

    # Consensus
    if arg.consensus:
        consensus_type = arg.consensus[0]
        print_col("Creating consensus sequences", GREEN, quiet=arg.quiet)
        alignments.consensus(consensus_type,
                             single_file=arg.consensus_single,
                             pbar=pbar)

    # Write output
    print_col("Writing output", GREEN, quiet=arg.quiet)
    alignments.write_to_file(output_format,
                             output_file=outfile,
                             output_suffix=arg.output_suffix,
                             interleave=interleave,
                             ima2_params=arg.ima2_params,
                             partition_file=True,
                             use_charset=True,
                             pbar=pbar)
def triseq_arg_check(arg):

    invalid_formats = [x for x in arg.output_format
                       if x in ["mcmctree", "gphocs", "ima2", "snapp"]]
    if not arg.outfile and invalid_formats:
        print_col("The following output formats can only be used with the"
                  " concatenation operation (-o): {}".format(
                      ", ".join(invalid_formats)), RED)

    if arg.gcoder and "nexus" not in arg.output_format:
        print_col("Gap coding can only be performed for Nexus output "
                  "format.", RED)

    if arg.gcoder and "nexus" in arg.output_format and \
            arg.output_format != ["nexus"]:
        print_col("Gap coding can only be performed for Nexus output "
                  "format. This operation will be ignored for other output "
                  "formats.", YELLOW, quiet=arg.quiet)

    if arg.conversion and arg.reverse:
        print_col("Ignoring conversion flag (-c) when specifying reverse"
                  " concatenation (-r)", YELLOW, quiet=arg.quiet)

    if arg.partition_file is not None and arg.outfile is None:
        print_col("An output file must be provided with option '-o'", RED)

    if "ima2" in arg.output_format and arg.ima2_params is None:
        print_col("Additional arguments must be provided with the option "
                  "--ima2-params when selecting ima2 output format", RED)

    if "ima2" in arg.output_format and len(arg.ima2_params) != 4:
        print_col("Four additional arguments must be provided with option "
                  "--ima2-params when selecting the ima2 output format. "
                  "%s were given" % (len(arg.ima2_params)), RED)

    if arg.partition_file is not None:
        return 0

    if arg.conversion is None and arg.outfile is None and \
            arg.reverse is None and arg.select is None and \
            arg.get_taxa is False:
        print_col("If you wish to concatenate, provide the output file name "
                  "using the '-o' option. If you wish to convert a file, "
                  "specify it using the '-c' option", RED)

    if arg.zorro is not None and len(arg.infile) == 1:
        print_col("The '-z' option cannot be invoked when only a single "
                  "input file is provided. This option is reserved for "
                  "concatenation of multiple alignment files", RED)

    if arg.consensus and arg.output_format != ["fasta"]:
        print_col("Output format must be only Fasta when using the "
                  "consensus option", RED)

    if not arg.consensus and arg.consensus_single:
        print_col("Ignoring consensus single file option "
                  "(--consensus-single-file) when the consensus operation "
                  "is not specified", YELLOW, quiet=arg.quiet)
    else:
        return 0