Example #1
def mcl(inflation_list, dest, mcl_file="mcl", nm=None):

    print_col("Running mcl algorithm", GREEN, 1)
    mcl_input = join(dest, "backstage_files", "mclInput")
    mcl_output = join(dest, "backstage_files", "mclOutput_")

    for val in inflation_list:

        mcl_cmd = [mcl_file,
                   mcl_input,
                   "--abc",
                   "-I",
                   val,
                   "-o",
                   mcl_output + val.replace(".", "")]

        if nm:
            # The subprocess.Popen handler cannot be passed directly in Windows
            # due to pickling issues. So I pass the pid of the process instead.
            subp = subprocess.Popen(mcl_cmd)
            nm.subp = subp.pid
            subp.wait()
            nm.subp = None
        else:
            _ = subprocess.Popen(mcl_cmd).wait()
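A minimal usage sketch for the function above (the inflation values and destination path are hypothetical; the mcl binary and the backstage_files/mclInput file must already exist):

# Runs mcl once per inflation value, writing mclOutput_15 and mclOutput_3
# under /path/to/results/backstage_files (all paths hypothetical)
mcl(["1.5", "3"], "/path/to/results")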
Example #2
def mcl_groups(inflation_list, mcl_prefix, start_id, group_file, dest,
                nm=None):

    print_col("Dumping groups", GREEN, 1)

    # Create a results directory
    results_dir = join(dest, "Orthology_results")
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    mcl_output = join(dest, "backstage_files", "mclOutput_")

    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = len(inflation_list)
        nm.counter = 0

    for val in inflation_list:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1

        MclGroups.mcl_to_groups(
            mcl_prefix,
            start_id,
            mcl_output + val.replace(".", ""),
            os.path.join(results_dir, group_file + "_" + str(val) + ".txt"),
            nm=nm)
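For reference, a short sketch of how the MCL input and output names above are derived from an inflation value (the value shown is illustrative):

val = "1.5"
print("mclOutput_" + val.replace(".", ""))   # mclOutput_15 (read by this step)
print("groups" + "_" + str(val) + ".txt")    # groups_1.5.txt (written result)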
Example #3
def filter_fasta(min_len, max_stop, db, dest, nm=None):

    print_col("Filtering proteome files", GREEN, 1)

    cp_dir = join(dest, "backstage_files", "compliantFasta")

    FilterFasta.orthomcl_filter_fasta(cp_dir, min_len, max_stop, db, dest, nm)
Example #4
def allvsall_usearch(goodproteins,
                     evalue,
                     dest,
                     cpus,
                     usearch_outfile,
                     usearch_bin="usearch",
                     nm=None):

    print_col("Perfoming USEARCH All-vs-All (may take a while...)", GREEN, 1)

    # FNULL = open(os.devnull, "w")
    usearch_cmd = [
        usearch_bin, "-ublast",
        join(dest, "backstage_files", goodproteins), "-db",
        join(dest, "backstage_files", goodproteins), "-blast6out",
        join(dest, "backstage_files", usearch_outfile), "-evalue",
        str(evalue), "--maxaccepts", "0", "-threads",
        str(cpus)
    ]

    if nm:
        # The subprocess.Popen handler cannot be passed directly in Windows
        # due to pickling issues. So I pass the pid of the process instead.
        subp = subprocess.Popen(usearch_cmd)
        nm.subp = subp.pid
        subp.wait()
        nm.subp = None
    else:
        _ = subprocess.Popen(usearch_cmd).wait()
Example #5
def allvsall_usearch(goodproteins, evalue, dest, cpus, usearch_outfile,
                     usearch_bin="usearch", nm=None):

    print_col("Perfoming USEARCH All-vs-All (may take a while...)", GREEN, 1)

    # FNULL = open(os.devnull, "w")
    usearch_cmd = [usearch_bin,
                   "-ublast",
                   join(dest, "backstage_files", goodproteins),
                   "-db",
                   join(dest, "backstage_files", goodproteins),
                   "-blast6out",
                   join(dest, "backstage_files", usearch_outfile),
                   "-evalue", str(evalue),
                   "--maxaccepts",
                   "0",
                   "-threads",
                   str(cpus)]

    if nm:
        # The subprocess.Popen handler cannot be passed directly in Windows
        # due to pickling issues. So I pass the pid of the process instead.
        subp = subprocess.Popen(usearch_cmd)
        nm.subp = subp.pid
        subp.wait()
        nm.subp = None
    else:
        _ = subprocess.Popen(usearch_cmd).wait()
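For illustration, the argv list assembled above is equivalent to the following (the file names are the defaults assumed elsewhere in this module; the thread count and destination are hypothetical):

cmd = ["usearch", "-ublast", "results/backstage_files/goodProteins",
       "-db", "results/backstage_files/goodProteins",
       "-blast6out", "results/backstage_files/AllVsAll.out",
       "-evalue", "1e-05", "--maxaccepts", "0", "-threads", "4"]
print(" ".join(cmd))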
Example #6
def filter_fasta(min_len, max_stop, db, dest, nm=None):

    print_col("Filtering proteome files", GREEN, 1)

    cp_dir = join(dest, "backstage_files", "compliantFasta")

    FilterFasta.orthomcl_filter_fasta(cp_dir, min_len, max_stop, db, dest, nm)
Example #7
def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:
        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]
        code_name = "_".join(code_name.split())

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        try:
            unique_id = check_unique_field(proteome, True, nm)
        except Exception as e:
            print_col("The file {} could not be parsed".format(proteome),
                      YELLOW, 1)
            # TODO: Log errors to a file
            continue

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, dest, nm=nm)

        proteome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
                             ".fasta"
        proteome_file_name = "_".join(proteome_file_name.split())

        pfile = basename(proteome.split(".")[0] + "_mod.fas")
        shutil.move(join(dest, "backstage_files", pfile),
                    join(cf_dir, proteome_file_name))

    json_f = join(dest, "backstage_files", "header_mapping.json")
    header_f = join(dest, "backstage_files", "header_mapping.csv")
    if os.path.exists(json_f):
        with open(json_f) as fh, open(header_f, "w") as ofh:
            header_map = json.load(fh)

            for k, v in header_map.items():
                ofh.write("{}; {}\n".format(k, v))
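A small sketch of reading back the mapping file written above, assuming the original headers contain no "; " sequence themselves (the destination path is hypothetical):

with open("results/backstage_files/header_mapping.csv") as fh:
    for row in fh:
        new_header, original_header = row.rstrip("\n").split("; ", 1)
        print(new_header, "<-", original_header)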
Example #8
def main_check():
    """
    Performs sanity checks to argument combinations
    """

    if arg.protein2dna and arg.infile:
        print_col("Group file operations are ignored when specifying "
                  "conversion options.", YELLOW, 3)

    # Check if protein and cds data bases are provided when required
    if arg.groups2fasta and not arg.protein_db:
        print_col("A protein database must be provided to convert group "
                  "files into sequence files using the --protein-db option. "
                  "Exiting.", RED, 3)

    if arg.protein2dna and (not arg.protein_db and not arg.dna_db):
        print_col("A CDS data base and protein sequence files must be "
                  "provided to convert protein sequences into nucleotide "
                  "sequences using the --cds-db and --protein-db options, "
                  "respectively. Exiting.", RED, 3)

    # Print warnings when trying to execute options that are not available to
    # multiple input group files
    if len(arg.infile) > 1:
        if arg.groups2fasta or arg.protein2dna:
            print_col("Conversion options are only available for single "
                      "group files input.", YELLOW, 3)

    if arg.groups2fasta and not arg.gn_threshold and not arg.sp_threshold:
        print_col("No filters have been specified for the conversion of "
                  "group files into protein sequences. This may result in a "
                  "very large number of output files.", YELLOW, 3)
Example #9
def blast_parser(usearch_output, dest, db_dir, nm):

    print_col("Parsing BLAST output", GREEN, 1)

    BlastParser.orthomcl_blast_parser(
        join(dest, "backstage_files", usearch_output),
        join(dest, "backstage_files", "compliantFasta"), db_dir, nm)
Example #10
def dump_pairs(db_dir, dest, nm=None):

    print_col(
        "Dumping files from the database produced by the orthomclPairs "
        "program", GREEN, 1)

    dump_pairs_sqlite.execute(db_dir, dest, nm=nm)
Example #11
def blast_parser(usearch_output, dest, db_dir, nm):

    print_col("Parsing BLAST output", GREEN, 1)

    BlastParser.orthomcl_blast_parser(
        join(dest, "backstage_files", usearch_output),
        join(dest, "backstage_files", "compliantFasta"),
        db_dir,
        nm)
Example #12
def install_schema(db_dir):
    """
    Install the schema for the SQLite database

    :param db_dir: string, directory for the sqlite database
    """

    print_col("Creating sqlite database", GREEN, 1)
    install_sqlite.execute(db_dir)
Example #13
def install_schema(db_dir):
    """
    Install the schema for the SQLite database

    :param db_dir: string, directory for the sqlite database
    """

    print_col("Creating sqlite database", GREEN, 1)
    install_sqlite.execute(db_dir)
Example #14
def prep_fasta(proteome_file, code, unique_id, dest, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Get json with header mappings, if exists
    json_f = join(dest, "backstage_files", "header_mapping.json")
    if os.path.exists(json_f):
        with open(json_f) as fh:
            header_mapping = json.load(fh)
    else:
        header_mapping = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    pfile = basename(proteome_file.split(".")[0] + "_mod.fas")
    file_out_path = join(dest, "backstage_files", pfile)
    file_out = open(file_out_path, "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                header_mapping["%s|%s" % (code, unique_str)] = line.strip()
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            file_out.write(line)

    # Close file handles:
    file_in.close()
    file_out.close()

    with open(json_f, "w") as fh:
        json.dump(header_mapping, fh)
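An illustration of the header rewrite performed in the loop above; the code value and field index are hypothetical stand-ins for what adjust_fasta and check_unique_field would supply:

code, unique_id = "HoSap", 1
line = ">sp|P69905|HBA_HUMAN Hemoglobin subunit alpha\n"
fields = line.split("|")
unique_str = fields[unique_id].replace(" ", "_")   # "P69905"
print(">%s|%s" % (code, unique_str))               # >HoSap|P69905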
Example #15
def prep_fasta(proteome_file, code, unique_id, dest, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Get json with header mappings, if exists
    json_f = join(dest, "backstage_files", "header_mapping.json")
    if os.path.exists(json_f):
        with open(json_f) as fh:
            header_mapping = json.load(fh)
    else:
        header_mapping = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    pfile = basename(proteome_file.split(".")[0] + "_mod.fas")
    file_out_path = join(dest, "backstage_files", pfile)
    file_out = open(file_out_path, "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                header_mapping["%s|%s" % (code, unique_str)] = line.strip()
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            file_out.write(line)

    # Close file handles:
    file_in.close()
    file_out.close()

    with open(json_f, "w") as fh:
        json.dump(header_mapping, fh)
Example #16
def check_unique_field(proteome_file, verbose=False, nm=None):
    """
    Checks the original proteome file for a field in the fasta header
    that is unique to all sequences
    """

    # Some files may have utf8 encoding problems so I used codecs here
    file_handle = codecs.open(proteome_file, "r", "cp1252")
    header_list = []

    header = ""
    for line in file_handle:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            header = line[1:].strip()
            # Store header in list format
            header_list.append(header.split("|"))

    # Get the size of the header fields
    header_field_size = len(header.split("|"))

    for i in range(header_field_size):

        if nm:
            if nm.stop:
                raise KillByUser("")

        temp_list = []
        for header in header_list:
            temp_list.append(header[i])

        if len(temp_list) == len(set(temp_list)) == len(header_list):

            # Note: orthoMCL itself uses 1-based field indices, but the
            # 0-based index is returned here for direct Python list indexing
            if verbose:
                print_col("\t Using unique header field {}".format(i), GREEN,
                          1)
            return i

    # Ideally, a unique field should be found before this code. If not, raise
    #  exception
    raise NoUniqueField("The proteome file {} has no unique field".format(
        os.path.basename(proteome_file)))
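A compact sketch of the uniqueness test above, using two hypothetical headers; field 0 ("sp") repeats, so the first field unique across all sequences is the accession at index 1:

header_list = [h.split("|") for h in
               ["sp|P69905|HBA_HUMAN", "sp|P68871|HBB_HUMAN"]]
for i in range(3):
    column = [h[i] for h in header_list]
    if len(column) == len(set(column)) == len(header_list):
        print(i)  # prints 1
        break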
Example #17
def check_bin_path(bin_path, program):

    # communicate() returns bytes, so compare against bytes prefixes
    prog = {"usearch": b"usearch", "mcl": b"mcl"}

    try:
        res, _ = subprocess.Popen([bin_path, "--version"],
                                  stdout=subprocess.PIPE).communicate()

        if not res.startswith(prog[program]):
            print_col(
                "The {} executable file could not be found".format(program),
                RED, 1)
    except OSError:
        print_col("The {} executable file could not be found".format(program),
                  RED, 1)
Example #18
def check_bin_path(bin_path, program):

    prog = {"usearch": "usearch",
            "mcl": b"mcl"}

    try:
        res, _ = subprocess.Popen([bin_path, "--version"],
                                  stdout=subprocess.PIPE).communicate()

        if not res.startswith(prog[program]):
            print_col("The {} executable file could not be found".format(
                program), RED, 1)
    except OSError:
        print_col("The {} executable file could not be found".format(
            program), RED, 1)
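Hypothetical usage of the check above; each call reports through print_col when the executable is missing or does not announce the expected name in its --version output:

check_bin_path("usearch", "usearch")
check_bin_path("/usr/local/bin/mcl", "mcl")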
Example #19
def check_unique_field(proteome_file, verbose=False, nm=None):
    """
    Checks the original proteome file for a field in the fasta header
    that is unique to all sequences
    """

    # Some files may have utf8 encoding problems so I used codecs here
    file_handle = codecs.open(proteome_file, "r", "cp1252")
    header_list = []

    header = ""
    for line in file_handle:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            header = line[1:].strip()
            # Store header in list format
            header_list.append(header.split("|"))

    # Get the size of the header fields
    header_field_size = len(header.split("|"))

    for i in range(header_field_size):

        if nm:
            if nm.stop:
                raise KillByUser("")

        temp_list = []
        for header in header_list:
            temp_list.append(header[i])

        if len(temp_list) == len(set(temp_list)) == len(header_list):

            # Note: orthoMCL itself uses 1-based field indices, but the
            # 0-based index is returned here for direct Python list indexing
            if verbose:
                print_col("\t Using unique header field {}".format(i), GREEN, 1)
            return i

    # Ideally, a unique field should be found before this code. If not, raise
    #  exception
    raise NoUniqueField("The proteome file {} has no unique field".format(
        os.path.basename(proteome_file)))
Example #20
def export_filtered_groups(inflation_list,
                           group_prefix,
                           gene_t,
                           sp_t,
                           sqldb,
                           db,
                           tmp_dir,
                           dest,
                           nm=None):

    print_col("Exporting filtered groups to protein sequence files", GREEN, 1)

    stats_storage = {}
    groups_obj = OT.MultiGroupsLight(tmp_dir)

    if nm:
        if nm.stop:
            raise KillByUser("")

    for val in inflation_list:
        # Create a directory that will store the results for the current
        # inflation value
        inflation_dir = join(dest, "Orthology_results", "Inflation%s" % val)
        if not os.path.exists(inflation_dir):
            os.makedirs(inflation_dir)

        group_file = join(dest, "Orthology_results",
                          group_prefix + "_%s.txt" % val)

        # Create Group object
        group_obj = OT.GroupLight(group_file, gene_t, sp_t)
        # Add group to the MultiGroups object
        groups_obj.add_group(group_obj)
        # Export filtered groups and return stats to present in the app
        stats = group_obj.basic_group_statistics()
        # Retrieve fasta sequences from the filtered groups
        group_obj.retrieve_sequences(sqldb,
                                     db,
                                     dest=join(inflation_dir, "Orthologs"),
                                     shared_namespace=nm)
        # os.remove(sqldb)
        stats_storage[val] = stats

    return stats_storage, groups_obj
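A hypothetical invocation, assuming the group files written by mcl_groups already exist under dest/Orthology_results (all paths and thresholds are illustrative):

stats, groups = export_filtered_groups(
    inflation_list=["3"], group_prefix="groups", gene_t=100, sp_t=1,
    sqldb=".tmp/group2protein.db", db="goodProteins",
    tmp_dir=".tmp", dest="results")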
Example #21
def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:
        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        unique_id = check_unique_field(proteome, True, nm)

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, nm=nm)

        proteome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
                             ".fasta"

        shutil.move(
            proteome.split(".")[0] + "_mod.fas", join(cf_dir,
                                                      proteome_file_name))
Example #22
def prep_fasta(proteome_file, code, unique_id, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Storing dictionary with header and sequence for later use
    seq_storage = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    file_out = open(proteome_file.split(".")[0] + "_mod.fas", "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                seq_storage["%s|%s" % (code, unique_str)] = ""
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            seq_storage["%s|%s" % (code, unique_str)] += line.strip()
            file_out.write(line)

    # Close file handles:
    file_in.close()
    file_out.close()

    return seq_storage
Example #23
def post_aln_checks(arg, aln_obj):

    if arg.consensus == ["IUPAC"] and "Protein" in aln_obj.sequence_code:
        print_col("'IUPAC' option of the consensus operation can "
                  "only be performed on nucleotide alignments.", RED)
    if arg.codon_filter and "Protein" in aln_obj.sequence_code:
        print_col("The codon filter option (--codon-filter) can only be"
                  " performed on nucleotide alignments.", RED)
    bad_formats = [x for x in arg.output_format
                   if x in ["gphocs", "ima2", "snapp"]]
    if len(aln_obj.sequence_code) > 1 and bad_formats:
        print_col("The following selected output formats can only be used"
                  " with nucleotide sequences: {}".format(
                      ", ".join(bad_formats)), RED)
    if aln_obj.bad_alignments:
        print_col("The following input files could not be read or are empty"
                  ": {}".format(" ".join(aln_obj.bad_alignments)), YELLOW)
    if aln_obj.non_alignments:
        print_col("The following input files have alignments of unequal "
                  "length: {}".format(" ".join(aln_obj.non_alignments)),
                  YELLOW)

    else:
        return 0
Example #24
def main_check():
    """
    Performs sanity checks to argument combinations
    """

    if arg.protein2dna and arg.infile:
        print_col(
            "Group file operations are ignored when specifying "
            "conversion options.", YELLOW, 3)

    # Check if protein and cds data bases are provided when required
    if arg.groups2fasta and not arg.protein_db:
        print_col(
            "A protein database must be provided to convert group "
            "files into sequence files using the --protein-db option. "
            "Exiting.", RED, 3)

    if arg.protein2dna and (not arg.protein_db and not arg.dna_db):
        print_col(
            "A CDS data base and protein sequence files must be "
            "provided to convert protein sequences into nucleotide "
            "sequences using the --cds-db and --protein-db options, "
            "respectively. Exiting.", RED, 3)

    # Print warnings when trying to execute options that are not available to
    # multiple input group files
    if len(arg.infile) > 1:
        if arg.groups2fasta or arg.protein2dna:
            print_col(
                "Conversion options are only available for single "
                "group files input.", YELLOW, 3)

    if arg.groups2fasta and not arg.gn_threshold and not arg.sp_threshold:
        print_col(
            "No filters have been specified for the conversion of "
            "group files into protein sequences. This may result in a "
            "very large number of output files.", YELLOW, 3)
Example #25
def export_filtered_groups(inflation_list, group_prefix, gene_t, sp_t, sqldb,
                           db, tmp_dir, dest, nm=None):

    print_col("Exporting filtered groups to protein sequence files", GREEN, 1)

    stats_storage = {}
    groups_obj = OT.MultiGroupsLight(tmp_dir)

    if nm:
        if nm.stop:
            raise KillByUser("")

    for val in inflation_list:
        # Create a directory that will store the results for the current
        # inflation value
        inflation_dir = join(dest, "Orthology_results", "Inflation%s" % val)
        if not os.path.exists(inflation_dir):
            os.makedirs(inflation_dir)

        group_file = join(dest, "Orthology_results",
                          group_prefix + "_%s.txt" % val)

        # Create Group object
        group_obj = OT.GroupLight(group_file, gene_t, sp_t)
        # Add group to the MultiGroups object
        groups_obj.add_group(group_obj)
        # Export filtered groups and return stats to present in the app
        stats = group_obj.basic_group_statistics()
        # Retrieve fasta sequences from the filtered groups
        group_obj.retrieve_sequences(sqldb, db,
                                     dest=join(inflation_dir, "Orthologs"),
                                     shared_namespace=nm)
        # os.remove(sqldb)
        stats_storage[val] = stats

    return stats_storage, groups_obj
Example #26
def post_aln_checks(arg, aln_obj):

    if arg.consensus == ["IUPAC"] and aln_obj.sequence_code[0] != "DNA":
        print_col("'IUPAC' option of the consensus operation can "
                  "only be performed on nucleotide alignments.", RED)
    if arg.codon_filter and aln_obj.sequence_code[0] != "DNA":
        print_col("The codon filter option (--codon-filter) can only be"
                  " performed on nucleotide alignments.", RED)
    if aln_obj.bad_alignments:
        print_col("The following input files could not be read or are empty"
                  ": {}".format(" ".join(aln_obj.bad_alignments)), YELLOW)
    if aln_obj.non_alignments:
        print_col("The following input files have alignments of unequal "
                  "length: {}".format(" ".join(aln_obj.non_alignments)),
                  YELLOW)

    else:
        return 0
Example #27
def main():

    print_col(
        "Executing TriOrtho module at %s %s" %
        (time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 3)

    # Create tmp dir
    os.makedirs(".tmp")

    # Arguments
    groups_file = arg.infile
    output_dir = arg.output_dir

    # Create output directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if arg.protein2dna:
        print_col("Converting protein sequences into nucleotide sequences",
                  GREEN, 3)
        # Create database
        print_col("Creating database", GREEN, 3)
        id_db = protein2dna.create_db(arg.dna_db, ".tmp")
        # Create query for USEARCH
        print_col("Creating query", GREEN, 3)
        query_db = protein2dna.create_query(arg.protein_db, ".tmp")
        # Execute search
        print_col("Executing search", GREEN, 3)
        protein2dna.pair_search(".tmp")
        pair_db = protein2dna.get_pairs(".tmp")
        # Convert files
        print_col("Converting files", GREEN, 3)
        protein2dna.convert_protein_file(pair_db, query_db, id_db, output_dir)
        return print_col("Protein to nucleotide conversion complete", GREEN, 3)

    gene_threshold = arg.gn_threshold
    species_threshold = arg.sp_threshold
    protein_db = arg.protein_db

    if len(groups_file) == 1:

        print_col("Parsing group file", GREEN, 3)
        group_file = groups_file[0]
        group_object = OT.GroupLight(group_file, gene_threshold,
                                     species_threshold)

        # Check for plotting options
        if arg.plots:

            plt_methods = {
                "2": [
                    group_object.bar_species_distribution,
                    "Species distribution"
                ],
                "3":
                [group_object.bar_species_coverage, "Species data coverage"],
                "4": [
                    group_object.bar_genecopy_per_species,
                    "Gene copies per species"
                ],
                "5": [
                    group_object.bar_genecopy_distribution,
                    "Gene copy distribution"
                ]
            }

            for i in arg.plots:
                if i == "1":
                    print_col(
                        "Plotting option 1 requires multiple group "
                        "files. Skipping.", YELLOW, 3)
                    continue

                # Generate plot data and file
                print_col("Generating plot for %s" % plt_methods[i][1], GREEN,
                          3)
                plot_obj, _, table = plt_methods[i][0](dest=output_dir)

        # Export filtered group file
        if arg.export:
            print_col(
                "Exporting filtered group file using %s maximum gene "
                "copies and %s minimum taxa representation" %
                (gene_threshold, species_threshold), GREEN, 3)
            group_object.export_filtered_group(dest=output_dir)
            print_col(
                "Filtering complete.\nTotal orthologs: %s;\nAfter gene "
                "filter: %s;\nAfter species filter: %s;\nAfter both "
                "filters: %s" % (len(group_object.species_frequency),
                                 group_object.num_gene_compliant,
                                 group_object.num_species_compliant,
                                 group_object.all_compliant), GREEN, 3)

        if arg.groups2fasta:
            print_col("Exporting group file as protein sequence files", GREEN,
                      3)
            # Set sqlite file
            sqldb = join(".tmp", "group2protein.db")
            group_object.retrieve_sequences(sqldb, protein_db, output_dir)

    else:
        print_col("Parsing %s group files" % len(groups_file), GREEN, 3)
        multiple_groups_object = OT.MultiGroupsLight(".tmp", groups_file,
                                                     gene_threshold,
                                                     species_threshold)

        if arg.plots:

            for i in arg.plots:

                if i != "1":
                    print_col(
                        "Plotting option %s requires a single group "
                        "file as input. Skipping." % str(i), YELLOW, 3)
                    continue

                print_col("Generating plot for Multiple group comparison",
                          GREEN, 3)
                multiple_groups_object.update_filters(gene_threshold,
                                                      species_threshold)
                multiple_groups_object.bar_orthologs(dest=output_dir)

        if arg.export:

            for gname, gobj in multiple_groups_object:

                gname = os.path.basename(gname)
                output_file = os.path.splitext(gname)[0] + "_filtered.txt"

                print_col(
                    "Exporting group file %s using %s maximum gene "
                    "copies and %s minimum taxa representation" %
                    (gname, gene_threshold, species_threshold), GREEN, 3)
                gobj.export_filtered_group(output_file_name=output_file,
                                           dest=output_dir)
                print_col(
                    "Filtering complete for group file %s.\nTotal "
                    "orthologs: %s;\nAfter gene filter: %s;\nAfter "
                    "species filter: %s;\nAfter both filters: %s" %
                    (gname, len(
                        gobj.species_frequency), gobj.num_gene_compliant,
                     gobj.num_species_compliant, gobj.all_compliant), GREEN, 3)
Example #28
def main():

    # The inclusion of the argument definition in main, makes it possible to
    # import this file as a module and not triggering argparse. The
    # alternative of using a if __name__ == "__main__" statement does not
    # work well with the entry_points parameter of setup.py, since they call
    # the main function but do nothing inside said statement.
    parser = argparse.ArgumentParser(description="Command line interface for "
                                     "TriFusion Orthology search module")

    parser.add_argument("-in",
                        dest="infile",
                        type=str,
                        required=True,
                        help="Provide the path "
                        "to the directory containing the proteome files")

    # Execution modes
    exec_modes = parser.add_argument_group("Execution modes")
    exec_modes.add_argument("-n",
                            action="store_const",
                            const=True,
                            dest="normal",
                            help="Complete run of the pipeline")
    exec_modes.add_argument("-a",
                            action="store_const",
                            const=True,
                            dest="adjust",
                            help="Only adjust proteome fasta files")
    exec_modes.add_argument("-na",
                            action="store_const",
                            const=True,
                            dest="no_adjust",
                            help="Complete run of the pipeline without "
                            "adjusting fasta files")

    # Input formatting
    input_format = parser.add_argument_group("Input formatting")
    input_format.add_argument("-d",
                              action="store_const",
                              const=True,
                              dest="code",
                              help="Do not convert input proteome"
                              " file names because the file names are already "
                              "in code (e.g. Homo_sapiens.fas -> HoSap.fas")
    input_format.add_argument("-sep",
                              dest="separator",
                              help="Specify the "
                              "separator in the input files (e.g. '_' is the"
                              " separator in 'Homo_sapiens.fas'). This "
                              "parameter is ignored if the '-d' option is set")

    # Search options
    search_opts = parser.add_argument_group("Ortholog search options")
    search_opts.add_argument("--usearch",
                             dest="usearch_bin",
                             default="usearch",
                             help="Provide the path to the USEARCH executable."
                             " If the executable is already in your "
                             "PATH environment variable, specify only"
                             " the name of the executable (default is "
                             "'%(default)s')")
    search_opts.add_argument("--mcl",
                             dest="mcl_bin",
                             default="mcl",
                             help="Provide the path to the MCL executable."
                             " If the executable is already in your "
                             "PATH environment variable, specify only"
                             " the name of the executable (default is "
                             "'%(default)s')")
    search_opts.add_argument(
        "--min-length",
        dest="min_length",
        type=int,
        default=10,
        help="Set minimum length allowed "
        "for protein sequences (default is '%(default)s')")
    search_opts.add_argument("--max-stop",
                             dest="max_stop",
                             type=int,
                             default=20,
                             help="Set maximum percentage of "
                             "stop codons in protein sequences (default is "
                             "'%(default)s')")
    search_opts.add_argument("--db",
                             dest="database",
                             default="goodProteins",
                             help="Name of search "
                             "database (default is '%(default)s')")
    search_opts.add_argument("--search-out",
                             dest="search_out",
                             default="AllVsAll.out",
                             help="Name of the "
                             "search output file containing the All-vs-All "
                             "protein comparisons")
    search_opts.add_argument("-evalue",
                             dest="evalue",
                             default=1E-5,
                             help="Set the e-value cut off for search "
                             "operation (default is '%(default)s')")
    search_opts.add_argument("-inflation",
                             dest="inflation",
                             nargs="+",
                             default=["3"],
                             choices=[str(x) for x in range(1, 6)],
                             help="Set inflation values for ortholog group"
                             " clustering. Multiple values may be provided "
                             "but values are limited to the range [1, 5]")

    # Output options
    output_opts = parser.add_argument_group("Output options")
    output_opts.add_argument("-o",
                             dest="output_dir",
                             default=os.getcwd(),
                             help="Output directory")
    output_opts.add_argument("-prefix",
                             dest="prefix",
                             default="Ortholog",
                             help="Set the prefix name for each ortholog "
                             "cluster (default is '%(default)s')")
    output_opts.add_argument("-id",
                             dest="id_num",
                             type=int,
                             default=1,
                             help="Set the starting number for the ortholog "
                             "clusters (default is '%(default)s')")
    output_opts.add_argument("--groups-file",
                             dest="groups_file",
                             default="groups",
                             help="Set the name of the "
                             "group files from the output of MCL (default is "
                             "'%(default)s')")
    output_opts.add_argument("--min-species",
                             dest="min_sp",
                             default=1,
                             type=float,
                             help="Set the minimum number of "
                             "species required for an ortholog cluster to be "
                             "converted into protein sequence. This option "
                             "will only affect the protein sequence files, "
                             "not the group file output.")
    output_opts.add_argument("--max-gene-copy",
                             dest="max_gn",
                             default=100,
                             type=int,
                             help="Set the maximum number of gene "
                             "copies from the same taxon for each ortholog "
                             "cluster. This option will only affect the "
                             "protein sequence files, not the group file "
                             "output.")

    # Miscellaneous options
    misc_options = parser.add_argument_group("Miscellaneous options")
    misc_options.add_argument("-np",
                              dest="cpus",
                              default=1,
                              help="Number of "
                              "CPUs to be used during search operation ("
                              "default is '%(default)s')")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    arg = parser.parse_args()

    # Create temp directory
    tmp_dir = join(os.getcwd(), ".tmp")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    print_col(
        "Executing OrthoMCL pipeline at %s %s" %
        (time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 1)

    try:
        start_time = time.time()

        # Arguments
        input_dir = arg.infile
        output_dir = arg.output_dir
        # name_separator = arg.separator
        min_length = arg.min_length
        max_percent_stop = arg.max_stop
        usearch_bin = arg.usearch_bin
        mcl_bin = arg.mcl_bin
        database_name = join(os.getcwd(), output_dir, "backstage_files",
                             arg.database)
        usearch_out_name = arg.search_out
        evalue_cutoff = arg.evalue
        cpus = arg.cpus
        inflation = arg.inflation
        prefix = arg.prefix
        start_id = arg.id_num
        groups_file = arg.groups_file
        min_sp = arg.min_sp
        max_gn = arg.max_gn

        # Check USEARCH bin
        check_bin_path(usearch_bin, "usearch")
        # Check MCL bin
        check_bin_path(mcl_bin, "mcl")

        sql_path = join(tmp_dir, "sqldb.db")

        # Get proteome files
        if not os.path.exists(input_dir):
            print_col(
                "The input directory %s does not exist. Exiting." % input_dir,
                RED, 1)

        proteome_files = [
            abspath(join(input_dir, x)) for x in os.listdir(input_dir)
        ]

        # Create and change working directory
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # os.chdir(output_dir)

        # Create directory that will store intermediate files during orthology
        # search
        int_dir = join(output_dir, "backstage_files")
        if not os.path.exists(int_dir):
            os.makedirs(int_dir)

        if arg.normal:
            install_schema(tmp_dir)
            adjust_fasta(proteome_files, output_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name,
                             evalue_cutoff,
                             output_dir,
                             cpus,
                             usearch_out_name,
                             usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)

        elif arg.adjust:
            adjust_fasta(proteome_files, output_dir)

        elif arg.no_adjust:
            install_schema(tmp_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name,
                             evalue_cutoff,
                             output_dir,
                             cpus,
                             usearch_out_name,
                             usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)

        print_col(
            "OrthoMCL pipeline execution successfully completed in %s "
            "seconds" % (round(time.time() - start_time, 2)), GREEN, 1)

        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
    except Exception as e:
        print(str(e))
        traceback.print_exc()
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        print_col("Program exited with errors!", RED, 1)
Example #29
def pairs(db_dir, nm=None):

    print_col("Finding pairs for orthoMCL", GREEN, 1)

    make_pairs_sqlite.execute(db_dir, nm=nm)
Example #30
def dump_pairs(db_dir, dest, nm=None):

    print_col("Dump files from the database produced by the orthomclPairs "
              "program", GREEN, 1)

    dump_pairs_sqlite.execute(db_dir, dest, nm=nm)
Example #31
def pairs(db_dir, nm=None):

    print_col("Finding pairs for orthoMCL", GREEN, 1)

    make_pairs_sqlite.execute(db_dir, nm=nm)
Example #32
def check_dirs(dir_path):

    if not os.path.exists(dir_path):
        print_col("The following path does not exist: {}".format(dir_path),
                  RED, 1)
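Hypothetical usage; the call only reports (via print_col) when the path does not exist:

check_dirs("/path/to/proteomes")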
Example #33
def main():

    # The inclusion of the argument definition in main, makes it possible to
    # import this file as a module and not triggering argparse. The
    # alternative of using a if __name__ == "__main__" statement does not
    # work well with the entry_points parameter of setup.py, since they call
    # the main function but do nothing inside said statement.
    parser = argparse.ArgumentParser(description="Command line interface for "
        "TriFusion Orthology search module")

    parser.add_argument("-in", dest="infile", type=str,
                        help="Provide the path "
                        "to the directory containing the proteome files")

    # Execution modes
    exec_modes = parser.add_argument_group("Execution modes")
    exec_modes.add_argument("-n", action="store_const", const=True,
                            dest="normal",
                            help="Complete run of the pipeline")
    exec_modes.add_argument("-a", action="store_const", const=True,
                            dest="adjust",
                            help="Only adjust proteome fasta files")
    exec_modes.add_argument("-na", action="store_const", const=True,
                            dest="no_adjust",
                            help="Complete run of the pipeline without "
                                 "adjusting fasta files")

    # Input formatting
    input_format = parser.add_argument_group("Input formatting")
    input_format.add_argument("-d", action="store_const", const=True,
                              dest="code", help="Do not convert input proteome"
                              " file names because the file names are already "
                              "in code (e.g. Homo_sapiens.fas -> HoSap.fas")
    input_format.add_argument("-sep", dest="separator", help="Specify the "
                              "separator in the input files (e.g. '_' is the"
                              " separator in 'Homo_sapiens.fas'). This "
                              "parameter is ignored if the '-d' option is set")

    # Search options
    search_opts = parser.add_argument_group("Ortholog search options")
    search_opts.add_argument("--usearch", dest="usearch_bin",
                             default="usearch",
                             help="Provide the path to the USEARCH executable."
                                  " If the executable is already in your "
                                  "PATH environment variable, specify only"
                                  " the name of the executable (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--mcl", dest="mcl_bin", default="mcl",
                             help="Provide the path to the MCL executable."
                                  " If the executable is already in your "
                                  "PATH environment variable, specify only"
                                  " the name of the executable (default is "
                                  "'%(default)s')")
    search_opts.add_argument("--min-length", dest="min_length", type=int,
                             default=10, help="Set minimum length allowed "
                             "for protein sequences (default is '%(default)s')")
    search_opts.add_argument("--max-stop", dest="max_stop", type=int,
                             default=20, help="Set maximum percentage of "
                             "stop codons in protein sequences (default is "
                             "'%(default)s')")
    search_opts.add_argument("--db", dest="database",
                             default="goodProteins", help="Name of search "
                             "database (default is '%(default)s')")
    search_opts.add_argument("--search-out", dest="search_out",
                             default="AllVsAll.out", help="Name of the "
                             "search output file containing the All-vs-All "
                             "protein comparisons")
    search_opts.add_argument("-evalue", dest="evalue", default=1E-5,
                             help="Set the e-value cut off for search "
                             "operation (default is '%(default)s')")
    search_opts.add_argument("-inflation", dest="inflation", nargs="+",
                             default=["3"],
                             choices=[str(x) for x in range(1, 6)],
                             help="Set inflation values for ortholog group"
                             " clustering. Multiple values may be provided "
                             "but values are limited to the range [1, 5]")

    # Output options
    output_opts = parser.add_argument_group("Output options")
    output_opts.add_argument("-o", dest="output_dir", default=os.getcwd(),
                             help="Output directory")
    output_opts.add_argument("-prefix", dest="prefix", default="Ortholog",
                             help="Set the prefix name for each ortholog "
                             "cluster (default is '%(default)s')")
    output_opts.add_argument("-id", dest="id_num", type=int, default=1,
                             help="Set the starting number for the ortholog "
                             "clusters (default is '%(default)s')")
    output_opts.add_argument("--groups-file", dest="groups_file",
                             default="groups", help="Set the name of the "
                             "group files from the output of MCL (default is "
                             "'%(default)s')")
    output_opts.add_argument("--min-species", dest="min_sp", default=1,
                             type=float, help="Set the minimum number of "
                             "species required for an ortholog cluster to be "
                             "converted into protein sequence. This option "
                             "will only affect the protein sequence files, "
                             "not the group file output.")
    output_opts.add_argument("--max-gene-copy", dest="max_gn", default=100,
                             type=int, help="Set the maximum number of gene "
                             "copies from the same taxon for each ortholog "
                             "cluster. This option will only affect the "
                             "protein sequence files, not the group file "
                             "output.")

    # Miscellaneous options
    misc_options = parser.add_argument_group("Miscellaneous options")
    misc_options.add_argument("-np", dest="cpus", default=1, help="Number of "
                              "CPUs to be used during search operation ("
                              "default is '%(default)s')")
    misc_options.add_argument("-v", "--version", dest="version",
                              action="store_const", const=True,
                              help="Displays software version")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    arg = parser.parse_args()

    if arg.version:
        print(__version__)
        sys.exit(1)

    # Create temp directory
    tmp_dir = join(os.getcwd(), ".tmp")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    print_col("Executing OrthoMCL pipeline at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 1)

    try:
        start_time = time.time()

        # Arguments
        input_dir = os.path.abspath(arg.infile)
        check_dirs(input_dir)
        output_dir = os.path.abspath(arg.output_dir)

        # name_separator = arg.separator
        min_length = arg.min_length
        max_percent_stop = arg.max_stop
        usearch_bin = arg.usearch_bin
        mcl_bin = arg.mcl_bin
        database_name = join(os.getcwd(), output_dir, "backstage_files",
                             arg.database)
        usearch_out_name = arg.search_out
        evalue_cutoff = arg.evalue
        cpus = arg.cpus
        inflation = arg.inflation
        prefix = arg.prefix
        start_id = arg.id_num
        groups_file = arg.groups_file
        min_sp = arg.min_sp
        max_gn = arg.max_gn

        # Check USEARCH bin
        check_bin_path(usearch_bin, "usearch")
        # Check MCL bin
        check_bin_path(mcl_bin, "mcl")

        sql_path = join(tmp_dir, "sqldb.db")

        # Get proteome files
        if not os.path.exists(input_dir):
            print_col("The input directory %s does not exist. Exiting." %
                      input_dir, RED, 1)

        proteome_files = [abspath(join(input_dir, x)) for x in os.listdir(
            input_dir)]

        # Create and change working directory
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # os.chdir(output_dir)

        # Create directory that will store intermediate files during orthology
        # search
        int_dir = join(output_dir, "backstage_files")
        if not os.path.exists(int_dir):
            os.makedirs(int_dir)

        if arg.normal:
            install_schema(tmp_dir)
            adjust_fasta(proteome_files, output_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
                             usearch_out_name, usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)

        elif arg.adjust:
            adjust_fasta(proteome_files, output_dir)

        elif arg.no_adjust:
            install_schema(tmp_dir)
            filter_fasta(min_length, max_percent_stop, database_name,
                         output_dir)
            allvsall_usearch(database_name, evalue_cutoff, output_dir, cpus,
                             usearch_out_name, usearch_bin=usearch_bin)
            blast_parser(usearch_out_name, output_dir, tmp_dir, None)
            pairs(tmp_dir)
            dump_pairs(tmp_dir, output_dir)
            mcl(inflation, output_dir, mcl_file=mcl_bin)
            mcl_groups(inflation, prefix, start_id, groups_file, output_dir)
            export_filtered_groups(inflation, groups_file, max_gn, min_sp,
                                   sql_path, database_name, tmp_dir,
                                   output_dir)

        print_col("OrthoMCL pipeline execution successfully completed in %s "
                  "seconds" % (round(time.time() - start_time, 2)), GREEN, 1)

        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
    except Exception as e:
        print(str(e))
        traceback.print_exc()
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        print_col("Program exited with errors!", RED, 1)
Example #34
def main_checks(arg):

    if not arg.infile and not arg.generate_cfg:
        print_col("Must provide input data using the '-in' option", RED, 2)
Example #35
def stats_main(args):

    print_col("Executing TriStats module at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 2)

    if args.generate_cfg:
        print_col("Generating configuration template file", GREEN, 2)
        return generate_cfg_template()

    # Create temporary directory
    tmp_dir = ".trifusion-temp"
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Set path to temporary sqlite database
    sql_db = os.path.join(tmp_dir, "trifusion.db")

    # Arguments
    input_files = args.infile
    output_dir = args.project_name
    config_file = args.config_file

    # Read configuration file
    print_col("Reading configuration file", GREEN, 2)
    settings = configparser.ConfigParser()
    settings.read(config_file)

    # Parse alignments
    # Support wildcards as arguments for windows
    fl = []
    if sys.platform in ["win32", "cygwin"]:
        for p in input_files:
            fl += glob(p)
        input_files = fl

    print_col("Parsing %s alignments" % len(input_files), GREEN, 2)
    alignments = AlignmentList(input_files, sql_db=sql_db)

    # Create output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Variable mapping each available option with the appropriate statistics
    # and plotting methods
    func_map = {
        ("general information", "distribution_sequence_size", "species"):
            [alignments.average_seqsize_per_species,
             (box_plot, "avg_seqsize_species.png")],

        ("general information", "distribution_sequence_size", "average"):
            [alignments.average_seqsize,
             (histogram_plot, "avg_seqsize.png")],

        ("general information", "proportion_nucleotides_residues", "species"):
            [alignments.characters_proportion_per_species,
             (stacked_bar_plot, "char_proportions_sp.png")],

        ("general information", "proportion_nucleotides_residues", "average"):
            [alignments.characters_proportion,
             (bar_plot, "char_proportions.png")],

        ("general information", "distribution_taxa_frequency", "average"):
            [alignments.taxa_distribution,
             (histogram_plot, "distribution_taxa_frequency.png")],

        ("polymorphism and variation", "sequence_similarity", "species"):
            [alignments.sequence_similarity_per_species,
             (triangular_heat, "similarity_distribution_sp.png")],

        ("polymorphism and variation", "sequence_similarity", "average"):
            [alignments.sequence_similarity,
             (histogram_plot, "similarity_distribution.png")],

        ("polymorphism and variation", "sequence_similarity", "gene"):
            [alignments.sequence_similarity_gene,
             (sliding_window, "similarity_distribution_gn.png")],

        ("polymorphism and variation", "segregating_sites", "species"):
            [alignments.sequence_segregation_per_species,
             (triangular_heat, "segregating_sites_sp.png")],

        ("polymorphism and variation", "segregating_sites", "average"):
            [alignments.sequence_segregation,
             (histogram_plot, "segregating_sites.png")],

        ("polymorphism and variation", "segregating_sites", "gene"):
            [alignments.sequence_segregation_gene,
             (sliding_window, "segregating_sites_gn.png")],

        ("polymorphism and variation", "alignment_pol_correlation", "average"):
            [alignments.length_polymorphism_correlation,
             (scatter_plot, "length_polymorphism_correlation.png")],

        ("polymorphism and variation", "allele_frequency_spectrum", "average"):
            [alignments.allele_frequency_spectrum,
             (histogram_plot, "allele_frequency_spectrum.png")],

        ("polymorphism and variation", "allele_frequency_spectrum", "gene"):
            [alignments.allele_frequency_spectrum_gene,
             (histogram_plot, "allele_frequency_spectrum_gn.png")],

        ("missing data", "gene_occupancy", "average"):
            [alignments.gene_occupancy,
             (interpolation_plot, "gene_occupancy.png")],

        ("missing data", "distribution_missing_genes", "species"):
            [alignments.missing_genes_per_species,
             (bar_plot, "missing_gene_distribution.png")],

        ("missing data", "distribution_missing_genes", "average"):
            [alignments.missing_genes_average,
             (histogram_plot, "missing_gene_distribution_avg.png")],

        ("missing data", "distribution_missing_data", "species"):
            [alignments.missing_data_per_species,
             (stacked_bar_plot, "missing_data_distribution_sp.png")],

        ("missing data", "distribution_missing_data", "average"):
            [alignments.missing_data_distribution,
             (histogram_smooth, "missing_data_distribution.png")],

        ("missing data", "cumulative_distribution_missing_genes", "average"):
            [alignments.cumulative_missing_genes,
             (bar_plot, "cumulative_distribution_missing_genes.png")],

        ("outlier detection", "missing_data_outliers", "species"):
            [alignments.outlier_missing_data_sp,
             (outlier_densisty_dist, "Missing_data_outliers_sp.png")],

        ("outlier detection", "missing_data_outliers", "average"):
            [alignments.outlier_missing_data,
             (outlier_densisty_dist, "Missing_data_outliers.png")],

        ("outlier detection", "segregating_sites_outliers", "species"):
            [alignments.outlier_segregating_sp,
             (outlier_densisty_dist, "Segregating_sites_outliers_sp.png")],

        ("outlier detection", "segregating_sites_outliers", "average"):
            [alignments.outlier_segregating,
             (outlier_densisty_dist, "Segregating_sites_outliers.png")],

        ("outlier detection", "sequence_size_outliers", "species"):
            [alignments.outlier_sequence_size_sp,
             (outlier_densisty_dist, "Sequence_size_outliers_sp.png")],

        ("outlier detection", "sequence_size_outliers", "average"):
            [alignments.outlier_sequence_size,
             (outlier_densisty_dist, "Sequence_size_outliers.png")]
    }

    print_col("Parsing configuation file options", GREEN, 2)

    # Iterate over each individual option
    for section in settings.sections():
        for option, val in settings.items(section):
            for i in val.split():

                section = section.lower()
                # Check if current option is available or supported
                if (section, option, i) in func_map:
                    print_col("Generating plot for option: %s - %s - %s" %
                              (section, option, i), GREEN, 2)
                    # Get appropriate method list
                    funcs = func_map[(section, option, i)]
                    # Retrieve plot data using statistics method
                    plot_data = funcs[0]()

                    # Check for exceptions in plot data
                    if "exception" in plot_data:
                        if plot_data["exception"] is EmptyData:
                            print_col("Option %s - %s - %s has no data for "
                                      "plotting" % (section, option, i),
                                      YELLOW, 2)
                        if plot_data["exception"] is InvalidSequenceType:
                            print_col("Invalid sequence type for option %s - "
                                      "%s - %s (%s)" %
                                      (section, option, i,
                                       alignments.sequence_code[0]), YELLOW, 2)
                        continue

                    # Generate plot object
                    plot_obj, _, lgd = funcs[1][0](**plot_data)
                    plot_obj.tight_layout()

                    # Save plot to file, including the legend object, if
                    # available
                    if lgd:
                        plot_obj.savefig(join(output_dir, funcs[1][1]),
                                         bbox_extra_artists=(lgd,), dpi=200)
                    else:
                        plot_obj.savefig(join(output_dir, funcs[1][1]),
                                         dpi=200)
                else:
                    print_col("Invalid option: %s - %s - %s. Skipping." %
                              (section, option, i), YELLOW, 2)
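
The option triples checked against `func_map` come straight from the configuration file: each section name (lowercased) is the first element, each option name the second, and every whitespace-separated value the third. A minimal sketch of a configuration that would match two of the mappings above, using the same spelling as the `func_map` keys (the file contents are hypothetical):

import configparser

# Hypothetical TriStats configuration covering two func_map entries.
cfg = """
[General Information]
distribution_sequence_size = species average

[Missing Data]
gene_occupancy = average
"""

settings = configparser.ConfigParser()
settings.read_string(cfg)

for section in settings.sections():
    for option, val in settings.items(section):
        for scope in val.split():
            # Reproduces the (section, option, scope) keys used above
            print((section.lower(), option, scope))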
Example #36
def main():

    print_col("Executing TriOrtho module at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, 3)

    # Create tmp dir (os.makedirs raises if the directory already exists)
    if not os.path.exists(".tmp"):
        os.makedirs(".tmp")

    # Arguments
    groups_file = arg.infile
    output_dir = arg.output_dir

    # Create output directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if arg.protein2dna:
        print_col("Converting protein sequences into nucleotide sequences",
                  GREEN, 3)
        # Create database
        print_col("Creating database", GREEN, 3)
        id_db = protein2dna.create_db(arg.dna_db, ".tmp")
        # Create query for USEARCH
        print_col("Creating query", GREEN, 3)
        query_db = protein2dna.create_query(arg.protein_db, ".tmp")
        # Execute search
        print_col("Executing search", GREEN, 3)
        protein2dna.pair_search(".tmp")
        pair_db = protein2dna.get_pairs(".tmp")
        # Convert files
        print_col("Converting files", GREEN, 3)
        protein2dna.convert_protein_file(pair_db, query_db, id_db, output_dir)
        return print_col("Protein to nucleotide conversion complete", GREEN, 3)

    gene_threshold = arg.gn_threshold
    species_threshold = arg.sp_threshold
    protein_db = arg.protein_db

    if len(groups_file) == 1:

        print_col("Parsing group file", GREEN, 3)
        group_file = groups_file[0]
        group_object = OT.GroupLight(group_file, gene_threshold,
                                     species_threshold)

        # Check for plotting options
        if arg.plots:

            plt_methods = {"2": [group_object.bar_species_distribution,
                                 "Species distribution"],
                           "3": [group_object.bar_species_coverage,
                                 "Species data coverage"],
                           "4": [group_object.bar_genecopy_per_species,
                                 "Gene copies per species"],
                           "5": [group_object.bar_genecopy_distribution,
                                 "Gene copy distribution"]}

            for i in arg.plots:
                if i == "1":
                    print_col("Plotting option 1 requires multiple group "
                              "files. Skipping.", YELLOW, 3)
                    continue

                # Generate plot data and file
                print_col("Generating plot for %s" % plt_methods[i][1],
                          GREEN, 3)
                plot_obj, _, table = plt_methods[i][0](dest=output_dir)

        # Export filtered group file
        if arg.export:
            print_col("Exporting filtered group file using %s maximum gene "
                      "copies and %s minimum taxa representation" %
                      (gene_threshold, species_threshold), GREEN, 3)
            group_object.export_filtered_group(dest=output_dir)
            print_col("Filtering complete.\nTotal orthologs: %s;\nAfter gene "
                      "filter: %s;\nAfter species filter: %s;\nAfter both "
                      "filters: %s" % (len(group_object.species_frequency),
                                       group_object.num_gene_compliant,
                                       group_object.num_species_compliant,
                                       group_object.all_compliant), GREEN, 3)

        if arg.groups2fasta:
            print_col("Exporting group file as protein sequence files",
                      GREEN, 3)
            # Set sqlite file
            sqldb = join(".tmp", "group2protein.db")
            group_object.retrieve_sequences(sqldb, protein_db, output_dir)

    else:
        print_col("Parsing %s group files" % len(groups_file), GREEN, 3)
        multiple_groups_object = OT.MultiGroupsLight(".tmp",
                                                     groups_file,
                                                     gene_threshold,
                                                     species_threshold)

        if arg.plots:

            for i in arg.plots:

                if i != "1":
                    print_col("Plotting option %s requires a single group "
                              "file as input. Skipping." % str(i), YELLOW, 3)
                    continue

                print_col("Generating plot for Multiple group comparison",
                          GREEN, 3)
                multiple_groups_object.update_filters(gene_threshold,
                                                      species_threshold)
                multiple_groups_object.bar_orthologs(dest=output_dir)

        if arg.export:

            for gname, gobj in multiple_groups_object:

                gname = os.path.basename(gname)
                output_file = os.path.splitext(gname)[0] + "_filtered.txt"

                print_col("Exporting group file %s using %s maximum gene "
                          "copies and %s minimum taxa representation" %
                          (gname, gene_threshold, species_threshold), GREEN, 3)
                gobj.export_filtered_group(output_file_name=output_file,
                                           dest=output_dir)
                print_col("Filtering complete for group file %s.\nTotal "
                          "orthologs: %s;\nAfter gene filter: %s;\nAfter "
                          "species filter: %s;\nAfter both filters: %s" %
                          (gname,
                           len(gobj.species_frequency),
                           gobj.num_gene_compliant,
                           gobj.num_species_compliant,
                           gobj.all_compliant), GREEN, 3)
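
For reference, a minimal sketch of driving the single-group branch above programmatically; the group file name and filter thresholds are hypothetical:

# Keep orthologs with at most 1 gene copy per species and present
# in at least 10 species (illustrative thresholds).
group = OT.GroupLight("groups_2.5.txt", 1, 10)
group.export_filtered_group(dest="Orthology_results")
print("Total orthologs: %s" % len(group.species_frequency))
print("After gene filter: %s" % group.num_gene_compliant)
print("After species filter: %s" % group.num_species_compliant)
print("After both filters: %s" % group.all_compliant)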
Example #37
def triseq_arg_check(arg):

    if arg.gcoder and "nexus" not in arg.output_format:
        print_col("Gap coding can only be performed for Nexus output format.",
                  RED)

    if arg.gcoder and "nexus" in arg.output_format and \
            arg.output_format != ["nexus"]:
        print_col("Gap coding can only be performed for Nexus output format."
                  " This operation will be ignored for other output formats.",
                  YELLOW, quiet=arg.quiet)

    if arg.conversion and arg.reverse:
        print_col("Ignoring conversion flag (-c) when specifying reverse"
                  " concatenation (-r)", YELLOW, quiet=arg.quiet)

    if arg.outfile and arg.reverse:
        print_col("Ignoring output file option (-o) when specifying reverse"
                  " concatenation (-r)", YELLOW, quiet=arg.quiet)

    if arg.partition_file is not None and arg.outfile is None:
        print_col("An output file must be provided with option '-o'", RED)

    if "ima2" in arg.output_format and arg.ima2_params is None:
        print_col("Additional arguments must be provided with the"
                  " option --ima2-params when selecting ima2 output format",
                  RED)

    if "ima2" in arg.output_format and len(arg.ima2_params) != 4:
        print_col("Four additional arguments must be provided with"
                  " option --ima2-params when selecting the "
                  "ima2 output format. %s were given" %
                  (len(arg.ima2_params)), RED)

    if arg.partition_file is not None:
        return 0

    if arg.conversion is None and arg.outfile is None and arg.reverse is None\
            and arg.select is None and arg.get_taxa is False:

        print_col(
            "If you wish to concatenate provide the output file name using "
            "the '-o' option. If you wish to convert a "
            "file, specify it using the '-c' option", RED)

    if len(arg.infile) == 1 and arg.conversion is None and arg.reverse is None\
            and arg.collapse is None:

        print_col(
            "Cannot perform concatenation of a single file. Please provide"
            " additional files to concatenate, or specify the conversion "
            "'-c' option", RED)

    if arg.zorro is not None and len(arg.infile) == 1:
        print_col(
            "The '-z' option cannot be invoked when only a single input "
            "file is provided. This option is reserved for"
            " concatenation of multiple alignment files", RED)

    if arg.consensus and arg.output_format != ["fasta"]:
        print_col("Output format must be only Fasta when using the "
                  "consensus option", RED)

    if not arg.consensus and arg.consensus_single:
        print_col("Ignoring consensus single file option (--consensus-single-"
                  "file) when the consensus operation is not specified",
                  YELLOW, quiet=arg.quiet)

    return 0
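
A hedged usage sketch for the checks above, building an argparse-style namespace by hand (every attribute the function reads must be present; all values here are hypothetical). With gap coding requested alongside a second output format, the YELLOW warning path fires and the function still returns 0:

from argparse import Namespace

arg = Namespace(gcoder=True, output_format=["nexus", "fasta"],
                conversion=True, reverse=None, outfile=None,
                partition_file=None, ima2_params=None, infile=["a.fas"],
                zorro=None, consensus=None, consensus_single=False,
                collapse=None, select=None, get_taxa=False, quiet=False)
triseq_arg_check(arg)  # warns about non-Nexus formats, then returns 0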
Example #40
File: TriSeq.py Project: avicens/TriFusion
def main_parser(arg, alignment_list):
    """ Function with the main operations of TriSeq """

    print_col("Executing TriSeq module at %s %s" %
              (time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")),
              GREEN,
              quiet=arg.quiet)

    # Create temp directory
    tmp_dir = ".trifusion-temp"
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Set path to temporary sqlite database
    sql_db = os.path.join(tmp_dir, "trifusion.db")

    # If database already exists, erase it. Make sure we start fresh.
    if os.path.exists(sql_db):
        os.remove(sql_db)

    # Defining main variables
    conversion = arg.conversion
    output_format = arg.output_format
    outfile = arg.outfile
    interleave = arg.interleave
    model_phy = arg.model_phy
    # outgroup_taxa = arg.outgroup_taxa

    # Defining output file name
    if conversion is None and arg.outfile is not None:
        outfile = "".join(arg.outfile)
    elif arg.consensus and arg.consensus_single and not arg.outfile:
        outfile = "consensus"

    # Input alignments are not needed at this stage.
    # If only converting the partition file format, do that and exit.
    if arg.partition_file is not None and not alignment_list:
        # Initializing Partitions instance and reading partitions file
        partition = data.Partitions()
        partition.read_from_file(arg.partition_file, no_aln_check=True)
        if partition.partition_format == "nexus":
            partition.write_to_file("raxml", outfile, model_phy)
        else:
            partition.write_to_file("nexus", outfile)
        return 0

    # Support wildcards as arguments on Windows
    fl = []
    if sys.platform in ["win32", "cygwin"]:
        for p in alignment_list:
            fl += glob(p)
        alignment_list = fl

    # Check input files for directories
    alignment_list, dirs, lost = check_infile_list(alignment_list)

    if dirs:
        print_col(
            "Ignoring input files pointing to a directory: {}".format(
                " ".join(dirs)), YELLOW)
    if lost:
        print_col(
            "Ignoring input files that do not exist: {}".format(
                " ".join(lost)), YELLOW)
    if not alignment_list:
        print_col("No valid input files have been provided. Terminating...",
                  RED)

    # Input alignments are mandatory from now on
    if not arg.quiet:
        pbar = ProgressBar(max_value=len(alignment_list), widgets=gen_wgt(""))
    else:
        pbar = None

    print_col("Parsing %s alignments" % len(alignment_list),
              GREEN,
              quiet=arg.quiet)
    alignments = seqset.AlignmentList(alignment_list, sql_db=sql_db, pbar=pbar)

    # If a partitions file was provided, and there is only a single input file,
    # try to associate the partitions.
    if len(alignment_list) == 1 and arg.partition_file:
        er = alignments.partitions.read_from_file(arg.partition_file)
        if er:
            print_col("Invalid partitions file.", RED)

    post_aln_checks(arg, alignments)

    # ################################ Utilities ##############################
    # Return a file with taxa list and exit
    if arg.get_taxa is True:
        print_col("Writing taxa to new file", GREEN, quiet=arg.quiet)
        alignments.write_taxa_to_file()
        return 0

    # Remove taxa
    if arg.remove:
        print_col("Removing taxa", GREEN, quiet=arg.quiet)
        alignments.remove_taxa(arg.remove)

    # Grep taxa
    if arg.grep:
        print_col("Grepping taxa", GREEN, quiet=arg.quiet)
        alignments.remove_taxa(arg.grep, mode="inverse")

    # Select alignments
    if arg.select:
        print_col("Selecting alignments", GREEN, quiet=arg.quiet)
        if not os.path.exists("Taxa_selection"):
            os.makedirs("Taxa_selection")

        # Check if any of the provided taxa is absent from the alignments
        absent_taxa = [x for x in arg.select if x not in alignments.taxa_names]
        if absent_taxa:
            print_col("The following taxa were not found in any alignment and"
                      " will be ignored: {}".format(" ".join(absent_taxa)),
                      YELLOW,
                      quiet=arg.quiet)

        selected_alignments = alignments.select_by_taxa(arg.select,
                                                        mode="relaxed")
        for aln in selected_alignments:
            alignment_file = aln.path
            shutil.copy(alignment_file, "Taxa_selection")

        return

    # ############################# Main operations ###########################
    # Reverse concatenation
    if arg.reverse is not None:
        print_col("Reverse concatenating", GREEN, quiet=arg.quiet)
        if len(alignment_list) > 1:
            raise ArgumentError("Only one input file allowed for reverse "
                                "concatenation")
        if arg.reverse:
            er = alignments.partitions.read_from_file(arg.reverse)
            if er:
                print_col("Invalid partitions file.", RED)

        alignments.reverse_concatenate(pbar=pbar)

    # Filtering
    # Filter by minimum taxa
    if arg.min_taxa:
        print_col("Filtering by minimum taxa", GREEN, quiet=arg.quiet)
        alignments.filter_min_taxa(arg.min_taxa, pbar=pbar)

    # Filter by alignments that contain taxa
    if arg.contain_filter:
        print_col("Filtering alignment(s) including a taxa group",
                  GREEN,
                  quiet=arg.quiet)
        alignments.filter_by_taxa(arg.contain_filter, "Contain", pbar=pbar)

    # Filter by alignments that exclude taxa
    if arg.exclude_filter:
        print_col("Filtering alignments excluding a taxa group",
                  GREEN,
                  quiet=arg.quiet)
        alignments.filter_by_taxa(arg.exclude_filter, "Exclude", pbar=pbar)

    # Filter by codon position
    if arg.codon_filter:
        print_col("Filtering by codon positions", GREEN, quiet=arg.quiet)
        if alignments.sequence_code[0] == "DNA":
            # Boolean mask for codon positions 1 through 3
            codon_settings = [str(x) in arg.codon_filter
                              for x in range(1, 4)]
            alignments.filter_codon_positions(codon_settings, pbar=pbar)

    # Filter by missing data
    if arg.m_filter:
        print_col("Filtering by missing data", GREEN, quiet=arg.quiet)
        alignments.filter_missing_data(arg.m_filter[0],
                                       arg.m_filter[1],
                                       pbar=pbar,
                                       use_main_table=True)

    # Filtering by variable sites
    if arg.var_filter:
        print_col("Filtering by variable sites", GREEN, quiet=arg.quiet)
        alignments.filter_segregating_sites(arg.var_filter[0],
                                            arg.var_filter[1],
                                            pbar=pbar)

    # Filtering by informative sites
    if arg.inf_filter:
        print_col("Filtering by variable sites", GREEN, quiet=arg.quiet)
        alignments.filter_informative_sites(arg.inf_filter[0],
                                            arg.inf_filter[1],
                                            pbar=pbar)

    # Concatenation
    if not arg.conversion and not arg.consensus and len(alignment_list) > 1:
        print_col("Concatenating", GREEN, quiet=arg.quiet)
        alignments.concatenate(pbar=pbar)

        # Concatenate zorro files
        if arg.zorro:
            zorro = data.Zorro(alignment_list, arg.zorro)
            zorro.write_to_file(outfile)

    # Collapsing
    if arg.collapse:
        print_col("Collapsing", GREEN, quiet=arg.quiet)
        alignments.collapse(use_main_table=True,
                            pbar=pbar,
                            haplotypes_file=outfile)

    # Gcoder
    if arg.gcoder:
        print_col("Coding gaps", GREEN, quiet=arg.quiet)
        if output_format == ["nexus"]:
            alignments.code_gaps(use_main_table=True, pbar=pbar)

    # Consensus
    if arg.consensus:
        consensus_type = arg.consensus[0]
        print_col("Creating consensus sequences", GREEN, quiet=arg.quiet)
        alignments.consensus(consensus_type,
                             single_file=arg.consensus_single,
                             pbar=pbar)

    # Write output
    print_col("Writing output", GREEN, quiet=arg.quiet)
    alignments.write_to_file(output_format,
                             output_file=outfile,
                             output_suffix=arg.output_suffix,
                             interleave=interleave,
                             ima2_params=arg.ima2_params,
                             partition_file=True,
                             use_charset=True,
                             pbar=pbar)
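
To make the partition-only early exit above concrete, a sketch of the same conversion outside main_parser (the file names and the substitution model string are hypothetical):

# Convert a partition file between Nexus and RAxML formats without
# loading any alignments, mirroring the early-exit branch above.
partition = data.Partitions()
partition.read_from_file("partitions.txt", no_aln_check=True)
if partition.partition_format == "nexus":
    partition.write_to_file("raxml", "partitions_raxml.txt", "GTR")
else:
    partition.write_to_file("nexus", "partitions.nex")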
Example #41
def triseq_arg_check(arg):

    invalid_formats = [x for x in arg.output_format
                       if x in ["mcmctree", "gphocs", "ima2", "snapp"]]
    if not arg.outfile and invalid_formats:
        print_col("The following output formats can only be used with the"
                  " concatenation operation (-o): {}".format(", ".join(
                      invalid_formats)), RED)

    if arg.gcoder and "nexus" not in arg.output_format:
        print_col("Gap coding can only be performed for Nexus output format.",
                  RED)

    if arg.gcoder and "nexus" in arg.output_format and \
            arg.output_format != ["nexus"]:
        print_col("Gap coding can only be performed for Nexus output format."
                  " This operation will be ignored for other output formats.",
                  YELLOW, quiet=arg.quiet)

    if arg.conversion and arg.reverse:
        print_col("Ignoring conversion flag (-c) when specifying reverse"
                  " concatenation (-r)", YELLOW, quiet=arg.quiet)

    if arg.partition_file is not None and arg.outfile is None:
        print_col("An output file must be provided with option '-o'", RED)

    if "ima2" in arg.output_format and arg.ima2_params is None:
        print_col("Additional arguments must be provided with the"
                  " option --ima2-params when selecting ima2 output format",
                  RED)

    if "ima2" in arg.output_format and len(arg.ima2_params) != 4:
        print_col("Four additional arguments must be provided with"
                  " option --ima2-params when selecting the "
                  "ima2 output format. %s were given" %
                  (len(arg.ima2_params)), RED)

    if arg.partition_file is not None:
        return 0

    if arg.conversion is None and arg.outfile is None and arg.reverse is None\
            and arg.select is None and arg.get_taxa is False:

        print_col(
            "If you wish to concatenate, provide the output file name using "
            "the '-o' option. If you wish to convert a "
            "file, specify it using the '-c' option", RED)

    if arg.zorro is not None and len(arg.infile) == 1:
        print_col(
            "The '-z' option cannot be invoked when only a single input "
            "file is provided. This option is reserved for"
            " concatenation of multiple alignment files", RED)

    if arg.consensus and arg.output_format != ["fasta"]:
        print_col("Output format must be only Fasta when using the "
                  "consensus option", RED)

    if not arg.consensus and arg.consensus_single:
        print_col("Ignoring consensus single file option (--consensus-single-"
                  "file) when the consensus operation is not specified",
                  YELLOW, quiet=arg.quiet)

    return 0