def filter_by_occupancy(filename_in, filename_out, min_occupancy=0.5):
    """
    Filter an alignment in fasta format according to the occupancy of the
    columns. Store the results in fasta format.

    The occupancy of a column is the fraction of sequences that do not have
    a gap ("-") at that position. Columns whose occupancy is greater than or
    equal to min_occupancy are kept, in their original order; every other
    column is removed from all sequences.

    Args:
        filename_in: Path of the input alignment, read with fasta_to_dict()
            (used here as a mapping of sequence name to sequence string).
        filename_out: Path of the fasta file to write. Its parent directory
            is created when it does not exist yet.
        min_occupancy: Minimum occupancy a column needs in order to be kept.

    Raises:
        IndexError: If a sequence is shorter than the first sequence of the
            alignment (the number of columns is taken from the first one).

    An empty alignment results in an empty output file.
    """
    fasta_raw = fasta_to_dict(filename_in)

    sequences = tuple(fasta_raw.values())
    n_sequences = len(sequences)
    # The first sequence defines the number of columns; with no sequences
    # at all there is nothing to scan (and no division by zero below).
    alignment_length = len(sequences[0]) if sequences else 0

    columns_to_keep = []
    for column_number in range(alignment_length):
        column = "".join(sequence[column_number] for sequence in sequences)
        n_gaps = column.count("-")
        if 1 - float(n_gaps) / float(n_sequences) >= min_occupancy:
            columns_to_keep.append(column_number)

    fasta_trimmed = {
        seqname: "".join(sequence[column_number]
                         for column_number in columns_to_keep)
        for seqname, sequence in fasta_raw.items()
    }

    # A bare file name has an empty dirname, and os.makedirs("") raises, so
    # only create the directory when there is one. exist_ok avoids failing
    # when the directory appears between a check and the creation.
    out_dir = os.path.dirname(filename_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(filename_out, "w") as f_out:
        for seqname, sequence in fasta_trimmed.items():
            f_out.write(">{seqname}\n{sequence}\n".format(seqname=seqname,
                                                          sequence=sequence))
示例#2
0
def subset_file(pep_fn, cds_fn, cds_dict):
    """
    Write a fasta file holding the cds entry of every sequence id in pep_fn.

    The ids are the keys returned by fasta_to_dict(pep_fn); each one is
    looked up in cds_dict and written to cds_fn as a two-line fasta record.
    An id that is missing from cds_dict raises KeyError.
    """
    with open(cds_fn, "w") as cds_out:
        wanted_ids = fasta_to_dict(pep_fn).keys()
        for seqid in wanted_ids:
            cds_sequence = cds_dict[seqid]
            record = ">{seqid}\n{sequence}\n".format(
                seqid=seqid, sequence=cds_sequence
            )
            cds_out.write(record)
示例#3
0
def remove_gaps(fasta_in, fasta_out):
    """
    Copy the fasta file fasta_in to fasta_out, deleting every "-" character
    from each sequence. Sequence names are written unchanged.
    """
    with open(fasta_out, "w") as f_out:
        alignment = fasta_to_dict(fasta_in)
        f_out.writelines(
            f">{name}\n{gapped.replace('-', '')}\n"
            for name, gapped in alignment.items()
        )
示例#4
0
def translate_fasta(fasta_in, fasta_out):
    """
    Write to fasta_out every sequence of fasta_in after passing it through
    translate(). Sequence names are written unchanged.
    """
    with open(fasta_out, "w") as f_out:
        records = fasta_to_dict(fasta_in)
        for name, sequence in records.items():
            translated = translate(sequence)
            f_out.write(f">{name}\n{translated}\n")
示例#5
0
    """Write to cds_fn the cds sequences that are in pep_fn"""
    with open(cds_fn, "w") as cds_out:
        for seqid in fasta_to_dict(pep_fn).keys():
            cds_out.write(
                ">{seqid}\n{sequence}\n".format(seqid=seqid, sequence=cds_dict[seqid])
            )


if __name__ == "__main__":
    # Command-line entry point. Exactly five arguments are required after
    # the script name: input folder, input extension, output folder, output
    # extension and the fasta file holding all cds sequences.
    if len(sys.argv) != 6:
        # End the message with a newline so the shell prompt is not glued
        # to the end of the usage text.
        sys.stderr.write(
            "Error. Usage: python split_cds.py folder_in ext_in folder_out "
            "ext_out all_cds.fa\n"
        )
        sys.exit(1)

    IN_DIR = fix_dir_path(sys.argv[1])
    IN_EXT = sys.argv[2]
    OUT_DIR = fix_dir_path(sys.argv[3])
    OUT_EXT = sys.argv[4]
    CDS_FN = sys.argv[5]

    # Load the cds fasta once; every call to process() reuses it.
    CDS_DICT = fasta_to_dict(CDS_FN)

    def process(pep_fn, cds_fn):
        """Call subset_file() with the cds dictionary argument fixed."""
        return subset_file(pep_fn, cds_fn, CDS_DICT)

    # NOTE(review): process_folders is defined elsewhere; presumably it
    # calls process(input_path, output_path) for each matching file in
    # IN_DIR - confirm against its definition.
    process_folders(IN_DIR, IN_EXT, OUT_DIR, OUT_EXT, process)