Пример #1
0
    def extract_proteins_from_alignments(self, dir_with_alignments, output_dir):
        """
        Extract sequences from every alignment file into a same-named file in output_dir.

        :param dir_with_alignments: a single path given as a string, or an
            iterable of paths; it is passed to self.make_list_of_path_to_files
            to obtain the list of alignment files.
        :param output_dir: directory for the resulting files; it is passed
            through self.check_path and created with self.safe_mkdir.
        """
        out_dir = self.check_path(output_dir)

        # A lone string is wrapped into a list; anything else is passed as is.
        if isinstance(dir_with_alignments, str):
            sources = [dir_with_alignments]
        else:
            sources = dir_with_alignments
        alignment_files = self.make_list_of_path_to_files(sources)

        self.safe_mkdir(out_dir)
        # NOTE(review): function-level import, presumably to avoid a circular
        # import with the Routines package - confirm before moving it.
        from Routines import MultipleAlignmentRoutines
        for alignment_path in alignment_files:
            name_parts = self.split_filename(alignment_path)
            # Elements 1 and 2 are presumably the base name and the extension;
            # out_dir is expected to already end with a path separator.
            target_path = "%s%s%s" % (out_dir, name_parts[1], name_parts[2])
            MultipleAlignmentRoutines.extract_sequences_from_alignment(alignment_path, target_path)
Пример #2
0
    required=True,
    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
    help="Comma-separated list of files/directories with alignments")
# Destination file for the merged alignment.
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="File to write merged alignment")
# Destination file for the coordinates of each alignment inside the merged one.
parser.add_argument("-c", "--coordinates_file", action="store", dest="coords_file", required=True,
                    help="File to write file with coordinates of alignments in merged alignment")
# Alignment format name; "fasta" unless overridden.
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of alignments")

args = parser.parse_args()

# NOTE(review): args.input is presumably filled by the option declared just
# above this fragment - confirm against the full script.
MultipleAlignmentRoutines.merge_alignment(
    args.input, args.output, args.coords_file, format=args.format)
Пример #3
0
    action="store",
    dest="suffix",
    default=".gaps_removed",
    help="Suffix to use in output files. Default: '.gaps_removed'")
# Alignment format name used for both reading and writing; "fasta" by default.
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of alignment")
# Flag switching on the per-file progress message.
parser.add_argument("-v", "--verbose", action="store_true", dest="verbose",
                    help="Print not found ids. Default - no")

args = parser.parse_args()

save_mkdir(args.output)

# Filter every input alignment and write the result next to the others.
for alignment_file in args.input:
    splited_filename = split_filename(alignment_file)
    if args.verbose:
        print("Handling %s ..." % alignment_file)
    # NOTE(review): no path separator is inserted between args.output and the
    # file name, so args.output presumably already ends with one - confirm.
    output_filename = "%s%s%s%s" % (
        args.output, splited_filename[1], args.suffix, splited_filename[2])
    alignment = AlignIO.read(alignment_file, args.format)
    filtered_alignment = MultipleAlignmentRoutines.remove_columns_with_gaps(
        alignment,
        args.max_gap_number,
        gap_symbol=args.gap_symbol)
    AlignIO.write(filtered_alignment, output_filename, args.format)
Пример #4
0
from Routines import MultipleAlignmentRoutines, FileRoutines


parser = argparse.ArgumentParser()

# Required inputs and output.
parser.add_argument("-p",
                    "--protein_alignment",
                    action="store",
                    dest="pep_alignment",
                    required=True,
                    help="File with protein alignment")
# The comma-separated value is split and expanded by
# FileRoutines.make_list_of_path_to_files while the option is parsed.
parser.add_argument(
    "-c",
    "--cds_seqs",
    action="store",
    dest="cds_seqs",
    required=True,
    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
    help="Comma-separated list of files/directories with cds sequences")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="File to write codon alignment")

# Optional settings.
parser.add_argument("-a",
                    "--accordance_file",
                    action="store",
                    dest="accordance_file",
                    help="File with CDS to protein id accordance")
parser.add_argument("-f",
                    "--alignment_format",
                    action="store",
                    dest="alignment_format",
                    default="fasta",
                    help="Format of alignments. Default: fasta")
parser.add_argument("-n",
                    "--cds_seqs_format",
                    action="store",
                    dest="cds_format",
                    default="fasta",
                    help="Format of cds sequences. Default: fasta")
parser.add_argument("-i",
                    "--cds_index_file",
                    action="store",
                    dest="cds_index",
                    help="Biopython index of cds files. Default - construct new")
parser.add_argument("-r",
                    "--retain_cds_index",
                    action="store_true",
                    dest="retain_cds_index",
                    help="Retain constructed index after analysis. Default - False")
args = parser.parse_args()

# Hand all parsed options over to the routine that builds the codon alignment.
MultipleAlignmentRoutines.get_codon_alignment_from_files(
    args.pep_alignment,
    args.cds_seqs,
    args.output,
    cds2protein_accordance_file=args.accordance_file,
    alignment_format=args.alignment_format,
    nucleotide_sequence_format=args.cds_format,
    cds_index_file=args.cds_index,
    retain_cds_index=args.retain_cds_index)
Пример #5
0
    "--output_directory",
    action="store",
    dest="output_dir",
    default="./",
    help=
    "Output directory to write resulting files. Default - current directory")
# Alignment format name; "fasta" unless overridden.
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of alignments")
# Character treated as a gap; "-" unless overridden.
parser.add_argument("-g", "--gap_symbol", action="store", dest="gap_symbol", default="-",
                    help="Gap symbol. Default - '-'")

args = parser.parse_args()

# One "<name>.position_matrix" file is written into args.output_dir per input alignment.
for alignment_file in args.input:
    alignment_name_list = FileRoutines.split_filename(alignment_file)
    output_file = "%s/%s.position_matrix" % (args.output_dir, alignment_name_list[1])
    MultipleAlignmentRoutines.get_position_presence_matrix_fom_file(
        alignment_file, output_file, format=args.format, gap_symbol=args.gap_symbol)
Пример #6
0
def expression_hsp(hsp):
    # Filter predicate for a single HSP: the e-value must not exceed
    # args.max_e_value and the hit span must reach args.min_alignment_length.
    # hit_span is the hit length of a single-fragment HSP (blast etc.);
    # this check DOES NOT work with exonerate output.
    evalue_ok = hsp.evalue <= args.max_e_value
    return evalue_ok and (hsp.hit_span >= args.min_alignment_length)


def iterator(blast_results):
    # Yield, for every query id in blast_results, the query result filtered
    # with expression_hsp; results that are falsy after filtering are skipped.
    # NOTE(review): a bare triple-quote line follows this function, so this
    # code may sit inside a disabled (string-quoted) region - confirm.
    for query_key in blast_results:
        kept = blast_results[query_key].hsp_filter(func=expression_hsp)
        if not kept:
            continue
        yield kept
"""
print("Parsing input file...")
blast_results = SearchIO.index(args.input, args.format)

# Keep the second "|"-separated field of every database id
# (presumably the number of an NCBI "gi|<number>|..." identifier - confirm).
# A real list is built so the ids can be iterated more than once on Python 3,
# where map() would return a one-shot iterator.
gi_ids_list = [db_id.split("|")[1]
               for db_id in MultipleAlignmentRoutines.get_db_ids(blast_results)]

print("Downloading sequence summaries...")
handle = Entrez.esummary(db=args.db, id=",".join(gi_ids_list))
try:
    summaries_list = Entrez.read(handle)
finally:
    # Release the network handle even if parsing fails.
    handle.close()

# Unique taxonomy ids; the .taxid file still gets one line per record that has a TaxId.
tax_id_list = set()
with open(args.out_prefix + ".taxid", "w") as out_fd:
    for record in summaries_list:
        if "TaxId" in record:
            tax_id = str(record["TaxId"])
            tax_id_list.add(tax_id)
            out_fd.write(tax_id + "\n")

print("Downloading species names...")
taxa_handle = Entrez.esummary(db="taxonomy", id=",".join(tax_id_list))
try:
    taxa_list = Entrez.read(taxa_handle)
finally:
    # Release the network handle even if parsing fails.
    taxa_handle.close()
with open(args.out_prefix + ".sciname", "w") as taxa_fd:
    with open(args.out_prefix + ".commonname", "w") as com_fd:
Пример #7
0
import argparse

from Routines import MultipleAlignmentRoutines

parser = argparse.ArgumentParser()

# All three options are mandatory.
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="File with alignment")
parser.add_argument("-c", "--coordinates", action="store", dest="coordinates", required=True,
                    help="File with coordinates of gene alignments")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="File to write alignment prepared for Codeml")

args = parser.parse_args()

# The alignment format is fixed to "fasta" here; it is not a command-line option.
MultipleAlignmentRoutines.prepare_multigene_alignment_for_codeml(args.input,
                                                                 args.coordinates,
                                                                 args.output,
                                                                 format="fasta")
args = parser.parse_args()

# Results are stored per alignment name; each entry is later indexed by species.
unique_position_dict = TwoLvlDict()

FileRoutines.safe_mkdir(args.output_dir)

for alignment_file in args.input:
    alignment_name_list = FileRoutines.split_filename(alignment_file)
    alignment_name = alignment_name_list[1]
    output_prefix = "%s/%s.unique_positions" % (args.output_dir, alignment_name)

    # The gap symbol is fixed to "-" and relative values are requested.
    unique_position_dict[alignment_name] = \
        MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file(
            alignment_file, output_prefix, format=args.format, gap_symbol="-",
            return_mode="relative", verbose=False)

# NOTE(review): sl_keys() presumably returns the second-level (species) keys - confirm.
species_list = unique_position_dict.sl_keys()

data_dict = OrderedDict()

# For every species gather its value from each alignment, in iteration order.
for species in species_list:
    values_for_species = []
    data_dict[species] = values_for_species
    for alignment in unique_position_dict:
        values_for_species.append(unique_position_dict[alignment][species])

# The per-species lists, in the order the species were inserted.
data_list = list(data_dict.values())
# Directory that receives one "<name>.unique_positions" result per alignment.
parser.add_argument(
    "-o",
    "--output_directory",
    action="store",
    dest="output_dir",
    default="./",
    help=
    "Output directory to write resulting files. Default - current directory")
# Alignment format name; "fasta" unless overridden.
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="fasta",
                    help="Format of alignments")
# Character treated as a gap; "-" unless overridden.
parser.add_argument("-g",
                    "--gap_symbol",
                    action="store",
                    dest="gap_symbol",
                    default="-",
                    help="Gap symbol. Default - '-'")

args = parser.parse_args()

for alignment_file in args.input:
    alignment_name_list = FileRoutines.split_filename(alignment_file)
    output_prefix = "%s/%s.unique_positions" % (args.output_dir,
                                                alignment_name_list[1])

    # Pass the gap symbol chosen on the command line. It used to be hard-coded
    # to "-", which silently ignored the -g/--gap_symbol option; the option's
    # default is still "-", so runs without -g behave as before.
    MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file(
        alignment_file,
        output_prefix,
        format=args.format,
        gap_symbol=args.gap_symbol)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import MultipleAlignmentRoutines


parser = argparse.ArgumentParser()

# Input alignment and output file are mandatory.
parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input file with alignment")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="File to write alignment of degenerate sites")
# Alignment format name; "fasta" unless overridden.
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="fasta",
                    help="Alignment format. Default: fasta")
# Integer number of the genetic code table; 1 unless overridden.
parser.add_argument("-g",
                    "--genetic_code_table",
                    action="store",
                    dest="genetic_code_table",
                    type=int,
                    default=1,
                    help="Genetic code table number")

args = parser.parse_args()

MultipleAlignmentRoutines.extract_degenerate_sites_from_codon_alignment_from_file(
    args.input,
    args.output,
    genetic_code_table=args.genetic_code_table,
    format=args.format)
Пример #11
0
                    required=True,
                    help="Output file with protein alignment")
# Alignment format name; "fasta" unless overridden.
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of alignments. Default: fasta")
# Character treated as a gap; "-" unless overridden.
parser.add_argument("-g", "--gap_symbol", action="store", dest="gap_symbol", default="-",
                    help="Gap symbol in alignment. Default: '-'")
# Integer number of the genetic code table; 1 unless overridden.
parser.add_argument("-t", "--genetic_code", action="store", dest="genetic_code",
                    default=1, type=int,
                    help="Genetic code to use(NCBI tables) . Default: 1(standart)")

args = parser.parse_args()

# NOTE(review): args.codon_alignment and args.protein_alignment are presumably
# filled by options declared above this fragment - confirm.
MultipleAlignmentRoutines.translate_codon_alignment(
    args.codon_alignment,
    args.protein_alignment,
    format=args.format,
    gap_symbol=args.gap_symbol,
    table=args.genetic_code)