def correct_regions_from_gff(
            self,
            reference,
            variants_vcf,
            gff_file,
            output_prefix=None,
            feature_type_list=["CDS"],
            unification_key="Parent",
            #raw_seq_per_line=False,
            vcf_with_masking=None,
            override_vcf_by_mask=None,
            use_ambiguous_nuccleotides=None):

        feature_dict = AnnotationsRoutines.get_feature_dict(
            gff_file,
            output_prefix=output_prefix,
            feature_type_list=feature_type_list,
            unification_key=unification_key)
        region_file = "%s.coordinates_only.list" % output_prefix

        raw_regions = "%s.raw.seq" % output_prefix
        final_regions = "%s.fasta" % output_prefix

        regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

        self.correct_reference(
            reference,
            raw_regions,
            variants_vcf,
            raw_seq_per_line=True,
            vcf_with_masking=vcf_with_masking,
            override_vcf_by_mask=override_vcf_by_mask,
            use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
            interval_list=region_file)

        region_with_frameshift = SynDict()

        def new_regions_generator():
            with open(raw_regions, "r") as in_fd:
                for region_id in feature_dict:
                    seq = ""
                    for i in range(0, len(feature_dict[region_id])):
                        seq_fragment = in_fd.readline().strip()
                        if ((int(feature_dict[region_id][i][2]) -
                             int(feature_dict[region_id][i][1]) + 1) -
                                len(seq_fragment)) % 3 != 0:
                            if region_id not in region_with_frameshift:
                                region_with_frameshift[region_id] = [i]
                            else:
                                region_with_frameshift[region_id].append(i)
                        seq += seq_fragment
                    yield SeqRecord(
                        seq=Seq(seq) if feature_dict[region_id][0][3] == "+"
                        else Seq(seq).reverse_complement(),
                        id=region_id,
                        description="")

        SeqIO.write(new_regions_generator(), final_regions, format="fasta")
        region_with_frameshift.write(regions_with_frameshift_file,
                                     splited_values=True)
Example #2
    def filter_trf_gff(self, input_gff, output_gff, filtered_out_gff, min_period=None, max_period=None,
                       min_copy_number=None,
                       max_copy_number=None, pattern=None, min_percentage_of_matches=None,
                       max_percentage_of_indels=None, min_entropy=None, max_entropy=None):

        def filtering_expression(gff_description_dict):
            return self.gff_filtering_expression(gff_description_dict, min_period=min_period, max_period=max_period,
                                                 min_copy_number=min_copy_number,
                                                 max_copy_number=max_copy_number,
                                                 pattern=pattern,
                                                 min_percentage_of_matches=min_percentage_of_matches,
                                                 max_percentage_of_indels=max_percentage_of_indels,
                                                 min_entropy=min_entropy,
                                                 max_entropy=max_entropy)

        AnnotationsRoutines.filter_gff_by_description(input_gff, output_gff, filtered_out_gff, filtering_expression)
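
A usage sketch (file names hypothetical): keeping only TRF repeats with 3-5 bp monomers and at least 20 copies, as Example #13 does later in its pipeline:

TRF.filter_trf_gff("trf_raw.gff", "trf_filtered.gff", "trf_rejected.gff",
                   min_period=3, max_period=5, min_copy_number=20)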
Example #3
    def filter_trf_gff_by_exact_copy_number(input_gff, output_gff, filtered_out_gff, min_copy_number,
                                            perfect_tandem=False):

        if perfect_tandem:
            def filtering_expression(gff_description_dict):
                # require at least min_copy_number exact pattern copies back to back
                return (gff_description_dict["Pattern"] * min_copy_number) in gff_description_dict["seq"]
        else:
            def filtering_expression(gff_description_dict):
                # require at least min_copy_number exact pattern copies anywhere in the repeat
                return gff_description_dict["seq"].count(gff_description_dict["Pattern"]) >= min_copy_number

        AnnotationsRoutines.filter_gff_by_description(input_gff, output_gff, filtered_out_gff, filtering_expression)
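
The perfect_tandem branch relies on Python string repetition: pattern * n is the pattern concatenated n times, so the "in" test succeeds only when at least n exact copies occur back to back. A standalone illustration:

seq = "ACGACGACGACGTTT"            # four exact tandem copies of ACG
pattern = "ACG"
print((pattern * 4) in seq)        # True
print((pattern * 5) in seq)        # False: only four tandem copies present
print(seq.count(pattern) >= 5)     # False under the looser count-based test as well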
Example #4
    def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
        len_dict = SynDict()

        with open(trf_gff, "r") as trf_fd:
            for line in trf_fd:
                if line[0] == "#":
                    continue
                description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(line)
                len_dict[description_dict["ID"]] = description_dict["Period"]
        len_dict.write(len_file)
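
A usage sketch (paths hypothetical): the resulting file maps each repeat ID to its monomer period (the "Period" field of the TRF GFF description) and feeds the length-based splitting step in Example #13:

TRF.get_monomer_len_file_from_trf_gff("trf_filtered.gff",
                                      "trf_filtered.monomer_len.len")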
Example #5
    def __init__(self,
                 record_list=None,
                 primer3_file=None,
                 from_file=True,
                 id_based_location_dict=None,
                 repeat_gff_file=None,
                 id_description_entry="ID"):
        self.general_entry_list = [
            "SEQUENCE_ID", "SEQUENCE_TEMPLATE", "SEQUENCE_TARGET",
            "PRIMER_PICK_LEFT_PRIMER", "PRIMER_PICK_INTERNAL_OLIGO",
            "PRIMER_PICK_RIGHT_PRIMER", "PRIMER_PRODUCT_SIZE_RANGE",
            "PRIMER_LEFT_EXPLAIN", "PRIMER_RIGHT_EXPLAIN",
            "PRIMER_PAIR_EXPLAIN", "PRIMER_LEFT_NUM_RETURNED",
            "PRIMER_RIGHT_NUM_RETURNED", "PRIMER_INTERNAL_NUM_RETURNED",
            "PRIMER_PAIR_NUM_RETURNED"
        ]

        self.primer_entry_prefix_list = ["PRIMER_LEFT", "PRIMER_RIGHT"]
        self.primer_entry_suffix_list = [
            "PENALTY", "SEQUENCE", "TM", "GC_PERCENT", "SELF_ANY_TH",
            "SELF_END_TH", "HAIRPIN_TH", "END_STABILITY"
        ]

        self.primer_pair_prefix_list = ["PRIMER_PAIR"]
        self.primer_pair_suffix_list = [
            "PENALTY", "COMPL_ANY_TH", "COMPL_END_TH", "PRODUCT_SIZE"
        ]

        id_based_dict = id_based_location_dict

        if repeat_gff_file:
            id_based_dict = AnnotationsRoutines.get_id_based_dict_from_gff(
                repeat_gff_file, id_entry=id_description_entry)

        if from_file:
            self.records = []
            with open(primer3_file, "r") as in_fd:
                for line in in_fd:
                    entry_dict = {}
                    current_line = line
                    # Boulder-IO record: KEY=VALUE lines, terminated by a bare "="
                    while current_line[0] != "=":
                        line_list = current_line.strip().split("=", 1)
                        entry_dict[line_list[0]] = line_list[1]
                        current_line = in_fd.readline()

                    self._add_record(entry_dict,
                                     id_based_location_dict=id_based_dict)

        else:
            self.records = record_list
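
Primer3 writes records in Boulder-IO format: one KEY=VALUE pair per line, each record terminated by a line holding a single "=". The loop above walks exactly that structure. A minimal standalone parser sketch of the same idea, assuming a well-formed file:

def parse_boulder_io(path):
    # Yield one {KEY: VALUE} dict per Primer3 record.
    record = {}
    with open(path, "r") as in_fd:
        for line in in_fd:
            line = line.strip()
            if line == "=":        # record terminator
                yield record
                record = {}
            elif line:
                key, _, value = line.partition("=")
                record[key] = value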
Example #6
    def alignments_string(self,
                          segment_length=120,
                          left_primer_symbol=">",
                          target_symbol="*",
                          right_primer_symbol="<"):
        string = ""
        string += "#SeqeunceID\t%s\n" % self.id
        string += "#Location"
        if self.chrom:
            string += "\t%s" % self.chrom
            if self.chrom_pos_start and self.chrom_pos_end:
                string += ":%i-%i" % (self.chrom_pos_start, self.chrom_pos_end)
        string += "\n"
        #string += "#Sequence\t%s\n" % self.seq

        for primer_pair in self.primer_pair_list:
            string += "#Primer pair %i\n" % primer_pair.id
            string += "\n"

            location_list = [
                (primer_pair.left_primer.start, primer_pair.left_primer.start +
                 primer_pair.left_primer.length),
                (self.target_start, self.target_start + self.target_len),
                (primer_pair.right_primer.start -
                 primer_pair.right_primer.length + 1,
                 primer_pair.right_primer.start + 1)
            ]
            string += AnnotationsRoutines.draw_string_regions(
                self.seq,
                location_list,
                [left_primer_symbol, target_symbol, right_primer_symbol],
                overlap_symbol="#",
                line_per_record=False,
                segment_length=segment_length,
                num_of_spaces=3,
                num_of_space_lines=1,
                empty_symbol=" ")

        return string
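
A usage sketch, assuming record is one record of the CollectionPrimer3 built in Example #5:

# draws '>' (left primer), '*' (target) and '<' (right primer) tracks
# under the template sequence, 100 characters per row
print(record.alignments_string(segment_length=100))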
Example #7
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_gff",
                    action="store",
                    dest="input_gff",
                    required=True,
                    help="Input GFF file")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Prefix of output files")
parser.add_argument("-s",
                    "--syn_file",
                    action="store",
                    dest="syn_file",
                    required=True,
                    help="File with scaffold synonyms")

args = parser.parse_args()

AnnotationsRoutines.rename_scaffolds_in_gff(args.input_gff, args.syn_file,
                                            args.output_prefix)
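
Assuming the script is saved as rename_scaffolds_in_gff.py (the name is hypothetical), a typical invocation:

python rename_scaffolds_in_gff.py -i input.gff -s scaffold_synonyms.tsv -o renamed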
Example #8
                    action="store",
                    dest="output",
                    required=True,
                    help="Output gff file")
parser.add_argument("-f",
                    "--feature_type",
                    action="store",
                    dest="feature_type",
                    required=True,
                    help="Feature type to use in gff file")
parser.add_argument("-s",
                    "--source",
                    action="store",
                    dest="source",
                    default="source",
                    help="Source to use in gff file")
parser.add_argument("-d",
                    "--id_prefix",
                    action="store",
                    dest="id_prefix",
                    default="ID",
                    help="Id prefix for gff file")

args = parser.parse_args()

AnnotationsRoutines.convert_bedgraph_to_gff(args.input,
                                            args.output,
                                            args.feature_type,
                                            id_prefix=args.id_prefix,
                                            source=args.source)
Example #9
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines


parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    help="Gff file with annotations to extract")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff",
                    help="Output gff file with extracted transcripts")
parser.add_argument("-d", "--ids_file", action="store", dest="ids_file",
                    help="File with ids of transcripts to extract")

args = parser.parse_args()

AnnotationsRoutines.extract_transcripts_by_ids(args.input_gff, args.ids_file, args.output_gff)
Example #10
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_gff",
                    action="store",
                    dest="input_gff",
                    required=True,
                    help="input gff file")
parser.add_argument("-o",
                    "--output_gff",
                    action="store",
                    dest="output_gff",
                    required=True,
                    help="Output fixed gff file")

args = parser.parse_args()

AnnotationsRoutines.fix_gff_coordinates_order(args.input_gff, args.output_gff)
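
Assuming the script is saved as fix_gff_coordinates_order.py (name hypothetical), a typical invocation:

python fix_gff_coordinates_order.py -i input.gff -o fixed.gff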
Example #11
                    required=True,
                    help="File to write output BED file")
parser.add_argument(
    "-t",
    "--feature_types",
    action="store",
    dest="feature_types",
    type=lambda s: s.split(","),
    default=[],
    help="Comma-separated list of feature types to write in output file "
    "Default: all")
"""
parser.add_argument("-d", "--id_entry", action="store", dest="id_entry", default="ID",
                    help="Id entry. Default: ID")
"""

parser.add_argument("-s",
                    "--scaffold_id_file",
                    action="store",
                    dest="scaffold_id_file",
                    default=None,
                    help="File with IDs of scaffolds to include. Default: All")

args = parser.parse_args()

AnnotationsRoutines.convert_gff_to_simple_bed(
    args.input_gff,
    args.output_bed,
    feature_type_list=args.feature_types,
    scaffold_id_file=args.scaffold_id_file)
Example #12
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_gff",
                    action="store",
                    dest="input_gff",
                    required=True,
                    help="input gff file")
parser.add_argument("-o",
                    "--output_gff",
                    action="store",
                    dest="output_gff",
                    required=True,
                    help="Output fixed gff file")
parser.add_argument("-f",
                    "--feature_type",
                    action="store",
                    dest="feature_type",
                    required=True,
                    help="Feature type to use")
args = parser.parse_args()

AnnotationsRoutines.fix_absent_feature_type_field(args.input_gff,
                                                  args.output_gff,
                                                  args.feature_type)
Example #13
    def primer_prediction_pipeline(self, genome_fasta, output_prefix, trf_gff=None, min_str_period=3, max_str_period=5,
                                   min_copy_number=20, max_copy_number=None, pattern=None, min_perfect_copy_number=20,
                                   require_tandem_perfect_copies=True, left_flank_len=200, right_flank_len=200,
                                   core_seq_coords_entry="core_seq_coords", id_description_entry="ID",
                                   kmer_dir=None, kmer_file_prefix=None, count_kmers=False,
                                   min_percentage_of_matches=None, max_percentage_of_indels=None,
                                   optimal_primer_len=None, min_primer_len=None, max_primer_len=None, max_ns_accepted=None,
                                   softmasked_input=False, optimal_GC=None, min_GC=None, max_GC=None,
                                   optimal_melting_temperature=None, min_melting_temperature=None,
                                   max_melting_temperature=None, black_list_of_seqs_fasta=None,
                                   trf_matching_weight=2, trf_mismatching_penalty=7,
                                   trf_indel_penalty=7, trf_matching_probability=80, trf_indel_probability=10,
                                   trf_min_score=50, trf_max_period_size=500, threads=None, min_gap_len=5):

        TRF.path = self.trf_dir
        TRF.threads = threads if threads else self.threads

        Primer3.path = self.primer3_dir
        Primer3.threads = threads if threads else self.threads

        Glistmaker.path = self.glistmaker_dir
        Glistmaker.threads = threads if threads else self.threads

        trf_output_gff = "%s.with_rep_seqs.gff" % output_prefix if trf_gff is None else trf_gff

        filtered_suffix = ""
        filtered_suffix += ".min_period_%i" % min_str_period if min_str_period else ""
        filtered_suffix += ".max_period_%i" % max_str_period if max_str_period else ""
        filtered_suffix += ".min_copy_%i" % min_copy_number if min_copy_number else ""
        filtered_suffix += ".max_copy_%i" % max_copy_number if max_copy_number else ""
        filtered_suffix += ".pattern_%s" % pattern if pattern else ""

        filtered_trf_gff = "%s%s.gff" % (output_prefix, filtered_suffix)
        filtered_out_trf_gff = "%s%s.filtered_out.gff" % (output_prefix, filtered_suffix)

        final_filtered_gff = filtered_trf_gff

        if min_perfect_copy_number:
            filtering_prefix = "%s%s.%s" % (output_prefix, filtered_suffix,
                                            "min_tandem_perfect_copy_%i" % min_perfect_copy_number
                                            if require_tandem_perfect_copies
                                            else "min_perfect_copy_%i" % min_perfect_copy_number)

            final_filtered_gff = "%s.gff" % filtering_prefix
            filtered_out_exact_copy_trf_gff = "%s.filtered_out.gff" % filtering_prefix
            #final_filtered_gff = filtered_exact_copy_trf_gff

        final_filtered_len_file = "%s.monomer_len.len" % final_filtered_gff[:-4]

        with_flanks_prefix = "%s.with_flanks" % final_filtered_gff[:-4]
        with_flanks_gff = "%s.gff" % with_flanks_prefix
        with_flanks_fasta = "%s.fasta" % with_flanks_prefix

        primer3_output_prefix = "%s.primer3" % with_flanks_prefix

        if trf_gff is None:
            print("Annotating repeats...")
            trf_report = TRF.parallel_search_tandem_repeat(genome_fasta, output_prefix,
                                                           matching_weight=trf_matching_weight,
                                                           mismatching_penalty=trf_mismatching_penalty,
                                                           indel_penalty=trf_indel_penalty,
                                                           match_probability=trf_matching_probability,
                                                           indel_probability=trf_indel_probability,
                                                           min_alignment_score=trf_min_score,
                                                           max_period=trf_max_period_size,
                                                           report_flanking_sequences=False,
                                                           max_len_per_file=1000000,
                                                           store_intermediate_files=False)
        print("Filtering repeats...")
        TRF.filter_trf_gff(trf_output_gff, filtered_trf_gff, filtered_out_trf_gff, min_period=min_str_period,
                           max_period=max_str_period, min_copy_number=min_copy_number, max_copy_number=max_copy_number,
                           pattern=pattern, min_percentage_of_matches=min_percentage_of_matches,
                           max_percentage_of_indels=max_percentage_of_indels, min_entropy=None, max_entropy=None)

        if trf_gff:
            id_based_location_dict = AnnotationsRoutines.get_id_based_dict_from_gff(trf_output_gff,
                                                                                    id_entry=id_description_entry)
        else:
            id_based_location_dict = trf_report.get_id_based_dict()

        if min_perfect_copy_number:
            TRF.filter_trf_gff_by_exact_copy_number(filtered_trf_gff, final_filtered_gff,
                                                    filtered_out_exact_copy_trf_gff, min_perfect_copy_number,
                                                    perfect_tandem=require_tandem_perfect_copies)

        TRF.get_monomer_len_file_from_trf_gff(final_filtered_gff, final_filtered_len_file)

        monomer_length_id_file_prefix = "%s.monomer_len" % final_filtered_gff[:-4]
        monomer_length_id_dict = self.split_ids_from_len_file_by_len(final_filtered_len_file,
                                                                     monomer_length_id_file_prefix,
                                                                     len_column=1, id_column=0)

        AnnotationsRoutines.add_flanks_to_gff_record(final_filtered_gff, with_flanks_prefix,
                                                     left_flank_len, right_flank_len, genome_fasta,
                                                     coords_description_entry=core_seq_coords_entry,
                                                     id_description_entry=id_description_entry)

        AnnotationsRoutines.extract_sequences_by_gff(genome_fasta,
                                                     with_flanks_gff,
                                                     with_flanks_fasta,
                                                     type_list="repeat",
                                                     parsing_mode="parse",
                                                     format="fasta")

        if count_kmers:
            print("Counting kmers...")
            if (not kmer_file_prefix) or (not kmer_dir):
                raise ValueError("No kmer file prefix of kmer directory was set")
            glistmaker_prefix = "%s/%s" % (kmer_dir, kmer_file_prefix)
            self.safe_mkdir(kmer_dir)
            Glistmaker.generate_kmer_lists_for_primer3(genome_fasta, glistmaker_prefix, threads=None,
                                                       max_tmp_table_number=None, max_tmp_table_size=None)
        print("Generating primers...")
        for human_readable_output in False, True:
            output_file_prefix = "%s.human_readable" % with_flanks_prefix if human_readable_output else with_flanks_prefix
            self.predict_primers(with_flanks_gff, with_flanks_fasta, output_file_prefix,
                                 kmer_dir, kmer_file_prefix, pcr_product_size_range=None,
                                 optimal_primer_len=optimal_primer_len,
                                 min_primer_len=min_primer_len, max_primer_len=max_primer_len,
                                 max_ns_accepted=max_ns_accepted,
                                 softmasked_input=softmasked_input,
                                 optimal_GC=optimal_GC, min_GC=min_GC, max_GC=max_GC,
                                 optimal_melting_temperature=optimal_melting_temperature,
                                 min_melting_temperature=min_melting_temperature,
                                 max_melting_temperature=max_melting_temperature,
                                 black_list_of_seqs_fasta=black_list_of_seqs_fasta,
                                 thermodynamic_parameters_dir=self.primer3_thermo_config_dir,
                                 format_output=human_readable_output,
                                 relative_core_seq_coords_relative_entry="%s_relative" % core_seq_coords_entry)

        primer3_output_file = "%s.out" % primer3_output_prefix

        filtered_results_file = "%s.filtered.res" % primer3_output_prefix
        filtered_results_table_form_file = "%s.filtered.table_form.res" % primer3_output_prefix
        filtered_results_table_form_with_aln_file = "%s.filtered.table_form_with_aln.res" % primer3_output_prefix
        filtered_out_results_file = "%s.filtered_out.res" % primer3_output_prefix

        primer3_results = CollectionPrimer3(primer3_file=primer3_output_file, from_file=True, id_based_location_dict=id_based_location_dict)

        primer3_results.remove_primers_with_gaps_in_pcr_product(min_gap_len)
        primer3_filtered_results, primer3_filtered_out_results = primer3_results.filter_out_records_without_primers()

        primer3_filtered_results.write(filtered_results_file)
        primer3_filtered_results.write_table_form(filtered_results_table_form_file)
        primer3_filtered_results.write_table_form_with_alignments(filtered_results_table_form_with_aln_file)
        primer3_filtered_out_results.write(filtered_out_results_file)

        filtered_results_file_splited_by_len_prefix = "%s.filtered.monomer_len" % primer3_output_prefix

        stat_fd = open("%s.stats" % output_prefix, "w")

        sorted_monomer_length_list = map(str, sorted(map(int, monomer_length_id_dict.keys())))

        for monomer_length in sorted_monomer_length_list:
            primer3_monomer_len_results = primer3_filtered_results.extract_records_by_ids(monomer_length_id_dict[monomer_length])
            primer3_monomer_len_results.write("%s.%s.res" % (filtered_results_file_splited_by_len_prefix, monomer_length))
            primer3_monomer_len_results.write_table_form("%s.%s.table_form.res" % (filtered_results_file_splited_by_len_prefix, monomer_length))
            primer3_monomer_len_results.write_table_form_with_alignments("%s.%s.table_form_with_aln.res" % (filtered_results_file_splited_by_len_prefix, monomer_length))

            primer3_monomer_len_results.write_table_form2("%s.%s.table_form2.res" % (filtered_results_file_splited_by_len_prefix, monomer_length))
            primer3_monomer_len_results.write_table_form2_short("%s.%s.table_form2_short.res" % (filtered_results_file_splited_by_len_prefix, monomer_length))

            stat_string = "STR monomer length %s bp: %i repeats with primers" % (str(monomer_length), len(primer3_monomer_len_results.records))
            print(stat_string)

            stat_fd.write(stat_string + "\n")

        stat_fd.close()
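
A hedged usage sketch of the whole pipeline, assuming pipeline is an instance of the defining class with trf_dir, primer3_dir and glistmaker_dir configured (file names hypothetical):

pipeline.primer_prediction_pipeline(
    "genome.fasta",
    "str_primers",                     # prefix for all intermediate and final files
    min_str_period=3, max_str_period=5,
    min_copy_number=20, min_perfect_copy_number=20,
    left_flank_len=200, right_flank_len=200,
    threads=8)
# writes per-monomer-length primer tables plus a summary in str_primers.stats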
Example #14
                    default="Alias",
                    help="Name of field in gff description to add aliases. "
                    "If this field is absent it will be created."
                    "Default: Alias")
parser.add_argument("-k",
                    "--key_column",
                    action="store",
                    dest="key_column",
                    type=int,
                    default=0,
                    help="Key column in synonym file(0-based). Default: 0")
parser.add_argument("-v",
                    "--value_column",
                    action="store",
                    dest="value_column",
                    type=int,
                    default=1,
                    help="Value column in synonym file(0-based). Default: 1")

args = parser.parse_args()

AnnotationsRoutines.add_alias_to_feature(
    args.input_gff,
    args.output_gff,
    args.syn_file,
    feature_type_list=args.feature_types,
    name_field_list=args.feature_name_fields,
    alias_field=args.alias_field,
    key_column=args.key_column,
    value_column=args.value_column)
Example #15
                    "--feature_types",
                    action="store",
                    dest="feature_types",
                    type=lambda s: s.split(","),
                    default=[
                        "mRNA",
                    ],
                    help="Comma-separated list of feature types to count. "
                    "Default: mRNA")
parser.add_argument("-d",
                    "--id_entry",
                    action="store",
                    dest="id_entry",
                    default="ID",
                    help="Id entry. Default: ID")
parser.add_argument("-p",
                    "--parental_id_entry",
                    action="store",
                    dest="parental_id_entry",
                    default="Parent",
                    help="Parental id entry. Default: Parent")

args = parser.parse_args()

AnnotationsRoutines.get_feature_to_parent_correspondence_from_gff(
    args.input_gff,
    args.output,
    feature_list=args.feature_types,
    id_entry=args.id_entry,
    parental_id_entry=args.parental_id_entry)
Example #16
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_gff",
                    action="store",
                    dest="input_gff",
                    required=True,
                    help="Input .gff file")
parser.add_argument("-o",
                    "--output_gff",
                    action="store",
                    dest="output_gff",
                    required=True,
                    help="Output .gff file")
parser.add_argument("-s",
                    "--syn_file_file",
                    action="store",
                    dest="syn_file",
                    required=True,
                    help="File with synonyms of region names")

args = parser.parse_args()

AnnotationsRoutines.replace_region_names_in_gff(args.input_gff, args.syn_file,
                                                args.output_gff)
Example #17
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_gff",
                    action="store",
                    dest="input_gff",
                    help="Input .gff file")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    help="Output prefix")
parser.add_argument(
    "-t",
    "--feature_types",
    action="store",
    dest="feature_types",
    type=lambda s: s.split(","),
    default=None,
    help="Comma-separated list of feature types to add aliases."
    "Default: all feature types")

args = parser.parse_args()

AnnotationsRoutines.get_feature_length_distribution_from_gff(
    args.input_gff, args.output_prefix, feature_list=args.feature_types)
Example #18
parser.add_argument(
    "-e",
    "--end_column_id",
    action="store",
    dest="end_column_id",
    type=int,
    default=2,
    help="0-based index of column with feature end. Default: 2")
parser.add_argument(
    "-n",
    "--coordinates_type",
    action="store",
    dest="coordinates_type",
    default="1-based",
    help="Type of coordinates. Allowed: 0-based, 1-based(default)")

args = parser.parse_args()

AnnotationsRoutines.merge_overlapping_feature_in_simple_format(
    args.input,
    args.scaffold_column_id,
    args.start_column_id,
    args.end_column_id,
    output_file=args.output,
    output_separator="\t",
    comments_prefix="#",
    input_separator="\t",
    coordinates_type=args.coordinates_type,
    return_seqfeature_dict=False,
    feature_type=None)
Example #19
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-c",
                    "--correspondence_file",
                    action="store",
                    dest="correspondence_file",
                    required=True,
                    help="File with correspondence of transcripts to genes")
parser.add_argument("-l",
                    "--length_file",
                    action="store",
                    dest="length_file",
                    required=True,
                    help="Length file")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Output prefix")

args = parser.parse_args()

AnnotationsRoutines.add_length_to_accordance_file(args.correspondence_file,
                                                  args.length_file,
                                                  args.output_prefix)
Example #20
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import sys
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-g",
                    "--gff",
                    action="store",
                    dest="gff",
                    required=True,
                    help="Gff file")

parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    help="Output file with ids. Default: stdout")

args = parser.parse_args()

if args.output is None:
    args.output = sys.stdout

AnnotationsRoutines.get_scaffold_ids_from_gff(args.gff, out_file=args.output)
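
Assuming the script is saved as get_scaffold_ids_from_gff.py (name hypothetical), an invocation that writes the IDs to a file instead of stdout:

python get_scaffold_ids_from_gff.py -g annotation.gff -o scaffold.ids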
Example #21
parser.add_argument("-g",
                    "--gff",
                    action="store",
                    dest="gff",
                    required=True,
                    help="Gff file")
parser.add_argument(
    "-f",
    "--features",
    action="store",
    dest="features",
    default=[],
    type=lambda s: s.split(","),
    help="Comma-separated list of features to count per scaffold. "
    "If not set all features will be counted")
parser.add_argument(
    "-o",
    "--output",
    action="store",
    dest="output",
    help="Output file with counts of features. Default: stdout")

args = parser.parse_args()

if args.output is None:
    args.output = sys.stdout

AnnotationsRoutines.count_per_scaffold_feature_number(
    args.gff, out_file=args.output, feature_type_list=args.features)
Example #22
parser.add_argument("-f",
                    "--value_file",
                    action="store",
                    dest="value_file",
                    required=True,
                    help="Value with values to seek for")
parser.add_argument("-o",
                    "--output_gff",
                    action="store",
                    dest="output_gff",
                    required=True,
                    help="Output .gff file")
parser.add_argument(
    "-d",
    "--description_fields",
    action="store",
    dest="field_id_list",
    type=lambda s: s.split(","),
    required=True,
    help="Comma-separated list of fields in gff description to check")

args = parser.parse_args()

value_list = IdList(filename=args.value_file)
AnnotationsRoutines.extract_gff_records_by_description_value(
    args.input_gff,
    args.output_gff,
    args.field_id_list,
    value_list,
    retain_comments=False)
Example #23
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-g",
                    "--gtf_file",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input gtf file")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output accordance file")

args = parser.parse_args()

AnnotationsRoutines.get_transcript_to_pep_accordance_from_gtf(
    args.input, args.output, comment_symbol="#")
Example #24
    action="store",
    dest="separator",
    default="_",
    help="Separator in chunk filename."
    "Chunks must be named as <prefix><separator><chunk number><suffix> . "
    "Default: '_'")
parser.add_argument("-n",
                    "--total_number_of_chunks",
                    action="store",
                    dest="number_of_chunks",
                    type=int,
                    required=True,
                    help="Total number of chunks")
parser.add_argument("-m",
                    "--min_chunk_size",
                    action="store",
                    dest="min_chunk_size",
                    type=int,
                    required=True,
                    help="Minimum size of chunk file.")

args = parser.parse_args()

AnnotationsRoutines.check_chunks(
    args.chunk_dir,
    args.number_of_chunks,
    args.min_chunk_size,
    separator=args.separator,
    chunk_filename_suffix=args.chunk_filename_suffix,
    chunk_filename_prefix=args.chunk_filename_prefix)
Example #25
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_gff",
                    action="store",
                    dest="input_gff",
                    help="Input .gff file")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    help="Output prefix")
parser.add_argument(
    "-t",
    "--feature_types",
    action="store",
    dest="feature_types",
    type=lambda s: s.split(","),
    default=None,
    help="Comma-separated list of feature types to add aliases. "
    "Default: all feature types")

args = parser.parse_args()

AnnotationsRoutines.count_total_feature_length_from_gff(
    args.input_gff, args.output_prefix, features_to_count=args.feature_types)
Example #26
                    dest="gff_file",
                    help="Gff file with annotations to extract")
parser.add_argument("-p",
                    "--parsing_mode",
                    action="store",
                    dest="parsing_mode",
                    default="parse",
                    help="Parsing mode for input sequence file. "
                    "Possible variants: 'index_db', 'index'(default), 'parse'")

args = parser.parse_args()

AnnotationsRoutines.extract_sequences_by_gff(args.input,
                                             args.gff_file,
                                             args.output,
                                             type_list=args.type,
                                             parsing_mode=args.parsing_mode,
                                             tmp_index_file="temp.idx",
                                             format=args.format)
"""
tmp_index_file = "temp.idx"
args.type = args.type.split(",")


annotations_dict = SeqIO.to_dict(GFF.parse(open(args.gff_file)))
print(annotations_dict)
print("Parsing %s..." % args.input)
sequence_dict = SequenceRoutines.parse_seq_file(args.input, args.parsing_mode, args.format, index_file=tmp_index_file ) # SeqIO.index_db(tmp_index_file, args.input_file, format=args.format)


SeqIO.write(SequenceRoutines.record_generator(annotations_dict, sequence_dict, args.type), args.output, format=args.format)
Example #27
    default="core_seq_coords",
    help="Key for description entry with coordinates of core sequence in new feature")

parser.add_argument("-l",
                    "--left_flank_len",
                    action="store",
                    dest="left_flank_len",
                    type=int,
                    default=200,
                    help="Length of left flank. Default: 200")
parser.add_argument("-r",
                    "--right_right_len",
                    action="store",
                    dest="right_flank_len",
                    type=int,
                    default=200,
                    help="Length of right flank. Default: 200")

args = parser.parse_args()

AnnotationsRoutines.add_flanks_to_gff_record(
    args.input_gff,
    args.output_prefix,
    args.left_flank_len,
    args.right_flank_len,
    args.fasta,
    coords_description_entry=args.coords_description_entry,
    id_description_entry=args.id_description_entry)
Example #28
    SequenceRoutines.extract_sequence_by_ids(
        output_pep, "%s.ids" %
        output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_or_hints_supported_transcripts_longest_pep)
    SequenceRoutines.extract_sequence_by_ids(
        output_pep, "%s.ids" %
        output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_longest_pep)

    for id_file in output_swissprot_pfam_or_hints_supported_transcripts_ids, \
                   output_swissprot_pfam_and_hints_supported_transcripts_ids, \
                   "%s.ids" % output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
                   "%s.ids" % output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence:
        out_pref = id_file[:-4]
        out_gff = "%s.gff" % out_pref
        AnnotationsRoutines.extract_transcripts_by_ids(output_gff, id_file,
                                                       out_gff)
        for suffix in ".trimmed.cds", ".transcript":
            SequenceRoutines.extract_sequence_by_ids(
                "%s%s" % (args.output, suffix), id_file,
                "%s%s" % (out_pref, suffix))

    HMMER3.intersect_ids_from_files(
        output_swissprot_pfam_or_hints_supported_transcripts_ids,
        cds_with_inframe_stop_codons_ids,
        output_swissprot_pfam_or_hints_supported_transcripts_inframe_stop_ids,
        mode="common")

    HMMER3.intersect_ids_from_files(
        output_swissprot_pfam_and_hints_supported_transcripts_ids,
        cds_with_inframe_stop_codons_ids,
        output_swissprot_pfam_and_hints_supported_transcripts_inframe_stop_ids,
Example #29
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Output .gff file")

parser.add_argument("-t",
                    "--feature_types",
                    action="store",
                    dest="feature_types",
                    type=lambda s: s.split(","),
                    default=["CDS"],
                    help="Comma-separated list of feature types to extract. "
                    "Default: CDS only")

parser.add_argument(
    "-u",
    "--unification_key",
    action="store",
    dest="unification_key",
    default="Parent",
    help="Annotation entry to use for unification. Default: Parent")

args = parser.parse_args()

AnnotationsRoutines.get_feature_dict(args.input_gff,
                                     output_prefix=args.output_prefix,
                                     feature_type_list=args.feature_types,
                                     unification_key=args.unification_key)