示例#1
0
    def filter_trf_gff(self,
                       input_gff,
                       output_gff,
                       filtered_out_gff,
                       min_period=None,
                       max_period=None,
                       min_copy_number=None,
                       max_copy_number=None,
                       pattern=None,
                       min_percentage_of_matches=None,
                       max_percentage_of_indels=None,
                       min_entropy=None,
                       max_entropy=None):
        """
        Filter a TRF gff file by thresholds applied to each record description.

        A predicate is built around ``self.gff_filtering_expression`` with all
        threshold arguments bound, and handed to
        ``AnnotationsRoutines.filter_gff_by_description`` together with the
        three file arguments.

        :param input_gff: gff file to filter
        :param output_gff: first output file passed to the filtering routine
            (presumably receives records accepted by the predicate - confirm
            against filter_gff_by_description)
        :param filtered_out_gff: second output file passed to the filtering
            routine (presumably receives the rejected records - confirm)
        :param min_period, max_period, min_copy_number, max_copy_number,
            pattern, min_percentage_of_matches, max_percentage_of_indels,
            min_entropy, max_entropy: forwarded unchanged, by keyword, to
            ``self.gff_filtering_expression``
        :return: None
        """
        # Collect the thresholds once; they are forwarded by keyword below.
        thresholds = {
            "min_period": min_period,
            "max_period": max_period,
            "min_copy_number": min_copy_number,
            "max_copy_number": max_copy_number,
            "pattern": pattern,
            "min_percentage_of_matches": min_percentage_of_matches,
            "max_percentage_of_indels": max_percentage_of_indels,
            "min_entropy": min_entropy,
            "max_entropy": max_entropy,
        }

        def filtering_expression(gff_description_dict):
            # Single-argument predicate expected by filter_gff_by_description
            return self.gff_filtering_expression(gff_description_dict,
                                                 **thresholds)

        AnnotationsRoutines.filter_gff_by_description(
            input_gff, output_gff, filtered_out_gff, filtering_expression)
示例#2
0
    def filter_trf_gff_by_exact_copy_number(input_gff,
                                            output_gff,
                                            filtered_out_gff,
                                            min_copy_number,
                                            perfect_tandem=False):
        """
        Filter a TRF gff file by the number of exact copies of the monomer.

        Each record description must provide the keys "Pattern" (the monomer)
        and "seq" (the repeat sequence). The selected predicate is passed to
        ``AnnotationsRoutines.filter_gff_by_description`` together with the
        three file arguments.

        :param input_gff: gff file to filter
        :param output_gff: first output file passed to the filtering routine
        :param filtered_out_gff: second output file passed to the filtering
            routine
        :param min_copy_number: minimal number of exact monomer copies
        :param perfect_tandem: if true, the copies must be contiguous, i.e.
            the monomer repeated min_copy_number times has to occur in "seq";
            otherwise the non-overlapping occurrences of the monomer anywhere
            in "seq" are counted
        :return: None
        """
        if perfect_tandem:

            def filtering_expression(gff_description_dict):
                # Uninterrupted run of min_copy_number exact monomers
                tandem = gff_description_dict["Pattern"] * min_copy_number
                return tandem in gff_description_dict["seq"]
        else:

            def filtering_expression(gff_description_dict):
                # str.count() reports non-overlapping exact occurrences
                occurrences = gff_description_dict["seq"].count(
                    gff_description_dict["Pattern"])
                return occurrences >= min_copy_number

        AnnotationsRoutines.filter_gff_by_description(
            input_gff, output_gff, filtered_out_gff, filtering_expression)
示例#3
0
    def correct_regions_from_gff(
            self,
            reference,
            variants_vcf,
            gff_file,
            output_prefix=None,
            feature_type_list=None,
            unification_key="Parent",
            vcf_with_masking=None,
            override_vcf_by_mask=None,
            use_ambiguous_nuccleotides=None):
        """
        Build corrected sequences of gff regions and report length changes
        that are not a multiple of three.

        Steps:
          1. ``AnnotationsRoutines.get_feature_dict`` groups features of
             ``gff_file`` by ``unification_key``.
          2. ``self.correct_reference`` writes the corrected fragments to
             "<output_prefix>.raw.seq", restricted to the intervals listed in
             "<output_prefix>.coordinates_only.list" (presumably produced by
             get_feature_dict - confirm).
          3. The fragments of every region are concatenated, reverse
             complemented unless the strand field of the first feature is "+",
             and written to "<output_prefix>.fasta".
          4. For every region, indexes of the fragments whose length differs
             from the annotated length by a value not divisible by 3 are
             written to "<output_prefix>.frameshifts.region.ids".

        :param reference: reference passed to self.correct_reference
        :param variants_vcf: vcf file passed to self.correct_reference
        :param gff_file: gff file with the features to extract
        :param output_prefix: prefix of all files produced by this method
        :param feature_type_list: feature types passed to get_feature_dict;
            None (default) means ["CDS"]
        :param unification_key: description key passed to get_feature_dict
        :param vcf_with_masking: forwarded to self.correct_reference
        :param override_vcf_by_mask: forwarded to self.correct_reference
        :param use_ambiguous_nuccleotides: forwarded to self.correct_reference
        :return: None
        """
        # A fresh list per call: a list literal in the signature would be a
        # single object shared by all calls (and exposed to the callee).
        if feature_type_list is None:
            feature_type_list = ["CDS"]

        feature_dict = AnnotationsRoutines.get_feature_dict(
            gff_file,
            output_prefix=output_prefix,
            feature_type_list=feature_type_list,
            unification_key=unification_key)
        region_file = "%s.coordinates_only.list" % output_prefix

        raw_regions = "%s.raw.seq" % output_prefix
        final_regions = "%s.fasta" % output_prefix

        regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

        # raw_seq_per_line is always True: the generator below relies on
        # reading exactly one line per feature fragment.
        self.correct_reference(
            reference,
            raw_regions,
            variants_vcf,
            raw_seq_per_line=True,
            vcf_with_masking=vcf_with_masking,
            override_vcf_by_mask=override_vcf_by_mask,
            use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
            interval_list=region_file)

        region_with_frameshift = SynDict()

        def new_regions_generator():
            # Fragments are consumed in the iteration order of feature_dict,
            # one line of the raw file per feature.
            with open(raw_regions, "r") as in_fd:
                for region_id in feature_dict:
                    features = feature_dict[region_id]
                    fragments = []
                    for index, feature in enumerate(features):
                        seq_fragment = in_fd.readline().strip()
                        # feature[1] and feature[2] are used as inclusive
                        # start and end coordinates of the feature
                        annotated_length = int(feature[2]) - int(feature[1]) + 1
                        if (annotated_length - len(seq_fragment)) % 3 != 0:
                            if region_id not in region_with_frameshift:
                                region_with_frameshift[region_id] = [index]
                            else:
                                region_with_frameshift[region_id].append(index)
                        fragments.append(seq_fragment)

                    sequence = Seq("".join(fragments))
                    # The strand is taken from the first feature of the region
                    if features[0][3] != "+":
                        sequence = sequence.reverse_complement()
                    yield SeqRecord(seq=sequence,
                                    id=region_id,
                                    description="")

        # region_with_frameshift is filled while the generator is consumed,
        # so it has to be written only after SeqIO.write() has finished.
        SeqIO.write(new_regions_generator(), final_regions, format="fasta")
        region_with_frameshift.write(regions_with_frameshift_file,
                                     splited_values=True)
示例#4
0
    def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
        """
        Write the "Period" value of every record of a TRF gff file, keyed by
        the record "ID", to len_file.

        Lines starting with "#" are skipped; every other line is parsed with
        ``AnnotationsRoutines.get_description_dict_from_gff_string``.

        :param trf_gff: path to the input gff file
        :param len_file: path passed to ``SynDict.write``
        :return: None
        """
        monomer_lengths = SynDict()

        with open(trf_gff, "r") as gff_handle:
            for record in gff_handle:
                if record.startswith("#"):
                    # comment / header line
                    continue
                attributes = AnnotationsRoutines.get_description_dict_from_gff_string(
                    record)
                monomer_lengths[attributes["ID"]] = attributes["Period"]

        monomer_lengths.write(len_file)
示例#5
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse

from Routines import AnnotationsRoutines

# Command-line interface: extract from a gff file the transcripts whose ids
# are listed in a file.
# All three paths are handed straight to the extraction routine, so they are
# declared as required: a missing option yields a usage error from argparse
# instead of None being passed on as a file name.
parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_gff",
                    action="store",
                    dest="input_gff",
                    required=True,
                    help="Gff file with annotations to extract")
parser.add_argument("-o",
                    "--output_gff",
                    action="store",
                    dest="output_gff",
                    required=True,
                    help="Output gff file with extracted transcripts")
parser.add_argument("-d",
                    "--ids_file",
                    action="store",
                    dest="ids_file",
                    required=True,
                    help="File with ids of transcripts to extract")

args = parser.parse_args()

# Argument order of the routine: input gff, ids file, output gff
AnnotationsRoutines.extract_transcripts_by_ids(args.input_gff, args.ids_file,
                                               args.output_gff)
示例#6
0
    SequenceRoutines.extract_sequence_by_ids(
        output_pep, "%s.ids" %
        output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_or_hints_supported_transcripts_longest_pep)
    SequenceRoutines.extract_sequence_by_ids(
        output_pep, "%s.ids" %
        output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_longest_pep)

    for id_file in output_swissprot_pfam_or_hints_supported_transcripts_ids, \
                   output_swissprot_pfam_and_hints_supported_transcripts_ids, \
                   "%s.ids" % output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
                   "%s.ids" % output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence:
        out_pref = id_file[:-4]
        out_gff = "%s.gff" % out_pref
        AnnotationsRoutines.extract_transcripts_by_ids(output_gff, id_file,
                                                       out_gff)
        for suffix in ".trimmed.cds", ".transcript":
            SequenceRoutines.extract_sequence_by_ids(
                "%s%s" % (args.output, suffix), id_file,
                "%s%s" % (out_pref, suffix))

    HMMER3.intersect_ids_from_files(
        output_swissprot_pfam_or_hints_supported_transcripts_ids,
        cds_with_inframe_stop_codons_ids,
        output_swissprot_pfam_or_hints_supported_transcripts_inframe_stop_ids,
        mode="common")

    HMMER3.intersect_ids_from_files(
        output_swissprot_pfam_and_hints_supported_transcripts_ids,
        cds_with_inframe_stop_codons_ids,
        output_swissprot_pfam_and_hints_supported_transcripts_inframe_stop_ids,
示例#7
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse

from Routines import AnnotationsRoutines

# Command-line interface: both paths are mandatory and are passed unchanged
# to AnnotationsRoutines.fix_gff_coordinates_order.
parser = argparse.ArgumentParser()

# "store" is the default argparse action, so it is not spelled out.
parser.add_argument("-i", "--input_gff", dest="input_gff", required=True,
                    help="input gff file")
parser.add_argument("-o", "--output_gff", dest="output_gff", required=True,
                    help="Output fixed gff file")

args = parser.parse_args()

AnnotationsRoutines.fix_gff_coordinates_order(
    args.input_gff,
    args.output_gff)
示例#8
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import os

import argparse

from Bio import SeqIO

from Routines import AnnotationsRoutines

# Command-line interface: reads a gtf file and writes an accordance file via
# AnnotationsRoutines.get_transcript_to_pep_accordance_from_gtf.
parser = argparse.ArgumentParser()

# "store" is the default argparse action, so it is not spelled out.
# Note that the gtf path is stored as args.input, not args.gtf_file.
parser.add_argument("-g", "--gtf_file", dest="input", required=True,
                    help="Input gtf file")
parser.add_argument("-o", "--output", dest="output", required=True,
                    help="Output accordance file")

args = parser.parse_args()

AnnotationsRoutines.get_transcript_to_pep_accordance_from_gtf(
    args.input,
    args.output,
    comment_symbol="#")
__author__ = 'Sergei F. Kliver'
import argparse

from Routines import AnnotationsRoutines

# Command-line interface: all three paths are mandatory and are passed to
# AnnotationsRoutines.replace_region_names_in_gff.
parser = argparse.ArgumentParser()

# "store" is the default argparse action, so it is not spelled out.
parser.add_argument("-i", "--input_gff", dest="input_gff", required=True,
                    help="Input .gff file")
parser.add_argument("-o", "--output_gff", dest="output_gff", required=True,
                    help="Output .gff file")
# The long flag is spelled "--syn_file_file"; the value is stored as
# args.syn_file.
parser.add_argument("-s", "--syn_file_file", dest="syn_file", required=True,
                    help="File with synonyms of region names")

args = parser.parse_args()

# Argument order of the routine: input gff, synonym file, output gff
AnnotationsRoutines.replace_region_names_in_gff(
    args.input_gff,
    args.syn_file,
    args.output_gff)
示例#10
0
__author__ = 'Sergei F. Kliver'
import argparse

from Routines import AnnotationsRoutines

# Command-line interface: all three options are mandatory and are passed to
# AnnotationsRoutines.fix_absent_feature_type_field.
parser = argparse.ArgumentParser()

# "store" is the default argparse action, so it is not spelled out.
parser.add_argument("-i", "--input_gff", dest="input_gff", required=True,
                    help="input gff file")
parser.add_argument("-o", "--output_gff", dest="output_gff", required=True,
                    help="Output fixed gff file")
parser.add_argument("-f", "--feature_type", dest="feature_type", required=True,
                    help="Feature type to use")

args = parser.parse_args()

# Argument order of the routine: input gff, output gff, feature type
AnnotationsRoutines.fix_absent_feature_type_field(
    args.input_gff,
    args.output_gff,
    args.feature_type)