示例#1
0
# Final annotation file names are derived from the common output prefix.
final_gff = "{0}.final.gff".format(args.output)
final_CDS_gff = "{0}.final.CDS.gff".format(args.output)

# Configure the AUGUSTUS wrapper before the prediction is launched.
AUGUSTUS.path = args.augustus_dir
AUGUSTUS.threads = args.threads

print("Annotating genes...")

# Keyword options of the prediction step, taken from the command line.
prediction_options = dict(
    strand=args.strand,
    gene_model=args.gene_model,
    output_gff3=True,
    other_options=args.other_options,
    config_dir=args.config_dir,
    use_softmasking=args.softmasking,
    hints_file=args.hintsfile,
    extrinsicCfgFile=args.extrinsicCfgFile,
    predict_UTR=args.predict_UTR,
    parsing_mode="parse",
)
AUGUSTUS.parallel_predict(args.species, args.input, output_raw_gff, **prediction_options)

# Replace the ids of the raw prediction, using the species prefix and 8-digit numbers.
AUGUSTUS.replace_augustus_ids(output_raw_gff, args.output,
                              species_prefix=args.species_prefix, number_of_digits_in_id=8)

Gffread.extract_transcript_sequences(output_gff, args.input, args.output)

SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(
示例#2
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Tools.Annotation import AUGUSTUS


# Command-line interface: converts a STAR junction file into a gff with intron hints.
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="File with junctions from STAR output")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="File to write gff with intron hints")
parser.add_argument("-m", "--min_supporting_uniquely_mapped_reads", action="store",
                    dest="min_supporting_uniquely_mapped_reads", default=1, type=int,
                    help="Minimum number of uniquely mapped reads supporting a junction to retain it. "
                         "Default: 1, i.e. only junctions supported by at least one uniquely mapped read "
                         "will be retained. To retain all junctions set to 0")
parser.add_argument("-s", "--source", action="store", dest="source", default="RNASEQ",
                    help="Source of hints. Default: RNASEQ")
parser.add_argument("-p", "--priority", action="store", dest="priority", default=100, type=int,
                    help="Priority of hints. Default: 100")
args = parser.parse_args()

# Pass the parsed priority so that -p/--priority actually takes effect
# (its default is 100, so the default behaviour is unchanged).
AUGUSTUS.convert_star_junctions_to_intron_hints(args.input, args.output,
                                                min_supporting_uniquely_mapped_reads=args.min_supporting_uniquely_mapped_reads,
                                                source=args.source, priority=args.priority)
示例#3
0
# Input gff files for the "other" hit classes.
input_other_top_gff = "{0}.other_top.target.gff".format(args.input_prefix)
input_other_secondary_gff = "{0}.other_secondary.target.gff".format(args.input_prefix)

# Output hint files, one per hit class.
precise_top_hints = "{0}.precise_top.target.hints.gff".format(args.output_prefix)
precise_secondary_hints = "{0}.precise_secondary.target.hints.gff".format(args.output_prefix)

other_top_hints = "{0}.other_top.target.hints.gff".format(args.output_prefix)
other_secondary_hints = "{0}.other_secondary.target.hints.gff".format(args.output_prefix)

AUGUSTUS.path = args.augustus_script_dir

# Options that are identical for every hit class.
shared_hint_options = dict(
    min_intron_len=args.min_intron_len,
    max_intron_len=args.max_intron_len,
    with_utrs=args.include_utr_hints,
)

# Full ("precise") top hits.
AUGUSTUS.exonerate_to_hints(input_precise_top_gff, precise_top_hints,
                            priority=args.full_top_hits_priority,
                            CDS_part_cutoff=args.full_top_hits_CDS_part_cutoff,
                            source=args.source_for_full_top_hits,
                            **shared_hint_options)

# Full ("precise") secondary hits.
AUGUSTUS.exonerate_to_hints(input_precise_secondary_gff, precise_secondary_hints,
                            priority=args.full_secondary_hits_priority,
                            CDS_part_cutoff=args.full_secondary_hits_CDS_part_cutoff,
                            source=args.source_for_full_secondary_hits,
                            **shared_hint_options)

AUGUSTUS.exonerate_to_hints(
示例#4
0
    def predict_genes(self,
                      output_prefix,
                      annotation_species_prefix,
                      genome_fasta,
                      augustus_species,
                      output_directory="./",
                      augustus_strand=None,
                      augustus_gene_model=None,
                      augustus_config_dir=None,
                      augustus_use_softmasking=None,
                      augustus_other_options="",
                      augustus_hintsfile=None,
                      augustus_extrinsicCfgFile=None,
                      augustus_predict_UTR=None,
                      augustus_min_intron_len=None,
                      threads=1,
                      augustus_dir="",
                      hmmer_dir="",
                      blast_dir="",
                      stop_codons_list=("TGA", "TAA", "TAG"),
                      genetic_code_table=1):
        """Predict genes with AUGUSTUS and post-process the resulting annotation.

        Steps performed by the visible code:
          1. configure the AUGUSTUS, HMMER3 and BLASTp wrappers (path, threads);
          2. call ``AUGUSTUS.parallel_predict`` for ``genome_fasta`` with
             ``<output_directory>/raw/<output_prefix>.raw.gff`` as output path;
          3. call ``AUGUSTUS.replace_augustus_ids`` with ``annotation_species_prefix``;
          4. extract transcript, CDS and peptide sequences via ``Gffread``;
          5. trim the CDS / remove terminal stop codons and translate the result;
          6. run further extraction, Pfam (hmmscan) and Swissprot (blastp) steps.

        :param output_prefix: basename used for all draft files.
        :param annotation_species_prefix: species prefix passed to ``replace_augustus_ids``.
        :param genome_fasta: genome sequence passed to AUGUSTUS and Gffread.
        :param augustus_species: species argument of ``AUGUSTUS.parallel_predict``.
        :param output_directory: directory in which ``raw/``, ``splited_input/`` and
            ``splited_output_dir`` paths are built.
        :param augustus_*: forwarded to the same-named options of ``AUGUSTUS.parallel_predict``.
        :param threads: thread count assigned to all three tool wrappers.
        :param augustus_dir, hmmer_dir, blast_dir: tool locations assigned to the wrappers.
        :param stop_codons_list: stop codons passed to ``trim_cds_and_remove_terminal_stop_codons``.
        :param genetic_code_table: genetic code passed to ``translate_sequences_from_file``.

        NOTE(review): step 6 uses names that are not defined anywhere in this method
        (``all_annotated_genes_ids``, ``CDS_gff``, ``output_pep``, ``output_evidence_stats``,
        ``output_supported_stats``, ``output_supported_stats_ids``, ``output_hmmscan``,
        ``output_domtblout``, ``output_pfam_*``, ``output_swissprot_*`` and ``args``).
        Unless they exist as module globals this part raises NameError; it looks like
        script code that was meant to be disabled - TODO confirm against the full file.
        """

        # All draft files share the "<output_directory>/raw/<output_prefix>" prefix.
        draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix)

        augustus_splited_input_dir = "%s/splited_input/" % output_directory
        augustus_splited_output_dir = "%s/splited_output_dir" % output_directory

        output_raw_gff = "%s.raw.gff" % draft_file_prefix
        # presumably written by replace_augustus_ids from the draft prefix - TODO confirm
        output_gff = "%s.renamed.gff" % draft_file_prefix
        # NOTE(review): augustus_pep is assigned but never used in the visible code.
        augustus_pep = "%s.pep" % draft_file_prefix

        # The tool wrappers are configured through class-level attributes.
        AUGUSTUS.path = augustus_dir
        AUGUSTUS.threads = threads
        HMMER3.path = hmmer_dir
        HMMER3.threads = threads
        BLASTp.path = blast_dir
        BLASTp.threads = threads

        print("Annotating genes...")
        AUGUSTUS.parallel_predict(
            augustus_species,
            genome_fasta,
            output_raw_gff,
            strand=augustus_strand,
            gene_model=augustus_gene_model,
            output_gff3=True,
            other_options=augustus_other_options,
            config_dir=augustus_config_dir,
            use_softmasking=augustus_use_softmasking,
            hints_file=augustus_hintsfile,
            split_dir=augustus_splited_input_dir,
            splited_output_dir=augustus_splited_output_dir,
            extrinsicCfgFile=augustus_extrinsicCfgFile,
            predict_UTR=augustus_predict_UTR,
            combine_output_to_single_file=True,
            min_intron_len=augustus_min_intron_len)

        # Signature reminder:
        # replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8)

        AUGUSTUS.replace_augustus_ids(output_raw_gff,
                                      draft_file_prefix,
                                      species_prefix=annotation_species_prefix,
                                      number_of_digits_in_id=8)
        # Signature reminder:
        # extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False)
        gffread_file_prefix = "%s.gffread" % draft_file_prefix
        gffread_transcripts_file, gffread_cds_file, gffread_pep_file = Gffread.extract_transcript_sequences(
            output_gff, genome_fasta, gffread_file_prefix)
        # Replace the last extension of the Gffread outputs with ".trimmed.cds" / ".trimmed.pep".
        gffread_trimmed_cds = ".".join(
            gffread_cds_file.split(".")[:-1]) + ".trimmed.cds"
        gffread_trimmed_pep = ".".join(
            gffread_pep_file.split(".")[:-1]) + ".trimmed.pep"
        self.trim_cds_and_remove_terminal_stop_codons(
            gffread_cds_file,
            gffread_trimmed_cds,
            stop_codons_list=stop_codons_list
        )  # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins
        inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix
        self.translate_sequences_from_file(
            gffread_trimmed_cds,
            gffread_trimmed_pep,
            format="fasta",
            id_expression=None,
            genetic_code_table=genetic_code_table,
            translate_to_stop=False,
            prefix_of_file_inframe_stop_codons_seqsin=
            inframe_stop_codons_file_prefix)  # Universal code !!!

        # NOTE(review): from here on the code relies on names that this method never
        # defines (see the docstring) - verify before relying on this part.
        AUGUSTUS.extract_gene_ids_from_output(output_gff,
                                              all_annotated_genes_ids)
        AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)

        print("Extracting peptides...")

        AUGUSTUS.extract_proteins_from_output(
            output_gff,
            output_pep,
            id_prefix="",
            evidence_stats_file=output_evidence_stats,
            supported_by_hints_file=output_supported_stats)

        self.compare_sequences_from_files(output_pep,
                                          "%s.trimmed.pep" % args.output,
                                          "comparison_of_peptides",
                                          format="fasta",
                                          verbose=True)

        # Write the second tab-separated column of the stats file, skipping the first line.
        os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" %
                  (output_supported_stats, output_supported_stats_ids))

        print("Annotating domains(Pfam database)...")

        HMMER3.parallel_hmmscan(
            args.pfam_db,
            output_pep,
            output_hmmscan,
            num_of_seqs_per_scan=None,
            split_dir="splited_hmmscan_fasta/",
            splited_output_dir="splited_hmmscan_output_dir",
            tblout_outfile=None,
            domtblout_outfile=output_domtblout,
            pfamtblout_outfile=None,
            splited_tblout_dir=None,
            splited_domtblout_dir="hmmscan_domtblout/")
        HMMER3.extract_dom_ids_hits_from_domtblout(
            output_domtblout, output_pfam_annotated_dom_ids)
        hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(
            output_domtblout, output_pfam_annotated_dom_names)
        supported_ids = IdSet(hits_dict.keys())
        supported_ids.write(output_pfam_supported_transcripts_ids)
        # Strip the ".t<number>" suffix from the ids and keep the unique, sorted result.
        remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
            output_pfam_supported_transcripts_ids,
            output_pfam_supported_genes_ids)
        os.system(remove_transcript_ids_str)

        print("Annotating peptides(Swissprot database)...")

        BLASTp.parallel_blastp(output_pep,
                               args.swissprot_db,
                               evalue=0.0000001,
                               output_format=6,
                               outfile=output_swissprot_blastp_hits,
                               split_dir="splited_blastp_fasta",
                               splited_output_dir="splited_blastp_output_dir")
        hits_dict = BLASTp.extract_hits_from_tbl_output(
            output_swissprot_blastp_hits, output_swissprot_blastp_hits_names)
        supported_ids = IdSet(hits_dict.keys())
        supported_ids.write(output_swissprot_supported_transcripts_ids)

        # Same ".t<number>" suffix removal as for the Pfam-supported ids.
        remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
            output_swissprot_supported_transcripts_ids,
            output_swissprot_supported_genes_ids)
        os.system(remove_transcript_ids_str)
        """
示例#5
0
    def prepare_rnaseq_hints(self,
                             rnaseq_read_dir,
                             rnaseq_alignment_dir,
                             genome_dir,
                             hints_dir,
                             genome_fasta=None,
                             samples=None,
                             annotation_gtf=None,
                             sjdboverhang=None,
                             genomeSAindexNbases=None,
                             genomeChrBinNbits=None,
                             genome_size=None,
                             feature_from_gtf_to_use_as_exon=None,
                             exon_tag_to_use_as_transcript_id=None,
                             exon_tag_to_use_as_gene_id=None,
                             length_of_sequences_flanking_junction=None,
                             junction_tab_file_list=None,
                             three_prime_trim=None,
                             five_prime_trim=None,
                             adapter_seq_for_three_prime_clip=None,
                             max_mismatch_percent_for_adapter_trimming=None,
                             three_prime_trim_after_adapter_clip=None,
                             output_type="BAM",
                             sort_bam=True,
                             max_memory_for_bam_sorting=8000000000,
                             include_unmapped_reads_in_bam=True,
                             output_unmapped_reads=True,
                             two_pass_mode=True,
                             max_intron_length=None,
                             min_reads_supporting_junction_hint=1,
                             source="RNASEQ",
                             priority=100,
                             threads=1,
                             STAR_path=""):
        """Align RNA-seq samples with STAR and convert their junctions to intron hints.

        For every sample the file ``<rnaseq_alignment_dir>/<sample>/SJ.out.tab`` is
        passed to ``AUGUSTUS.convert_star_junctions_to_intron_hints`` and the hints
        are written to ``<hints_dir>/rnaseq.hints.<sample>.gff``.

        :param rnaseq_read_dir: directory with reads; also used to detect the
            samples (via ``self.get_sample_list``) when ``samples`` is empty or None.
        :param rnaseq_alignment_dir: directory with per-sample alignment subdirectories.
        :param genome_dir: genome directory passed to ``STAR.align_samples``.
        :param hints_dir: directory for the per-sample hint gff files.
        :param samples: optional explicit list of samples to process.
        :param min_reads_supporting_junction_hint: read-support threshold passed to
            the junction-to-hint conversion.
        :param source: source label of the generated hints.
        :param priority: priority of the generated hints.
        :param threads: thread count assigned to the STAR wrapper.
        :param STAR_path: location assigned to the STAR wrapper.

        All remaining keyword parameters are forwarded unchanged to the same-named
        options of ``STAR.align_samples``.
        """
        # The STAR wrapper is configured through class-level attributes.
        STAR.threads = threads
        STAR.path = STAR_path

        samples = samples if samples else self.get_sample_list(rnaseq_read_dir)

        STAR.align_samples(
            rnaseq_read_dir,
            rnaseq_alignment_dir,
            genome_dir,
            genome_fasta=genome_fasta,
            samples=samples,
            annotation_gtf=annotation_gtf,
            sjdboverhang=sjdboverhang,
            genomeSAindexNbases=genomeSAindexNbases,
            genomeChrBinNbits=genomeChrBinNbits,
            genome_size=genome_size,
            feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
            exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
            exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
            length_of_sequences_flanking_junction=
            length_of_sequences_flanking_junction,
            junction_tab_file_list=junction_tab_file_list,
            three_prime_trim=three_prime_trim,
            five_prime_trim=five_prime_trim,
            adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
            max_mismatch_percent_for_adapter_trimming=
            max_mismatch_percent_for_adapter_trimming,
            three_prime_trim_after_adapter_clip=
            three_prime_trim_after_adapter_clip,
            output_type=output_type,
            sort_bam=sort_bam,
            max_memory_for_bam_sorting=max_memory_for_bam_sorting,
            include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
            output_unmapped_reads=output_unmapped_reads,
            two_pass_mode=two_pass_mode,
            max_intron_length=max_intron_length)

        for sample in samples:
            # The path must not carry trailing whitespace: "SJ.out.tab " is a
            # different (non-existent) file name when the file is opened directly.
            sample_junction_file = "%s/%s/SJ.out.tab" % (rnaseq_alignment_dir,
                                                         sample)
            sample_hint_gff = "%s/rnaseq.hints.%s.gff" % (hints_dir, sample)
            # NOTE(review): other callers may pass this threshold as
            # `min_supporting_uniquely_mapped_reads`; confirm the keyword name
            # against the signature of convert_star_junctions_to_intron_hints.
            AUGUSTUS.convert_star_junctions_to_intron_hints(
                sample_junction_file,
                sample_hint_gff,
                min_supporting_reads=min_reads_supporting_junction_hint,
                source=source,
                priority=priority)
示例#6
0
                    help="Output file with synonyms")
# Id layout options: mandatory prefix plus the width of the numeric part.
parser.add_argument("-p", "--id_prefix", action="store", dest="id_prefix", required=True,
                    help="Prefix of id")
parser.add_argument("-n", "--number_of_digits_in_number", action="store",
                    dest="number_of_digits_in_number", type=int, default=8,
                    help="Number of digits in id. Default - 8")
# Feature type whose entries receive synonyms.
parser.add_argument("-f", "--feature_type", action="store", dest="feature_type", default="gene",
                    help="Type of feature to assign synonyms. Default - 'gene'")

args = parser.parse_args()

synonym_options = dict(
    number_of_digits_in_number=args.number_of_digits_in_number,
    feature_type=args.feature_type,
)
AUGUSTUS.assign_synonyms_to_features_from_augustus_gff(args.input_gff, args.output, args.id_prefix,
                                                       **synonym_options)
示例#7
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse
from RouToolPa.Tools.Annotation import AUGUSTUS

# Command-line interface: both options are mandatory.
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input evidence file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")

args = parser.parse_args()

# Draw the figures for the evidence file, using the given output prefix.
evidence_file, figure_prefix = args.input, args.output_prefix
AUGUSTUS.draw_evidence_figures(evidence_file, figure_prefix)
示例#8
0
                    dest="input_gff",
                    required=True,
                    help="Input gff from AUGUSTUS")
# Output location and id layout options.
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-p", "--species_prefix", action="store", dest="species_prefix", required=True,
                    help="Species prefix to use in ids")
parser.add_argument("-n", "--number_of_digits_in_number", action="store",
                    dest="number_of_digits_in_number", type=int, default=8,
                    help="Number of digits in id. Default - 8")

args = parser.parse_args()

# Positional arguments: input gff, output prefix, species prefix.
synonym_arguments = (args.input_gff, args.output_prefix, args.species_prefix)
AUGUSTUS.assign_synonyms_to_annotations_from_augustus_gff(
    *synonym_arguments, number_of_digits_in_number=args.number_of_digits_in_number)
示例#9
0
# Output files: secondary hits plus the hint files of both hit classes.
secondary_hits_gff = "{0}.target.secondary_hits.gff".format(args.output_prefix)
top_hits_gff_hints = "{0}.target.top_hits.hints.gff".format(args.output_prefix)
secondary_hits_gff_hints = "{0}.target.secondary_hits.hints.gff".format(args.output_prefix)

# Split the input gff into top and secondary hits.
Exonerate.extract_top_hits_from_target_gff(args.input, top_hits_gff, secondary_hits_gff,
                                           id_white_list_file=args.white_id_file,
                                           max_hits_per_query=args.max_hits_per_query)

AUGUSTUS.path = args.augustus_script_dir

# Options that are identical for both hit classes.
shared_hint_options = dict(
    min_intron_len=args.min_intron_len,
    max_intron_len=args.max_intron_len,
    with_utrs=args.include_utr_hints,
)

# Hints from top hits.
AUGUSTUS.exonerate_to_hints(top_hits_gff, top_hits_gff_hints,
                            priority=args.top_hits_priority,
                            CDS_part_cutoff=args.top_hits_CDS_part_cutoff,
                            source=args.source_for_top_hits,
                            **shared_hint_options)

# Hints from secondary hits.
AUGUSTUS.exonerate_to_hints(secondary_hits_gff, secondary_hits_gff_hints,
                            priority=args.secondary_hits_priority,
                            CDS_part_cutoff=args.secondary_hits_CDS_part_cutoff,
                            source=args.source_for_secondary_hits,
                            **shared_hint_options)
示例#10
0
                    dest="id_prefix",
                    default="",
                    help="Prefix to use for protein ids")

# Optional report files.
parser.add_argument("-s", "--stat_file", action="store", dest="stat_file",
                    help="File to write statistics about annotations")
parser.add_argument("-u", "--supported_stat_file", action="store", dest="supp_stat_file",
                    help="File to write statistics about annotations supported by hints")
parser.add_argument("-c", "--complete_protein_ids", action="store", dest="complete_protein_id_file",
                    help="File to write ids of complete proteins")

args = parser.parse_args()

extraction_options = dict(
    id_prefix=args.id_prefix,
    evidence_stats_file=args.stat_file,
    supported_by_hints_file=args.supp_stat_file,
    complete_proteins_id_file=args.complete_protein_id_file,
)
AUGUSTUS.extract_proteins_from_output(args.input, args.output, **extraction_options)
示例#11
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Tools.Annotation import AUGUSTUS

# Command-line interface: both options are mandatory.
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Comma-separated list of input files with hints in gff")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with merged hints")

args = parser.parse_args()

# The input value is handed over as given on the command line.
hint_files, merged_hints = args.input, args.output
AUGUSTUS.join_multiple_hints(hint_files, merged_hints)
示例#12
0
# Mandatory input/output files.
parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input file with AUGUSTUS evidence")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file")
parser.add_argument("-d",
                    "--id_file",
                    action="store",
                    dest="id_file",
                    required=True,
                    help="File with ids to extract")
# The help text used to read "Prefix of output files", which does not describe
# this option: its value is passed as `mode` to AUGUSTUS.extract_evidence_by_ids.
parser.add_argument("-m",
                    "--mode",
                    action="store",
                    dest="mode",
                    default="transcript",
                    help="Extraction mode. Default - transcript")
args = parser.parse_args()

AUGUSTUS.extract_evidence_by_ids(args.input,
                                 args.id_file,
                                 args.output,
                                 mode=args.mode)
示例#13
0
import argparse
from RouToolPa.Tools.Annotation import AUGUSTUS

# Command-line interface: evidence file in, filtered evidence file out.
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input evidence file")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="File to write filtered evidence file")
# Threshold is parsed as float; without the option the value is 0.
parser.add_argument("-m", "--min_fraction", action="store", dest="min_fraction", default=0, type=float,
                    help="Minimum fraction of transcript supported by hints")

args = parser.parse_args()

evidence_file, filtered_file = args.input, args.output
AUGUSTUS.extract_longest_isoforms(evidence_file, filtered_file,
                                  minimum_supported_fraction=args.min_fraction)
示例#14
0
                    required=True,
                    help="Input AUGUSTUS GFF file")
# Output file and exon id layout options.
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff", required=True,
                    help="Output GFF with exon entries")
parser.add_argument("-e", "--exon_id_prefix", action="store", dest="exon_id_prefix", default="EXON",
                    help="Prefix of exon id. Default: EXON")
parser.add_argument("-n", "--id_digit_num", action="store", dest="id_digit_num", default=8, type=int,
                    help="Number of digits in exon id. Default: 8")

args = parser.parse_args()

# new_exon_numering is always switched off by this script.
exon_options = dict(
    number_of_digits_in_id=args.id_digit_num,
    exon_id_prefix=args.exon_id_prefix,
    new_exon_numering=False,
)
AUGUSTUS.add_exon_lines_to_augustus_gff(args.input_gff, args.output_gff, **exon_options)
示例#15
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Tools.Annotation import AUGUSTUS


# Command-line interface for replacing ids in a gff produced by AUGUSTUS.
parser = argparse.ArgumentParser()

parser.add_argument(
    "-i",
    "--input_gff",
    action="store",
    dest="input_gff",
    required=True,
    help="Input gff from AUGUSTUS",
)
parser.add_argument(
    "-o",
    "--output_prefix",
    action="store",
    dest="output_prefix",
    required=True,
    help="Prefix of output files",
)
parser.add_argument(
    "-s",
    "--species_prefix",
    action="store",
    dest="species_prefix",
    required=True,
    help="Species prefix for ids",
)
parser.add_argument(
    "-d",
    "--number_of_digits_in_id",
    action="store",
    dest="number_of_digits_in_id",
    type=int,
    default=8,
    help="Number of digits in ids. Default - 8",
)

args = parser.parse_args()

AUGUSTUS.replace_augustus_ids(
    args.input_gff,
    args.output_prefix,
    species_prefix=args.species_prefix,
    number_of_digits_in_id=args.number_of_digits_in_id,
)
示例#16
0
                    "--output_gff",
                    action="store",
                    dest="output_gff",
                    required=True,
                    help="Output gff with replaced ids")
# Synonym files: gene and transcript synonyms are mandatory, CDS synonyms are optional.
parser.add_argument("-g", "--gene_syn_file", action="store", dest="gene_syn_file", required=True,
                    help="File with gene synonyms")
parser.add_argument("-t", "--transcript_syn_file", action="store", dest="transcript_syn_file",
                    required=True,
                    help="File with transcript synonyms")
parser.add_argument("-c", "--cds_syn_file", action="store", dest="cds_syn_file",
                    help="File with CDS synonyms")

args = parser.parse_args()

# Positional arguments: input gff, output gff, gene synonyms, transcript synonyms.
replacement_arguments = (args.input_gff, args.output_gff, args.gene_syn_file, args.transcript_syn_file)
AUGUSTUS.replace_augustus_ids_by_syn(*replacement_arguments, cds_syn_file=args.cds_syn_file)