final_gff = "%s.final.gff" % args.output final_CDS_gff = "%s.final.CDS.gff" % args.output AUGUSTUS.path = args.augustus_dir AUGUSTUS.threads = args.threads print("Annotating genes...") AUGUSTUS.parallel_predict(args.species, args.input, output_raw_gff, strand=args.strand, gene_model=args.gene_model, output_gff3=True, other_options=args.other_options, config_dir=args.config_dir, use_softmasking=args.softmasking, hints_file=args.hintsfile, extrinsicCfgFile=args.extrinsicCfgFile, predict_UTR=args.predict_UTR, parsing_mode="parse") AUGUSTUS.replace_augustus_ids(output_raw_gff, args.output, species_prefix=args.species_prefix, number_of_digits_in_id=8) Gffread.extract_transcript_sequences(output_gff, args.input, args.output) SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(
#!/usr/bin/env python
# Command-line wrapper around AUGUSTUS.convert_star_junctions_to_intron_hints:
# reads a STAR junction file and writes a gff with intron hints.
__author__ = 'Sergei F. Kliver'
import argparse

from RouToolPa.Tools.Annotation import AUGUSTUS

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="File with junctions from STAR output")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="File to write gff with intron hints")
parser.add_argument("-m", "--min_supporting_uniquely_mapped_reads", action="store",
                    dest="min_supporting_uniquely_mapped_reads", default=1, type=int,
                    help="Minimum number of uniquely mapped reads supporting a junction "
                         "required to retain it. Default: 1, i.e. only junctions supported "
                         "by at least one uniquely mapped read will be retained. "
                         "To retain all junctions set to 0")
parser.add_argument("-s", "--source", action="store", dest="source", default="RNASEQ",
                    help="Source of hints. Default: RNASEQ")
parser.add_argument("-p", "--priority", action="store", dest="priority", default=100, type=int,
                    help="Priority of hints. Default: 100")

args = parser.parse_args()

# Forward the parsed priority: it used to be hard-coded to 100 here, which
# silently ignored whatever the user passed via -p/--priority.
AUGUSTUS.convert_star_junctions_to_intron_hints(
    args.input, args.output,
    min_supporting_uniquely_mapped_reads=args.min_supporting_uniquely_mapped_reads,
    source=args.source,
    priority=args.priority)
input_other_top_gff = "%s.other_top.target.gff" % args.input_prefix input_other_secondary_gff = "%s.other_secondary.target.gff" % args.input_prefix precise_top_hints = "%s.precise_top.target.hints.gff" % args.output_prefix precise_secondary_hints = "%s.precise_secondary.target.hints.gff" % args.output_prefix other_top_hints = "%s.other_top.target.hints.gff" % args.output_prefix other_secondary_hints = "%s.other_secondary.target.hints.gff" % args.output_prefix AUGUSTUS.path = args.augustus_script_dir AUGUSTUS.exonerate_to_hints(input_precise_top_gff, precise_top_hints, priority=args.full_top_hits_priority, min_intron_len=args.min_intron_len, max_intron_len=args.max_intron_len, CDS_part_cutoff=args.full_top_hits_CDS_part_cutoff, source=args.source_for_full_top_hits, with_utrs=args.include_utr_hints) AUGUSTUS.exonerate_to_hints( input_precise_secondary_gff, precise_secondary_hints, priority=args.full_secondary_hits_priority, min_intron_len=args.min_intron_len, max_intron_len=args.max_intron_len, CDS_part_cutoff=args.full_secondary_hits_CDS_part_cutoff, source=args.source_for_full_secondary_hits, with_utrs=args.include_utr_hints) AUGUSTUS.exonerate_to_hints(
def predict_genes(self, output_prefix, annotation_species_prefix, genome_fasta, augustus_species, output_directory="./", augustus_strand=None, augustus_gene_model=None, augustus_config_dir=None, augustus_use_softmasking=None, augustus_other_options="", augustus_hintsfile=None, augustus_extrinsicCfgFile=None, augustus_predict_UTR=None, augustus_min_intron_len=None, threads=1, augustus_dir="", hmmer_dir="", blast_dir="", stop_codons_list=("TGA", "TAA", "TAG"), genetic_code_table=1): draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix) augustus_splited_input_dir = "%s/splited_input/" % output_directory augustus_splited_output_dir = "%s/splited_output_dir" % output_directory output_raw_gff = "%s.raw.gff" % draft_file_prefix output_gff = "%s.renamed.gff" % draft_file_prefix augustus_pep = "%s.pep" % draft_file_prefix AUGUSTUS.path = augustus_dir AUGUSTUS.threads = threads HMMER3.path = hmmer_dir HMMER3.threads = threads BLASTp.path = blast_dir BLASTp.threads = threads print("Annotating genes...") AUGUSTUS.parallel_predict( augustus_species, genome_fasta, output_raw_gff, strand=augustus_strand, gene_model=augustus_gene_model, output_gff3=True, other_options=augustus_other_options, config_dir=augustus_config_dir, use_softmasking=augustus_use_softmasking, hints_file=augustus_hintsfile, split_dir=augustus_splited_input_dir, splited_output_dir=augustus_splited_output_dir, extrinsicCfgFile=augustus_extrinsicCfgFile, predict_UTR=augustus_predict_UTR, combine_output_to_single_file=True, min_intron_len=augustus_min_intron_len) #replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8): AUGUSTUS.replace_augustus_ids(output_raw_gff, draft_file_prefix, species_prefix=annotation_species_prefix, number_of_digits_in_id=8) #extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False) gffread_file_prefix = "%s.gffread" % draft_file_prefix gffread_transcripts_file, gffread_cds_file, 
gffread_pep_file = Gffread.extract_transcript_sequences( output_gff, genome_fasta, gffread_file_prefix) gffread_trimmed_cds = ".".join( gffread_cds_file.split(".")[:-1]) + ".trimmed.cds" gffread_trimmed_pep = ".".join( gffread_pep_file.split(".")[:-1]) + ".trimmed.pep" self.trim_cds_and_remove_terminal_stop_codons( gffread_cds_file, gffread_trimmed_cds, stop_codons_list=stop_codons_list ) # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix self.translate_sequences_from_file( gffread_trimmed_cds, gffread_trimmed_pep, format="fasta", id_expression=None, genetic_code_table=genetic_code_table, translate_to_stop=False, prefix_of_file_inframe_stop_codons_seqsin= inframe_stop_codons_file_prefix) # Universal code !!! AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids) AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff) print("Extracting peptides...") AUGUSTUS.extract_proteins_from_output( output_gff, output_pep, id_prefix="", evidence_stats_file=output_evidence_stats, supported_by_hints_file=output_supported_stats) self.compare_sequences_from_files(output_pep, "%s.trimmed.pep" % args.output, "comparison_of_peptides", format="fasta", verbose=True) os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" % (output_supported_stats, output_supported_stats_ids)) print("Annotating domains(Pfam database)...") HMMER3.parallel_hmmscan( args.pfam_db, output_pep, output_hmmscan, num_of_seqs_per_scan=None, split_dir="splited_hmmscan_fasta/", splited_output_dir="splited_hmmscan_output_dir", tblout_outfile=None, domtblout_outfile=output_domtblout, pfamtblout_outfile=None, splited_tblout_dir=None, splited_domtblout_dir="hmmscan_domtblout/") HMMER3.extract_dom_ids_hits_from_domtblout( output_domtblout, output_pfam_annotated_dom_ids) hits_dict = HMMER3.extract_dom_names_hits_from_domtblout( output_domtblout, 
output_pfam_annotated_dom_names) supported_ids = IdSet(hits_dict.keys()) supported_ids.write(output_pfam_supported_transcripts_ids) remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % ( output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids) os.system(remove_transcript_ids_str) print("Annotating peptides(Swissprot database)...") BLASTp.parallel_blastp(output_pep, args.swissprot_db, evalue=0.0000001, output_format=6, outfile=output_swissprot_blastp_hits, split_dir="splited_blastp_fasta", splited_output_dir="splited_blastp_output_dir") hits_dict = BLASTp.extract_hits_from_tbl_output( output_swissprot_blastp_hits, output_swissprot_blastp_hits_names) supported_ids = IdSet(hits_dict.keys()) supported_ids.write(output_swissprot_supported_transcripts_ids) remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % ( output_swissprot_supported_transcripts_ids, output_swissprot_supported_genes_ids) os.system(remove_transcript_ids_str) """
def prepare_rnaseq_hints(self, rnaseq_read_dir, rnaseq_alignment_dir, genome_dir, hints_dir,
                         genome_fasta=None, samples=None, annotation_gtf=None, sjdboverhang=None,
                         genomeSAindexNbases=None, genomeChrBinNbits=None, genome_size=None,
                         feature_from_gtf_to_use_as_exon=None,
                         exon_tag_to_use_as_transcript_id=None,
                         exon_tag_to_use_as_gene_id=None,
                         length_of_sequences_flanking_junction=None,
                         junction_tab_file_list=None, three_prime_trim=None, five_prime_trim=None,
                         adapter_seq_for_three_prime_clip=None,
                         max_mismatch_percent_for_adapter_trimming=None,
                         three_prime_trim_after_adapter_clip=None, output_type="BAM",
                         sort_bam=True, max_memory_for_bam_sorting=8000000000,
                         include_unmapped_reads_in_bam=True, output_unmapped_reads=True,
                         two_pass_mode=True, max_intron_length=None,
                         min_reads_supporting_junction_hint=1, source="RNASEQ", priority=100,
                         threads=1, STAR_path=""):
    """
    Align RNA-seq samples with STAR and convert each sample's junctions to hints.

    Configures STAR (threads, path), runs STAR.align_samples, and then, for every
    sample, converts "<rnaseq_alignment_dir>/<sample>/SJ.out.tab" into
    "<hints_dir>/rnaseq.hints.<sample>.gff" with
    AUGUSTUS.convert_star_junctions_to_intron_hints.

    :param rnaseq_read_dir: passed to STAR.align_samples; also handed to
        self.get_sample_list to obtain the samples when ``samples`` is empty or None.
    :param rnaseq_alignment_dir: passed to STAR.align_samples; per-sample junction
        files are looked up in its "<sample>" subdirectories.
    :param genome_dir: passed to STAR.align_samples.
    :param hints_dir: directory used to build the per-sample hint gff paths
        (it is not created by this method).
    :param samples: iterable of sample names; falsy value triggers auto-detection.
    :param min_reads_supporting_junction_hint: forwarded to the junction-to-hint
        converter.
    :param source: forwarded to the converter as ``source``.
    :param priority: forwarded to the converter as ``priority``.
    :param threads: assigned to STAR.threads.
    :param STAR_path: assigned to STAR.path.

    Every other keyword parameter is forwarded unchanged, under the same name,
    to STAR.align_samples.

    :return: None
    """
    STAR.threads = threads
    STAR.path = STAR_path

    if not samples:
        samples = self.get_sample_list(rnaseq_read_dir)

    STAR.align_samples(
        rnaseq_read_dir, rnaseq_alignment_dir, genome_dir,
        genome_fasta=genome_fasta,
        samples=samples,
        annotation_gtf=annotation_gtf,
        sjdboverhang=sjdboverhang,
        genomeSAindexNbases=genomeSAindexNbases,
        genomeChrBinNbits=genomeChrBinNbits,
        genome_size=genome_size,
        feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
        exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
        exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
        length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
        junction_tab_file_list=junction_tab_file_list,
        three_prime_trim=three_prime_trim,
        five_prime_trim=five_prime_trim,
        adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
        max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
        three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
        output_type=output_type,
        sort_bam=sort_bam,
        max_memory_for_bam_sorting=max_memory_for_bam_sorting,
        include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
        output_unmapped_reads=output_unmapped_reads,
        two_pass_mode=two_pass_mode,
        max_intron_length=max_intron_length)

    for sample in samples:
        # The path template previously ended with a stray space ("SJ.out.tab "),
        # which produced a file name that does not match the junction file.
        sample_junction_file = "%s/%s/SJ.out.tab" % (rnaseq_alignment_dir, sample)
        sample_hint_gff = "%s/rnaseq.hints.%s.gff" % (hints_dir, sample)

        # NOTE(review): the threshold is passed as `min_supporting_reads`; confirm
        # this keyword name against the converter's actual signature.
        AUGUSTUS.convert_star_junctions_to_intron_hints(
            sample_junction_file, sample_hint_gff,
            min_supporting_reads=min_reads_supporting_junction_hint,
            source=source,
            priority=priority)
help="Output file with synonyms") parser.add_argument("-p", "--id_prefix", action="store", dest="id_prefix", required=True, help="Prefix of id") parser.add_argument("-n", "--number_of_digits_in_number", action="store", dest="number_of_digits_in_number", type=int, default=8, help="Number of digits in id. Default - 8") parser.add_argument( "-f", "--feature_type", action="store", dest="feature_type", default="gene", help="Type of feature to assign synonyms. Default - 'gene'") args = parser.parse_args() AUGUSTUS.assign_synonyms_to_features_from_augustus_gff( args.input_gff, args.output, args.id_prefix, number_of_digits_in_number=args.number_of_digits_in_number, feature_type=args.feature_type)
#!/usr/bin/env python
# Command-line wrapper: passes an input evidence file and an output prefix
# to AUGUSTUS.draw_evidence_figures.
__author__ = 'Sergei F. Kliver'
import argparse

from RouToolPa.Tools.Annotation import AUGUSTUS

parser = argparse.ArgumentParser()

# Both options are mandatory; "store" is argparse's default action, so it is
# not spelled out.
for _flags, _dest, _help in (
        (("-i", "--input"), "input", "Input evidence file"),
        (("-o", "--output_prefix"), "output_prefix", "Prefix of output files"),
):
    parser.add_argument(*_flags, dest=_dest, required=True, help=_help)

args = parser.parse_args()

AUGUSTUS.draw_evidence_figures(args.input, args.output_prefix)
dest="input_gff", required=True, help="Input gff from AUGUSTUS") parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True, help="Prefix of output files") parser.add_argument("-p", "--species_prefix", action="store", dest="species_prefix", required=True, help="Species prefix to use in ids") parser.add_argument("-n", "--number_of_digits_in_number", action="store", dest="number_of_digits_in_number", type=int, default=8, help="Number of digits in id. Default - 8") args = parser.parse_args() AUGUSTUS.assign_synonyms_to_annotations_from_augustus_gff( args.input_gff, args.output_prefix, args.species_prefix, number_of_digits_in_number=args.number_of_digits_in_number)
# Remaining output paths, all derived from the output prefix.
_prefix = args.output_prefix
secondary_hits_gff = "%s.target.secondary_hits.gff" % _prefix
top_hits_gff_hints = "%s.target.top_hits.hints.gff" % _prefix
secondary_hits_gff_hints = "%s.target.secondary_hits.hints.gff" % _prefix

# Split the exonerate target gff into top and secondary hits.
Exonerate.extract_top_hits_from_target_gff(
    args.input,
    top_hits_gff,
    secondary_hits_gff,
    id_white_list_file=args.white_id_file,
    max_hits_per_query=args.max_hits_per_query)

AUGUSTUS.path = args.augustus_script_dir

# Options shared by both hint conversions below.
_common_hint_options = dict(min_intron_len=args.min_intron_len,
                            max_intron_len=args.max_intron_len,
                            with_utrs=args.include_utr_hints)

# Top hits -> hints.
AUGUSTUS.exonerate_to_hints(
    top_hits_gff,
    top_hits_gff_hints,
    priority=args.top_hits_priority,
    CDS_part_cutoff=args.top_hits_CDS_part_cutoff,
    source=args.source_for_top_hits,
    **_common_hint_options)

# Secondary hits -> hints.
AUGUSTUS.exonerate_to_hints(
    secondary_hits_gff,
    secondary_hits_gff_hints,
    priority=args.secondary_hits_priority,
    CDS_part_cutoff=args.secondary_hits_CDS_part_cutoff,
    source=args.source_for_secondary_hits,
    **_common_hint_options)
dest="id_prefix", default="", help="Prefix to use for protein ids") parser.add_argument("-s", "--stat_file", action="store", dest="stat_file", help="File to write statistics about annotations") parser.add_argument( "-u", "--supported_stat_file", action="store", dest="supp_stat_file", help="File to write statistics about annotations supported by hints") parser.add_argument("-c", "--complete_protein_ids", action="store", dest="complete_protein_id_file", help="File to write ids of complete proteins") args = parser.parse_args() AUGUSTUS.extract_proteins_from_output( args.input, args.output, id_prefix=args.id_prefix, evidence_stats_file=args.stat_file, supported_by_hints_file=args.supp_stat_file, complete_proteins_id_file=args.complete_protein_id_file)
#!/usr/bin/env python
# Command-line wrapper: hands a comma-separated list of hint gff files and an
# output path to AUGUSTUS.join_multiple_hints.
__author__ = 'Sergei F. Kliver'
import argparse

from RouToolPa.Tools.Annotation import AUGUSTUS

parser = argparse.ArgumentParser()

# "store" is argparse's default action and the destination is derived from the
# long option name, so neither needs to be given explicitly.
parser.add_argument(
    "-i", "--input",
    required=True,
    help="Comma-separated list of input files with hints in gff")
parser.add_argument(
    "-o", "--output",
    required=True,
    help="Output file with merged hints")

args = parser.parse_args()

AUGUSTUS.join_multiple_hints(args.input, args.output)
# Options for extracting AUGUSTUS evidence records by id.
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input file with AUGUSTUS evidence")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True,
                    help="File with ids to extract")
# The help text used to read "Prefix of output files", a copy-paste leftover:
# this value is handed to extract_evidence_by_ids as its `mode` argument.
parser.add_argument("-m", "--mode", action="store", dest="mode", default="transcript",
                    help="Mode of extraction. Default - transcript")

args = parser.parse_args()

# Note the argument order expected by the callee: input, id file, output.
AUGUSTUS.extract_evidence_by_ids(args.input, args.id_file, args.output,
                                 mode=args.mode)
import argparse

from RouToolPa.Tools.Annotation import AUGUSTUS

# Command-line wrapper around AUGUSTUS.extract_longest_isoforms.
parser = argparse.ArgumentParser()

# "store" is argparse's default action and each destination equals the long
# option name, so both are left implicit.
parser.add_argument("-i", "--input", required=True,
                    help="Input evidence file")
parser.add_argument("-o", "--output", required=True,
                    help="File to write filtered evidence file")
parser.add_argument("-m", "--min_fraction", type=float, default=0,
                    help="Minimum fraction of transcript supported by hints")

args = parser.parse_args()

AUGUSTUS.extract_longest_isoforms(
    args.input,
    args.output,
    minimum_supported_fraction=args.min_fraction)
required=True, help="Input AUGUSTUS GFF file") parser.add_argument("-o", "--output_gff", action="store", dest="output_gff", required=True, help="Output GFF with exon entries") parser.add_argument("-e", "--exon_id_prefix", action="store", dest="exon_id_prefix", default="EXON", help="Prefix of exon id. Default: EXON") parser.add_argument("-n", "--id_digit_num", action="store", dest="id_digit_num", default=8, type=int, help="Number of digits in exon id. Default: 8") args = parser.parse_args() AUGUSTUS.add_exon_lines_to_augustus_gff( args.input_gff, args.output_gff, number_of_digits_in_id=args.id_digit_num, exon_id_prefix=args.exon_id_prefix, new_exon_numering=False)
#!/usr/bin/env python
# Command-line wrapper around AUGUSTUS.replace_augustus_ids.
__author__ = 'Sergei F. Kliver'
import argparse

from RouToolPa.Tools.Annotation import AUGUSTUS

parser = argparse.ArgumentParser()

# Mandatory string options; "store" is argparse's default action.
for _flags, _dest, _help in (
        (("-i", "--input_gff"), "input_gff", "Input gff from AUGUSTUS"),
        (("-o", "--output_prefix"), "output_prefix", "Prefix of output files"),
        (("-s", "--species_prefix"), "species_prefix", "Species prefix for ids"),
):
    parser.add_argument(*_flags, dest=_dest, required=True, help=_help)

# Optional width of the numeric part of the generated ids.
parser.add_argument("-d", "--number_of_digits_in_id",
                    dest="number_of_digits_in_id",
                    type=int,
                    default=8,
                    help="Number of digits in ids. Default - 8")

args = parser.parse_args()

AUGUSTUS.replace_augustus_ids(
    args.input_gff,
    args.output_prefix,
    species_prefix=args.species_prefix,
    number_of_digits_in_id=args.number_of_digits_in_id)
"--output_gff", action="store", dest="output_gff", required=True, help="Output gff with replaced ids") parser.add_argument("-g", "--gene_syn_file", action="store", dest="gene_syn_file", required=True, help="File with gene synonyms") parser.add_argument("-t", "--transcript_syn_file", action="store", dest="transcript_syn_file", required=True, help="File with transcript synonyms") parser.add_argument("-c", "--cds_syn_file", action="store", dest="cds_syn_file", help="File with CDS synonyms") args = parser.parse_args() AUGUSTUS.replace_augustus_ids_by_syn(args.input_gff, args.output_gff, args.gene_syn_file, args.transcript_syn_file, cds_syn_file=args.cds_syn_file)