示例#1
0
文件: STAR.py 项目: mahajrod/Pantera
    def index(self,
              genome_dir,
              genome_fasta,
              annotation_gtf=None,
              junction_tab_file=None,
              sjdboverhang=None,
              genomeSAindexNbases=None,
              genomeChrBinNbits=None,
              genome_size=None):
        """
        Build a genome index by running the tool with ``--runMode genomeGenerate``.

        :param genome_dir: directory for the index; it is created with
            FileRoutines.safe_mkdir and passed as an absolute path.
        :param genome_fasta: path to a fasta file (str) or an iterable of such
            paths; every path is converted to an absolute path.
        :param annotation_gtf: optional file passed as --sjdbGTFfile.
        :param junction_tab_file: optional file passed as --sjdbFileChrStartEnd.
            A list or tuple of files is accepted too and is written as
            space-separated paths.
        :param sjdboverhang: optional integer passed as --sjdbOverhang.
        :param genomeSAindexNbases: optional integer passed as
            --genomeSAindexNbases; ignored when genome_size is given.
        :param genomeChrBinNbits: optional integer passed as --genomeChrBinNbits.
        :param genome_size: optional genome length; when given,
            --genomeSAindexNbases is derived from it as
            min(14, floor(log2(genome_size) / 2) - 1).
        :return: None; the assembled option string is handed to self.execute.
        """
        FileRoutines.safe_mkdir(genome_dir)

        if isinstance(genome_fasta, str):
            fasta_paths = os.path.abspath(genome_fasta)
        else:
            fasta_paths = " ".join(map(os.path.abspath, genome_fasta))

        options = "--runMode genomeGenerate"
        options += " --genomeDir %s" % os.path.abspath(genome_dir)
        options += " --runThreadN %i" % self.threads
        options += " --genomeFastaFiles %s" % fasta_paths

        if annotation_gtf:
            options += " --sjdbGTFfile %s" % annotation_gtf

        if junction_tab_file:
            # A list/tuple formatted with a bare %s would put its Python repr
            # (brackets, quotes, commas) on the command line, so it is joined
            # explicitly. Any other value is formatted exactly as before.
            if isinstance(junction_tab_file, (list, tuple)):
                junction_files = " ".join(junction_tab_file)
            else:
                junction_files = junction_tab_file
            options += " --sjdbFileChrStartEnd %s" % junction_files

        if sjdboverhang:
            # number of bases taken from both sides of splice junction. 100 by default
            options += " --sjdbOverhang %i" % sjdboverhang

        if genome_size:
            # genome_size takes priority over an explicit genomeSAindexNbases
            options += " --genomeSAindexNbases %i" % min(
                [14, (floor(log(genome_size, 2) / 2)) - 1])
        elif genomeSAindexNbases:
            # size of k-mers used for preindexing of suffix array
            options += " --genomeSAindexNbases %i" % genomeSAindexNbases

        if genomeChrBinNbits:
            # padding size (log2) of reference sequences. 18 by default
            # recommended value min(18, log2(GenomeLength/NumberOfScaffolds))
            options += " --genomeChrBinNbits %i" % genomeChrBinNbits

        self.execute(options)
示例#2
0
    def parallel_blast(self,
                       blast_command,
                       seqfile,
                       database,
                       outfile=None,
                       blast_options=None,
                       split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None,
                       output_format=None,
                       threads=None,
                       num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):
        """
        Split a fasta file into pieces, run blast_command on every piece through
        self.parallel_execute and optionally concatenate the per-piece results.

        :param blast_command: command passed to self.parallel_execute as cmd.
        :param seqfile: fasta file handed to self.split_fasta.
        :param database: value of the -db option.
        :param outfile: destination for the concatenated hits (CGAS.cat output).
        :param blast_options: extra option string inserted verbatim.
        :param split_dir: directory receiving the pieces of seqfile.
        :param splited_output_dir: directory receiving one ".hits" file per piece.
        :param evalue: value of the -evalue option (skipped if falsy).
        :param output_format: integer value of the -outfmt option (skipped if falsy).
        :param threads: forwarded to self.parallel_execute; also used to choose
            the number of pieces when num_of_seqs_per_scan is not set.
        :param num_of_seqs_per_scan: despite the name, this value is passed to
            self.split_fasta as num_of_files.
        :param combine_output_to_single_file: concatenate the hit files if true.
        :param async_run: forwarded to self.parallel_execute.
        :param external_process_pool: forwarded to self.parallel_execute.
        """
        fasta_pieces_dir = FileRoutines.check_path(split_dir)
        hits_pieces_dir = FileRoutines.check_path(splited_output_dir)
        self.safe_mkdir(fasta_pieces_dir)
        self.safe_mkdir(hits_pieces_dir)

        # Number of pieces: explicit value, else five per requested thread,
        # else five per thread configured on the instance.
        if num_of_seqs_per_scan:
            number_of_files = num_of_seqs_per_scan
        elif threads:
            number_of_files = 5 * threads
        else:
            number_of_files = 5 * self.threads
        self.split_fasta(seqfile, fasta_pieces_dir, num_of_files=number_of_files)

        options_list = []
        out_files = []
        for piece_name in sorted(os.listdir(fasta_pieces_dir)):
            piece_prefix = FileRoutines.split_filename(piece_name)[1]

            # Directory and file name are joined without a separator;
            # presumably check_path returns a path ending with one - confirm.
            query_path = "%s%s" % (fasta_pieces_dir, piece_name)
            hits_path = "%s%s.hits" % (hits_pieces_dir, piece_prefix)

            pieces_of_command = [" -out %s" % hits_path,
                                 " -db %s" % database,
                                 " -query %s" % query_path]
            if blast_options:
                pieces_of_command.append(" %s" % blast_options)
            if evalue:
                pieces_of_command.append(" -evalue %s" % evalue)
            if output_format:
                pieces_of_command.append(" -outfmt %i" % output_format)

            options_list.append("".join(pieces_of_command))
            out_files.append(hits_path)

        self.parallel_execute(options_list,
                              cmd=blast_command,
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
示例#3
0
    def parallel_align(self,
                       list_of_files,
                       output_directory,
                       output_suffix="alignment",
                       gap_open_penalty=None,
                       offset=None,
                       maxiterate=None,
                       quiet=False,
                       mode="globalpair",
                       number_of_processes=1,
                       anysymbol=False):
        """
        Align every file from list_of_files in parallel and write one
        "<basename>[_<output_suffix>].fasta" file per input into output_directory.

        NOTE(review): the flags used here (--op, --ep, --maxiterate, --anysymbol)
        look like MAFFT command line options - confirm against the wrapped tool.

        :param list_of_files: iterable of input file paths.
        :param output_directory: directory for the alignments; created if needed.
        :param output_suffix: suffix appended to the input basename; a falsy
            value leaves the basename unchanged.
        :param gap_open_penalty: float for --op (skipped when None).
        :param offset: float for --ep (skipped when None).
        :param maxiterate: integer for --maxiterate (skipped when None).
        :param quiet: add --quiet if true.
        :param mode: name of the mode flag; emitted as "--<mode>".
        :param number_of_processes: passed to self.parallel_execute as threads.
        :param anysymbol: add --anysymbol if true.
        """
        # TODO: add rest of options

        # Results are written by shell redirection, which fails when the target
        # directory is missing, so make sure it exists first.
        FileRoutines.safe_mkdir(output_directory)

        options = " --thread %i" % self.threads
        if gap_open_penalty is not None:
            options += " --op %f" % gap_open_penalty
        if offset is not None:
            options += " --ep %f" % offset
        if maxiterate is not None:
            options += " --maxiterate %i" % maxiterate
        if quiet:
            options += " --quiet"
        options += " --%s" % mode
        if anysymbol:
            options += " --anysymbol"

        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            output_name = ("%s_%s" % (basename, output_suffix)) if output_suffix else basename
            options_list.append(options
                                + " %s" % filename
                                + " > %s/%s.fasta" % (output_directory, output_name))

        self.parallel_execute(options_list, threads=number_of_processes)
示例#4
0
    def parallel_align(self,
                       list_of_files,
                       output_directory,
                       output_suffix=None,
                       tree_file=None,
                       output_format=None,
                       show_xml=None,
                       show_tree=None,
                       show_ancestral_sequences=None,
                       show_evolutionary_events=None,
                       showall=None,
                       compute_posterior_support=None,
                       njtree=None,
                       skip_insertions=False,
                       codon_alignment=None,
                       translated_alignment=None):
        """
        Build one command line per input file and run them all through
        self.parallel_execute.

        Every keyword except list_of_files, output_directory and output_suffix
        is forwarded unchanged to self.parse_common_options, whose result is
        the common prefix of each command line.

        :param list_of_files: iterable of input file paths (passed as -d=<path>).
        :param output_directory: directory for the results; created with
            FileRoutines.safe_mkdir.
        :param output_suffix: optional suffix; the output is named
            "<basename>_<output_suffix>.fasta", or "<basename>.fasta" if falsy.
        """
        shared_options = self.parse_common_options(
            tree_file=tree_file,
            output_format=output_format,
            show_xml=show_xml,
            show_tree=show_tree,
            show_ancestral_sequences=show_ancestral_sequences,
            show_evolutionary_events=show_evolutionary_events,
            showall=showall,
            compute_posterior_support=compute_posterior_support,
            njtree=njtree,
            skip_insertions=skip_insertions,
            codon_alignment=codon_alignment,
            translated_alignment=translated_alignment)

        FileRoutines.safe_mkdir(output_directory)

        options_list = []
        for input_path in list_of_files:
            output_stem = FileRoutines.split_filename(input_path)[1]
            if output_suffix:
                output_stem = "%s_%s" % (output_stem, output_suffix)

            input_part = " -d=%s" % input_path
            output_part = " -o=%s/%s.fasta" % (output_directory, output_stem)
            options_list.append(shared_options + input_part + output_part)

        self.parallel_execute(options_list)
示例#5
0
print("Drawing histograms...")

# Each statistics file is passed as both the first and the second positional
# argument of percent_histogram_from_file.
# NOTE(review): presumably the second one is used as the output prefix - confirm.
for stat_file in (
        output_evidence_stats,
        output_supported_stats,
        output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_or_hints_supported_transcripts_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_evidence):

    MatplotlibRoutines.percent_histogram_from_file(
        stat_file,
        stat_file,
        title="Transcript support by hints",
        column_list=(2, ),
        data_type=None,
        comments="#",
        n_bins=20,
        stats_as_legend=True,
        legend_location="upper center",
        extensions=("png", "svg"))

print("Creating final directories...")

# Output files are sorted into directories only when both databases were given.
if args.pfam_db and args.swissprot_db:
    db_or_hints_dir = "supported_by_db_or_hints/"
    db_and_hints_dir = "supported_by_db_and_hints/"

    for directory in (db_and_hints_dir, db_or_hints_dir):
        FileRoutines.safe_mkdir(directory)

    # The shell expands the trailing '*', so every file sharing the prefix is moved.
    os.system(
        "mv %s.supported.transcripts.swissprot_or_pfam_or_hints* %s"
        % (args.output, db_or_hints_dir))
    os.system(
        "mv %s.supported.transcripts.swissprot_or_pfam_and_hints* %s"
        % (args.output, db_and_hints_dir))
示例#6
0
    def mask(self, list_of_fasta_files, output_dir="./", soft_masking=True, engine="ncbi",
             search_speed="normal", no_low_complexity=None, only_low_complexity=None,
             no_interspersed=None, only_interspersed=None, no_rna=None, only_alu=None, custom_library=None,
             species=None, html_output=False, ace_output=False, gff_output=False):
        """
        Assemble the masking command line and run it through self.execute.

        NOTE(review): the flags (-pa, -nolow, -xsmall, ...) look like
        RepeatMasker options - confirm against the wrapped tool.

        When both species and custom_library are given, repeats for the species
        are first extracted with self.extract_repeats_from_database into a
        temporary file in output_dir, concatenated with custom_library by a
        "cat" command, and the merged file is used as the -lib value.

        :param list_of_fasta_files: a single path (str), or a value accepted by
            FileRoutines.make_list_of_path_to_files.
        :param output_dir: value of -dir; also the location of temporary files.
        :param soft_masking: add -xsmall if true.
        :param engine: value of -e.
        :param search_speed: "slow", "quick" or "rush" add -s, -q or -qq;
            any other value adds nothing.
        :param no_low_complexity: add -nolow if true.
        :param only_low_complexity: add -low if true.
        :param no_interspersed: add -noint if true.
        :param only_interspersed: add -int if true.
        :param no_rna: add -norna if true.
        :param only_alu: add -alu if true.
        :param custom_library: repeat library passed with -lib.
        :param species: species passed with -species (when no custom library).
        :param html_output: add -html if true.
        :param ace_output: add -ace if true.
        :param gff_output: add -gff if true.
        """
        merged_library = None
        if species and custom_library:
            species_repeats = "%s/%s.repeats.tmp.fa" % (output_dir, species)
            merged_library = "%s/all.repeats.tmp.fasta" % output_dir
            self.extract_repeats_from_database(species_repeats, species=species)
            self.execute(cmd="cat %s %s > %s" % (species_repeats, custom_library, merged_library))

        parts = [" -pa %i" % self.threads, " -e %s" % engine]

        for speed_name, speed_flag in (("slow", " -s"), ("quick", " -q"), ("rush", " -qq")):
            if search_speed == speed_name:
                parts.append(speed_flag)
                break

        for is_enabled, flag in ((no_low_complexity, " -nolow"),
                                 (only_low_complexity, " -low"),
                                 (no_interspersed, " -noint"),
                                 (only_interspersed, " -int"),
                                 (no_rna, " -norna"),
                                 (only_alu, " -alu")):
            if is_enabled:
                parts.append(flag)

        # The merged library wins over a plain custom library, which wins over species.
        if merged_library is not None:
            parts.append(" -lib %s" % merged_library)
        elif custom_library:
            parts.append(" -lib %s" % custom_library)
        elif species:
            parts.append(" -species %s" % species)

        parts.append(" -dir %s" % output_dir)

        for is_enabled, flag in ((html_output, " -html"),
                                 (ace_output, " -ace"),
                                 (gff_output, " -gff"),
                                 (soft_masking, " -xsmall")):
            if is_enabled:
                parts.append(flag)

        if isinstance(list_of_fasta_files, str):
            input_files = list_of_fasta_files
        else:
            input_files = " ".join(FileRoutines.make_list_of_path_to_files(list_of_fasta_files))
        parts.append(" " + input_files)

        self.execute(options="".join(parts))

        """
示例#7
0
    def parallel_predict(self,
                         species,
                         genome_file,
                         output,
                         strand="both",
                         gene_model=None,
                         output_gff3=True,
                         other_options="",
                         split_dir="splited_input",
                         splited_output_dir="splited_output_dir",
                         config_dir=None,
                         combine_output_to_single_file=True,
                         use_softmasking=None,
                         hints_file=None,
                         extrinsicCfgFile=None,
                         predict_UTR=None,
                         external_process_pool=None,
                         async_run=False,
                         min_intron_len=None,
                         parsing_mode="parse"):
        """
        Split the genome into pieces, run the prediction on every piece through
        self.parallel_execute and optionally concatenate the per-piece results.

        The common part of each command line comes from self.parse_options,
        called with an empty genome_file because the input file is appended per
        piece. Every command redirects its stdout to "<piece name>.gff" inside
        splited_output_dir.

        :param species: forwarded to self.parse_options.
        :param genome_file: fasta file split by self.split_fasta_by_seq_len.
        :param output: destination of the concatenated results (CGAS.cat output).
        :param split_dir: directory receiving the pieces of genome_file.
        :param splited_output_dir: directory receiving the per-piece results.
        :param combine_output_to_single_file: concatenate the results if true.
        :param external_process_pool: forwarded to self.parallel_execute.
        :param async_run: forwarded to self.parallel_execute.
        :param parsing_mode: forwarded to self.split_fasta_by_seq_len.

        The remaining keywords (strand, gene_model, output_gff3, other_options,
        config_dir, use_softmasking, hints_file, extrinsicCfgFile, predict_UTR,
        min_intron_len) are forwarded unchanged to self.parse_options.
        """
        shared_options = self.parse_options(species,
                                            genome_file="",
                                            strand=strand,
                                            gene_model=gene_model,
                                            output_gff3=output_gff3,
                                            other_options=other_options,
                                            config_dir=config_dir,
                                            use_softmasking=use_softmasking,
                                            hints_file=hints_file,
                                            extrinsicCfgFile=extrinsicCfgFile,
                                            predict_UTR=predict_UTR,
                                            min_intron_len=min_intron_len)

        pieces_dir = FileRoutines.check_path(split_dir)
        results_dir = FileRoutines.check_path(splited_output_dir)
        FileRoutines.safe_mkdir(pieces_dir)
        FileRoutines.safe_mkdir(results_dir)

        self.split_fasta_by_seq_len(genome_file,
                                    pieces_dir,
                                    parsing_mode=parsing_mode)

        options_list = []
        list_of_output_files = []
        for piece_name in sorted(os.listdir(pieces_dir)):
            # Directory and file name are joined without a separator;
            # presumably check_path returns a path ending with one - confirm.
            piece_path = "%s%s" % (pieces_dir, piece_name)
            result_path = "%s%s.gff" % (results_dir, piece_name)

            list_of_output_files.append(result_path)
            options_list.append(shared_options
                                + " %s" % piece_path
                                + " > %s" % result_path)

        self.parallel_execute(options_list,
                              external_process_pool=external_process_pool,
                              async_run=async_run)

        if combine_output_to_single_file:
            CGAS.cat(list_of_output_files, output=output)
示例#8
0
文件: STAR.py 项目: mahajrod/Pantera
    def align_samples(self,
                      samples_dir,
                      output_dir,
                      genome_dir,
                      genome_fasta=None,
                      samples=None,
                      annotation_gtf=None,
                      sjdboverhang=None,
                      genomeSAindexNbases=None,
                      genomeChrBinNbits=None,
                      genome_size=None,
                      feature_from_gtf_to_use_as_exon=None,
                      exon_tag_to_use_as_transcript_id=None,
                      exon_tag_to_use_as_gene_id=None,
                      length_of_sequences_flanking_junction=None,
                      junction_tab_file_list=None,
                      three_prime_trim=None,
                      five_prime_trim=None,
                      adapter_seq_for_three_prime_clip=None,
                      max_mismatch_percent_for_adapter_trimming=None,
                      three_prime_trim_after_adapter_clip=None,
                      output_type="BAM",
                      sort_bam=True,
                      max_memory_for_bam_sorting=8000000000,
                      include_unmapped_reads_in_bam=True,
                      output_unmapped_reads=True,
                      two_pass_mode=True,
                      max_intron_length=None):
        """
        Optionally build the genome index, then align every sample and index the
        resulting bam file.

        :param samples_dir: directory holding one subdirectory per sample.
        :param output_dir: directory for the alignments; one subdirectory per
            sample is created inside it.
        :param genome_dir: directory of the genome index.
        :param genome_fasta: if given, the index is (re)built from it first via
            self.index, and annotation_gtf is then NOT passed to the alignment
            step (it was already passed to the indexing step).
        :param samples: sample names to process; if falsy, the list is taken
            from self.get_sample_list(samples_dir).
        :param annotation_gtf, sjdboverhang, genomeSAindexNbases,
            genomeChrBinNbits, genome_size: forwarded to self.index.
        :param junction_tab_file_list: forwarded to both self.index and self.align.

        All remaining keywords are forwarded unchanged to self.align.

        NOTE(review): the bam file is indexed under the name
        "Aligned.sortedByCoord.out.bam" regardless of output_type and sort_bam -
        confirm that this is intended for other settings.
        """
        # The index/align steps are called on this instance. The previous code
        # called them through the name STAR; presumably that name is the class
        # itself inside this module, which would bind genome_dir as "self".
        if genome_fasta:
            self.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=annotation_gtf,
                       junction_tab_file=junction_tab_file_list,
                       sjdboverhang=sjdboverhang,
                       genomeSAindexNbases=genomeSAindexNbases,
                       genomeChrBinNbits=genomeChrBinNbits,
                       genome_size=genome_size)

        sample_list = samples if samples else self.get_sample_list(samples_dir)

        FileRoutines.safe_mkdir(output_dir)

        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_dir, sample)
            alignment_sample_dir = "%s/%s/" % (output_dir, sample)
            FileRoutines.safe_mkdir(alignment_sample_dir)

            # Only the forward and reverse read lists are used below.
            _filetypes, forward_files, reverse_files, _se_files = \
                FileRoutines.make_lists_forward_and_reverse_files(sample_dir)

            # Call form of print: valid on both Python 2 and Python 3.
            print("\tAligning reads...")

            self.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                # the annotation was already used for indexing if the index was built above
                annotation_gtf=annotation_gtf if not genome_fasta else None,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            print("\tIndexing bam file...")
            resulting_bam_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir
            SamtoolsV1.index(resulting_bam_file)