Пример #1
0
    def mkdup(self, input_bam, output_prefix):
        """Run the wrapped tool on ``input_bam`` and index the BAM it writes.

        Both output names are derived from ``output_prefix``:
        ``<output_prefix>.bam`` for the alignment and ``<output_prefix>.stat``
        for the statistics file. The option string comes from
        ``self.parse_options`` and is handed to ``self.execute``; afterwards
        the new BAM is indexed with ``SamtoolsV1.index``.

        :param input_bam: path of the BAM file to process
        :param output_prefix: prefix shared by the output BAM and stat file
        """
        marked_bam, metrics_path = ("%s.bam" % output_prefix,
                                    "%s.stat" % output_prefix)

        self.execute(
            options=self.parse_options(input_bam, marked_bam, metrics_path))

        # The index is built only after the tool has finished writing the BAM.
        SamtoolsV1.index(marked_bam)
Пример #2
0
    def clipoverlap(self, input, output, poolsize=None):
        """Run the ``bam clipOverlap`` command and index the resulting file.

        The option string is built by ``self.parse_options`` and executed via
        ``self.execute`` with ``cmd="bam clipOverlap"``; the ``output`` file is
        then indexed with ``SamtoolsV1.index``.

        :param input: path of the input alignment file
        :param output: path of the output alignment file
        :param poolsize: forwarded unchanged to ``self.parse_options``
        """
        # Function-scope import kept from the original
        # (presumably avoids a circular import at module load - TODO confirm).
        from RouToolPa.Tools.Samtools import SamtoolsV1

        clip_options = self.parse_options(input, output, poolsize=poolsize)
        self.execute(cmd="bam clipOverlap", options=clip_options)

        SamtoolsV1.index(output)
Пример #3
0
              sort_by_name=False,
              max_per_sorting_thread_memory="10G")

if args.add_read_groups_by_picard:
    sorted_alignment_picard_groups = "%s.picard_groups.%s" % (
        args.prefix, args.alignment_format)
    AddOrReplaceReadGroups.add_read_groups(sorted_alignment,
                                           sorted_alignment_picard_groups,
                                           RGID=args.prefix,
                                           RGLB=args.prefix,
                                           RGPL=args.prefix,
                                           RGSM=args.prefix,
                                           RGPU=args.prefix)

if args.alignment_format == "bam":
    SamtoolsV1.index(sorted_alignment_picard_groups
                     if sorted_alignment_picard_groups else sorted_alignment)

MarkDuplicates.run(
    sorted_alignment_picard_groups if sorted_alignment_picard_groups else
    sorted_alignment, final_alignment, duplicates_stat_file)

if args.alignment_format == "bam":
    SamtoolsV1.index(final_alignment)
"""
GenomeCov.get_coverage(final_alignment, genome_bed, coverage_file)
if not args.retain_temp:
    os.remove(sorted_alignment)
    if args.add_read_groups_by_picard:
        os.remove(sorted_alignment_picard_groups)

if args.calculate_median_coverage or args.calculate_mean_coverage:
Пример #4
0
    def align(self,
              genome_dir,
              forward_read_list,
              reverse_read_list=None,
              annotation_gtf=None,
              sample=None,
              feature_from_gtf_to_use_as_exon=None,
              exon_tag_to_use_as_transcript_id=None,
              exon_tag_to_use_as_gene_id=None,
              length_of_sequences_flanking_junction=None,
              junction_tab_file_list=None,
              three_prime_trim=None,
              five_prime_trim=None,
              adapter_seq_for_three_prime_clip=None,
              max_mismatch_percent_for_adapter_trimming=None,
              three_prime_trim_after_adapter_clip=None,
              output_type="BAM",
              sort_bam=True,
              max_memory_per_thread_for_bam_sorting="4G",
              include_unmapped_reads_in_bam=True,
              output_unmapped_reads=True,
              output_dir="./",
              two_pass_mode=False,
              max_intron_length=None):
        """Run a STAR alignment and optionally sort and index the output BAM.

        The command line is assembled from the arguments and passed to
        ``self.execute``. STAR is always asked for ``Unsorted`` output; when
        ``sort_bam`` is true the file is sorted and indexed afterwards with
        ``SamtoolsV1``.

        :param genome_dir: STAR genome directory (``--genomeDir``)
        :param forward_read_list: one path (str) or an iterable of paths
        :param reverse_read_list: optional mates, same form as the forward list
        :param annotation_gtf: ``--sjdbGTFfile``
        :param sample: if set, the sorted BAM is named ``<sample>.sorted.bam``,
            otherwise ``Aligned.sortedByCoord.out.bam``
        :param feature_from_gtf_to_use_as_exon: ``--sjdbGTFfeatureExon``
        :param exon_tag_to_use_as_transcript_id:
            ``--sjdbGTFtagExonParentTranscript``
        :param exon_tag_to_use_as_gene_id: ``--sjdbGTFtagExonParentGene``
        :param length_of_sequences_flanking_junction: ``--sjdbOverhang``
        :param junction_tab_file_list: one path or an iterable of paths for
            ``--sjdbFileChrStartEnd``
        :param three_prime_trim: ``--clip3pNbases``
        :param five_prime_trim: ``--clip5pNbases``
        :param adapter_seq_for_three_prime_clip: ``--clip3pAdapterSeq``
        :param max_mismatch_percent_for_adapter_trimming:
            ``--clip3pAdapterMMp``; an explicit ``0`` is passed through
        :param three_prime_trim_after_adapter_clip:
            ``--clip3pAfterAdapterNbases``
        :param output_type: first word of ``--outSAMtype``
        :param sort_bam: sort and index ``Aligned.out.bam`` after alignment
        :param max_memory_per_thread_for_bam_sorting: forwarded to
            ``SamtoolsV1.sort`` as ``max_memory_per_thread``
        :param include_unmapped_reads_in_bam: add ``--outSAMunmapped Within``
        :param output_unmapped_reads: add ``--outReadsUnmapped Fastx``
        :param output_dir: passed verbatim as ``--outFileNamePrefix``; it must
            end with a path separator to denote a directory
        :param two_pass_mode: add ``--twopassMode Basic``
        :param max_intron_length: ``--alignIntronMax``
        :raises ValueError: if the numbers of forward and reverse read files
            differ
        """

        def to_abs_path_list(files):
            # A bare string is a single file; anything else is an iterable.
            if isinstance(files, str):
                return [os.path.abspath(files)]
            return list(map(os.path.abspath, files))

        def flag(template, value):
            # Emit the formatted option only for truthy values.
            return template % value if value else ""

        forward_read_abs_path_list = to_abs_path_list(forward_read_list)
        reverse_read_abs_path_list = to_abs_path_list(
            reverse_read_list) if reverse_read_list else None

        # Compare file counts only after normalisation: calling len() on two
        # plain strings would compare their character counts instead.
        if reverse_read_abs_path_list is not None and len(
                forward_read_abs_path_list) != len(reverse_read_abs_path_list):
            raise ValueError("Wrong read file pairing")

        junction_files = " ".join(to_abs_path_list(
            junction_tab_file_list)) if junction_tab_file_list else ""

        options = " --runThreadN %i" % self.threads
        options += " --genomeDir %s" % os.path.abspath(genome_dir)
        options += flag(" --sjdbGTFfile %s", annotation_gtf)
        options += flag(" --sjdbGTFtagExonParentTranscript %s",
                        exon_tag_to_use_as_transcript_id)
        options += flag(" --sjdbGTFtagExonParentGene %s",
                        exon_tag_to_use_as_gene_id)
        options += flag(" --sjdbGTFfeatureExon %s",
                        feature_from_gtf_to_use_as_exon)
        options += flag(" --sjdbOverhang %i",
                        length_of_sequences_flanking_junction)
        options += flag(" --sjdbFileChrStartEnd %s", junction_files)

        forward_read_abs_path_list = self.add_external_extraction_to_filelist(
            forward_read_abs_path_list)
        reverse_read_abs_path_list = self.add_external_extraction_to_filelist(
            reverse_read_abs_path_list) if reverse_read_list else None

        options += " --readFilesIn %s" % " ".join(forward_read_abs_path_list)
        if reverse_read_abs_path_list:
            options += " %s" % " ".join(reverse_read_abs_path_list)

        options += flag(" --clip3pNbases %i", three_prime_trim)
        options += flag(" --clip5pNbases %i", five_prime_trim)
        options += flag(" --clip3pAdapterSeq %s",
                        adapter_seq_for_three_prime_clip)
        # Tested against None rather than truthiness so that an explicit 0
        # (no mismatches allowed) is not silently dropped.
        if max_mismatch_percent_for_adapter_trimming is not None:
            options += (" --clip3pAdapterMMp %f" %
                        max_mismatch_percent_for_adapter_trimming)
        options += flag(" --clip3pAfterAdapterNbases %i",
                        three_prime_trim_after_adapter_clip)

        # STAR always writes unsorted output here; coordinate sorting is done
        # below with samtools when sort_bam is set.
        options += " --outSAMtype %s %s" % (output_type, "Unsorted")
        options += " --outSAMunmapped Within" if include_unmapped_reads_in_bam else ""
        options += " --outReadsUnmapped Fastx" if output_unmapped_reads else ""
        options += flag(" --outFileNamePrefix %s", output_dir)
        options += " --twopassMode Basic" if two_pass_mode else ""
        options += flag(" --alignIntronMax %i", max_intron_length)

        self.execute(options)

        if sort_bam:
            print("\tSorting...")
            # STAR prepends --outFileNamePrefix verbatim to its file names, so
            # the same prefix (empty when none was given) locates the output.
            output_prefix = output_dir if output_dir else ""
            unsorted_bam = "%sAligned.out.bam" % output_prefix
            sorted_bam = "%s%s.bam" % (output_prefix,
                                       ("%s.sorted" % sample if sample else
                                        "Aligned.sortedByCoord.out"))
            SamtoolsV1.threads = self.threads
            SamtoolsV1.sort(
                unsorted_bam,
                sorted_bam,
                max_memory_per_thread=max_memory_per_thread_for_bam_sorting)

            print("\tIndexing bam file...")
            SamtoolsV1.index(sorted_bam)
Пример #5
0
    def align(self,
              sample_dir,
              reference_index,
              aligner="bwa",
              sample_list=None,
              outdir="./",
              quality_score_type="phred33",
              read_suffix="",
              read_extension="fastq",
              alignment_format="bam",
              threads=None,
              mark_duplicates=True,
              platform="Illumina",
              add_read_groups_by_picard=False,
              gzipped_reads=False):
        """Align paired reads of every sample and post-process the alignments.

        For each sample the reads are expected at
        ``<sample_dir>/<sample>/<sample><read_suffix>_{1,2}.<read_extension>``
        (with ``.gz`` appended when ``gzipped_reads`` is true). Results use the
        prefix ``<outdir>/<sample>/<sample>``. Optional steps: read groups are
        rewritten with ``AddOrReplaceReadGroups``, duplicates are marked with
        ``MarkDuplicates``, and BAM files are indexed with ``SamtoolsV1``.

        :param sample_dir: directory holding one subdirectory per sample
        :param reference_index: reference index handed to the aligner
        :param aligner: ``"bwa"`` or ``"bowtie2"``
        :param sample_list: forwarded to ``self.get_sample_list``
        :param outdir: root directory for the per-sample output
        :param quality_score_type: passed to the aligner as ``quality_score``
        :param read_suffix: text between the sample name and ``_1``/``_2``
        :param read_extension: read file extension without the leading dot
        :param alignment_format: extension/format of the alignment files
        :param threads: forwarded to ``self.init_tools``
        :param mark_duplicates: run ``MarkDuplicates`` on each alignment
        :param platform: platform passed to the aligner and used as ``RGPL``
        :param add_read_groups_by_picard: rewrite read groups before the
            later steps
        :param gzipped_reads: read files carry a ``.gz`` suffix
        :raises ValueError: if ``aligner`` is not a supported name
        """
        # Validate first so that an unsupported name fails before any tool
        # initialisation or directory creation, and with a useful message.
        if aligner not in ("bowtie2", "bwa"):
            raise ValueError(
                "Unknown aligner %r: expected 'bowtie2' or 'bwa'" %
                (aligner, ))

        self.init_tools(threads=threads)

        samples = self.get_sample_list(sample_dir, sample_list=sample_list)

        self.prepare_dirs(samples, outdir=outdir)

        aligner_tool = Bowtie2 if aligner == "bowtie2" else BWA
        compression_suffix = ".gz" if gzipped_reads else ""

        for sample in samples:
            read_prefix = "%s/%s/%s%s" % (sample_dir, sample, sample,
                                          read_suffix)
            forward_reads = "%s_1.%s%s" % (read_prefix, read_extension,
                                           compression_suffix)
            reverse_reads = "%s_2.%s%s" % (read_prefix, read_extension,
                                           compression_suffix)

            output_prefix = "%s/%s/%s" % (outdir, sample, sample)

            # presumably the file the aligner writes for this prefix/format
            raw_alignment = "%s.%s" % (output_prefix, alignment_format)

            aligner_tool.align(
                reference_index,
                forward_reads_list=forward_reads,
                reverse_reads_list=reverse_reads,
                unpaired_reads_list=None,
                quality_score=quality_score_type,
                output_prefix=output_prefix,
                output_format=alignment_format,
                read_group_name=sample,
                PU="x",
                SM=sample,
                platform=platform,
                LB="x",
                sort_by_coordinate=True,
                sort_by_name=False,
                # Whole gigabytes per sorting thread, never less than 1G.
                max_per_sorting_thread_memory=str(
                    max(int(self.max_memory / self.threads), 1)) + "G")

            # Alignment consumed by the following steps: the raw one, or the
            # copy with rewritten read groups when that step is enabled.
            current_alignment = raw_alignment

            if add_read_groups_by_picard:
                current_alignment = "%s.picard_groups.%s" % (
                    output_prefix, alignment_format)
                AddOrReplaceReadGroups.add_read_groups(
                    raw_alignment,
                    current_alignment,
                    RGID=sample,
                    RGLB=sample,
                    RGPL=platform,
                    RGSM=sample,
                    RGPU=sample)

            if alignment_format == "bam":
                SamtoolsV1.index(current_alignment)

            if mark_duplicates:
                final_alignment = "%s.mkdup.%s" % (output_prefix,
                                                   alignment_format)
                duplicates_stat_file = "%s.duplicates.stat" % output_prefix

                MarkDuplicates.run(current_alignment, final_alignment,
                                   duplicates_stat_file)

                if alignment_format == "bam":
                    SamtoolsV1.index(final_alignment)
Пример #6
0
# Use the samples given on the command line; otherwise ask
# Pipeline.get_sample_list for the samples found in args.samples_dir.
sample_list = args.samples if args.samples else Pipeline.get_sample_list(
    args.samples_dir)

FileRoutines.safe_mkdir(args.output_dir)

# Each sample is aligned separately into its own subdirectory of
# args.output_dir, and the resulting BAM is indexed.
for sample in sample_list:
    print("Handling %s" % sample)
    sample_dir = "%s/%s/" % (args.samples_dir, sample)
    alignment_sample_dir = "%s/%s/" % (args.output_dir, sample)
    FileRoutines.safe_mkdir(alignment_sample_dir)
    # Only the single-end file list (se_files) is used below; the other three
    # returned values are ignored in this fragment.
    filetypes, forward_files, reverse_files, se_files = FileRoutines.make_lists_forward_and_reverse_files(
        sample_dir)

    print("\tAligning reads...")

    STAR.align_miRNA(
        args.genome_dir,
        se_files,
        output_dir=alignment_sample_dir,
        # The GTF is passed only when no genome fasta was supplied.
        annotation_gtf=args.annotation_gtf if not args.genome_fasta else None,
        max_memory_for_bam_sorting=args.max_memory_for_bam_sorting,
        max_alignments_per_read=args.max_number_of_alignments_per_read,
        # NOTE(review): an option named enable_soft_clipping is passed as
        # no_soft_clip - confirm the polarity is intended.
        no_soft_clip=args.enable_soft_clipping,
        max_number_of_mismatches=args.max_number_of_mismatches,
        max_relative_number_of_mismatches=args.
        max_relative_number_of_mismatches)

    print("\tIndexing bam file...")
    # Presumably the coordinate-sorted BAM written by STAR.align_miRNA (verify
    # against that method). alignment_sample_dir already ends with "/", so the
    # path contains a double slash.
    resulting_bam_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir
    SamtoolsV1.index(resulting_bam_file)