Пример #1
0
    def __init__(self, chromosome_name, bam_file_path, draft_file_path, truth_bam, hp_tag, train_mode):
        """
        Set up the file handlers and run options for one chromosome.
        :param chromosome_name: Name of the chromosome this object works on
        :param bam_file_path: Path handed to PEPPER_HP.BAM_handler
        :param draft_file_path: Path handed to PEPPER_HP.FASTA_handler
        :param truth_bam: Truth-to-reference mapping file; only retained when train_mode is truthy
        :param hp_tag: Stored unchanged as self.hp_tag
        :param train_mode: Stored as self.train_mode; decides whether truth_bam is kept
        """
        # --- names ---
        self.chromosome_name = chromosome_name

        # --- input paths and the handlers built from them ---
        self.bam_path = bam_file_path
        self.fasta_path = draft_file_path
        self.bam_handler = PEPPER_HP.BAM_handler(bam_file_path)
        self.fasta_handler = PEPPER_HP.FASTA_handler(draft_file_path)

        # --- run options ---
        self.hp_tag = hp_tag
        self.train_mode = train_mode
        # downsampling always starts at 1.0 here; there is no constructor argument for it
        self.downsample_rate = 1.0
        # the truth alignment is ignored outside of training
        self.truth_bam = truth_bam if train_mode else None
Пример #2
0
    def __init__(self, reference_file_path, contigs, sample_name, output_dir,
                 filename):
        """
        Open a VCF file for writing.
        :param reference_file_path: Path handed to PEPPER_HP.FASTA_handler
        :param contigs: Contigs stored on the instance and passed to get_vcf_header
        :param sample_name: Sample name passed to get_vcf_header
        :param output_dir: Prefix of the output path; it is joined to filename by plain
                           concatenation, so it must already end with a path separator
        :param filename: Output file name without the '.vcf' extension
        """
        self.fasta_handler = PEPPER_HP.FASTA_handler(reference_file_path)
        self.contigs = contigs

        # the header has to exist before the output file can be opened
        header = self.get_vcf_header(sample_name, contigs)
        vcf_path = output_dir + filename + '.vcf'
        self.vcf_file = VariantFile(vcf_path, 'w', header=header)
Пример #3
0
    def __init__(self, vcf1, vcf2, ref_fasta, only_overlapping=True, discard_phase=False, detailed_info=False):
        """Initialize variant merging.

        Merge variants from two haploid VCFs into a diploid vcf. Variants in
        one file which overlap with variants in the other will have their alts
        padded.

        .. warning::

            Variants in a single vcf file should not overlap with each other.

        :param vcf1, vcf2: paths to haploid vcf files.
        :param ref_fasta: path to reference.fasta file.
        :param only_overlapping: bool, merge only overlapping variants (not
            adjacent ones).
        :param discard_phase: bool, if False, preserve phase, else output
            unphased variants.
        :param detailed_info: stored unchanged as ``self.detailed_info``; its
            effect is not visible in this constructor.

        """
        self.only_overlapping = only_overlapping
        self.discard_phase = discard_phase
        self.detailed_info = detailed_info

        # build both readers first, then index each one (creates the tree)
        readers = [VCFReader(vcf1), VCFReader(vcf2)]
        self.vcfs = readers
        for reader in readers:
            reader.index()
        self.fasta = pysam.FastaFile(ref_fasta)

        # union of the contig names seen in either VCF
        seen_contigs = set()
        for reader in readers:
            seen_contigs.update(reader.chroms)
        all_contigs = sorted(seen_contigs, key=natural_key)

        # self.chroms follows the FASTA contig order and keeps only contigs
        # that occur in at least one of the two VCFs, paired with their length
        reference = PEPPER_HP.FASTA_handler(ref_fasta)
        self.chroms = [
            (contig, reference.get_chromosome_sequence_length(contig))
            for contig in reference.get_chromosome_names()
            if contig in all_contigs
        ]
Пример #4
0
    def get_chromosome_list(chromosome_names, ref_file, bam_file, region_bed):
        """
        Work out which contigs/regions should be processed.

        Three input modes are supported, checked in this order:
          1. neither chromosome_names nor region_bed given: every contig shared by the
             FASTA and the BAM (minus EXCLUDED_HUMAN_CONTIGS) is used, whole-contig;
          2. region_bed given: one entry per BED record (takes priority over
             chromosome_names);
          3. chromosome_names given: a comma separated list where every item is either
             "name", "name:start-end" or a numeric range such as "chr1-5".

        :param chromosome_names: Comma separated contig/region string, or None/empty
        :param ref_file: Path to the reference FASTA file (only opened in mode 1)
        :param bam_file: Path to the BAM file (only opened in mode 1)
        :param region_bed: Path to a tab separated BED file, or None/empty
        :return: List of (contig_name, region) tuples where region is None for a whole
                 contig or a two element [start, end] list
        :raises SystemExit: with status 1 when no common contig exists or when a region
                            given through chromosome_names is malformed
        """
        if not chromosome_names and not region_bed:
            fasta_handler = PEPPER_HP.FASTA_handler(ref_file)
            bam_handler = PEPPER_HP.BAM_handler(bam_file)
            bam_contigs = bam_handler.get_chromosome_sequence_names()
            fasta_contigs = fasta_handler.get_chromosome_names()
            common_contigs = list(set(fasta_contigs) & set(bam_contigs))
            common_contigs = list(set(common_contigs) - set(EXCLUDED_HUMAN_CONTIGS))

            if not common_contigs:
                sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] "
                                 + "ERROR: NO COMMON CONTIGS FOUND BETWEEN THE BAM FILE AND THE FASTA FILE.\n")
                sys.stderr.flush()
                sys.exit(1)

            common_contigs = sorted(common_contigs, key=UserInterfaceSupport.natural_key)
            sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] INFO: COMMON CONTIGS FOUND: " + str(common_contigs) + "\n")
            sys.stderr.flush()

            # None as the region means "the whole contig"
            return [(contig_name, None) for contig_name in common_contigs]

        if region_bed:
            chromosome_name_list = []
            with open(region_bed) as fp:
                for raw_line in fp:
                    line = raw_line.rstrip()
                    # blank lines and '#' comments carry no record; skipping them
                    # avoids an IndexError on otherwise valid BED files
                    if not line or line.startswith('#'):
                        continue
                    fields = line.split('\t')
                    chr_name, start_pos, end_pos = fields[0], int(fields[1]), int(fields[2])
                    # tolerate records whose start and end are swapped
                    chromosome_name_list.append((chr_name, sorted([start_pos, end_pos])))
            return chromosome_name_list

        chromosome_name_list = []
        for name in chromosome_names.strip().split(','):
            name = name.strip()

            # split on region
            region = None
            if ':' in name:
                name_region = name.split(':')

                if len(name_region) != 2:
                    sys.stderr.write("ERROR: --region INVALID value.\n")
                    # a rejected argument is a failure, so exit non-zero
                    sys.exit(1)

                name, region_text = name_region
                try:
                    region = [int(pos) for pos in region_text.strip().split('-')]
                except ValueError:
                    # non-numeric coordinates are reported like any other bad region
                    region = None

                if region is None or len(region) != 2 or region[0] > region[1]:
                    sys.stderr.write("ERROR: --region INVALID value.\n")
                    sys.exit(1)

            range_split = name.split('-')
            if len(range_split) > 1:
                # "chr1-5" style range: the prefix is everything before the first digit
                first_digit = next((idx for idx, char in enumerate(name) if char.isdigit()), len(name))
                chr_prefix = name[:first_digit]

                # keep only the digits of each piece, e.g. "chr1" -> 1
                int_ranges = sorted(int(''.join(char for char in item if char.isdigit()))
                                    for item in range_split)

                for chr_seq in range(int_ranges[0], int_ranges[-1] + 1):
                    chromosome_name_list.append((chr_prefix + str(chr_seq), region))
            else:
                chromosome_name_list.append((name, region))

        return chromosome_name_list
Пример #5
0
    def chromosome_level_parallelization(chr_list,
                                         bam_file,
                                         draft_file,
                                         truth_bam,
                                         hp_tag,
                                         output_path,
                                         total_threads,
                                         train_mode,
                                         realignment_flag):
        """
        Cut the requested regions into overlapping windows and generate images in parallel.

        :param chr_list: Iterable of (contig_name, region) where region is falsy for a whole
                         contig or a (start, end) pair
        :param bam_file: Forwarded to the worker unchanged
        :param draft_file: FASTA path; opened here for contig lengths and forwarded to the worker
        :param truth_bam: Forwarded to the worker unchanged
        :param hp_tag: Forwarded to the worker unchanged
        :param output_path: Forwarded to the worker unchanged
        :param total_threads: Number of worker processes and of submitted tasks
        :param train_mode: Forwarded to the worker unchanged
        :param realignment_flag: Forwarded to the worker unchanged
        :return: None; progress and errors are written to stderr
        """
        def timestamp():
            # same stamp format for every log line below
            return datetime.now().strftime('%m-%d-%Y %H:%M:%S')

        # window length; the same value is used with and without train_mode
        chunk_size = 10000

        started_at = time.time()
        reference = PEPPER_HP.FASTA_handler(draft_file)

        # first work out every window that has to be processed
        all_intervals = []
        for contig, region in chr_list:
            if region:
                # clamp the requested region to the contig
                first, last = tuple(region)
                first = max(0, first)
                last = min(last, reference.get_chromosome_sequence_length(contig) - 1)
            else:
                first = 0
                last = reference.get_chromosome_sequence_length(contig) - 1

            # every window is widened by MIN_IMAGE_OVERLAP on both sides and then
            # clipped back to [first, last]
            all_intervals.extend(
                (contig,
                 max(first, chunk_begin - ImageSizeOptions.MIN_IMAGE_OVERLAP),
                 min(last, chunk_begin + chunk_size + ImageSizeOptions.MIN_IMAGE_OVERLAP))
                for chunk_begin in range(first, last, chunk_size)
            )

        # all windows are known now; report the workload
        sys.stderr.write("[" + timestamp() + "] "
                         + "INFO: TOTAL CONTIGS: " + str(len(chr_list))
                         + " TOTAL INTERVALS: " + str(len(all_intervals)) + "\n")
        sys.stderr.flush()

        worker_args = (output_path, bam_file, draft_file, truth_bam, hp_tag, train_mode, realignment_flag)
        with concurrent.futures.ProcessPoolExecutor(max_workers=total_threads) as executor:
            # one task per worker index; each task gets the complete window list together
            # with total_threads and its own index
            futures = []
            for worker_index in range(total_threads):
                futures.append(executor.submit(UserInterfaceSupport.image_generator,
                                               worker_args, all_intervals, total_threads, worker_index))

            for fut in concurrent.futures.as_completed(futures):
                failure = fut.exception()
                if failure is not None:
                    # a failed worker is reported, the remaining ones are still awaited
                    sys.stderr.write("ERROR: " + str(failure) + "\n")
                else:
                    finished_id = fut.result()
                    sys.stderr.write("[" + str(timestamp()) + "] INFO: THREAD "
                                     + str(finished_id) + " FINISHED SUCCESSFULLY.\n")
                fut._result = None  # python issue 27144

        elapsed = time.time() - started_at
        mins = int(elapsed / 60)
        secs = int(elapsed) % 60
        sys.stderr.write("[" + str(timestamp()) + "] INFO: FINISHED IMAGE GENERATION\n")
        sys.stderr.write("[" + str(timestamp()) + "] INFO: ELAPSED TIME: " + str(mins) + " Min " + str(secs) + " Sec\n")