Exemplo n.º 1
0
    def get_ru_count_with_coverage_method(self, pattern_occurrences,
                                          total_counted_vntr_bp,
                                          average_coverage):
        """Estimate the repeat-unit count from pattern occurrences and coverage.

        The occurrences are divided by the average coverage multiplied by the
        number of haplotypes (1 when ``self.is_haploid`` is true, 2 otherwise)
        and the quotient is truncated with ``int``.

        Args:
            pattern_occurrences: Number of pattern occurrences that were
                counted.
            total_counted_vntr_bp: Not used by the computation; kept in the
                signature so existing callers keep working.
            average_coverage: Average coverage; converted to float before
                dividing.

        Returns:
            A two-element list containing the same integer estimate twice.

        Raises:
            ZeroDivisionError: If ``average_coverage`` is zero.
        """
        haplotypes = 1 if self.is_haploid else 2
        copies_per_haplotype = pattern_occurrences / (
            float(average_coverage) * haplotypes)
        # The same estimate is reported twice (one entry per list slot).
        return [int(copies_per_haplotype)] * 2
Exemplo n.º 2
0
    def get_spanning_reads_of_aligned_pacbio_reads(self, alignment_file):
        """Collect the aligned PacBio reads handled by the spanning check.

        Every read fetched from the reference VNTR region is handed to
        ``check_if_pacbio_read_spans_vntr`` in its own process, together with
        two manager-backed lists that the workers are given to fill.

        Args:
            alignment_file: Path to the alignment file. It is opened in text
                mode (``'r'``) when the name ends with ``sam`` and in binary
                mode (``'rb'``) otherwise.

        Returns:
            A plain list copy of the shared ``mapped_spanning_reads`` list.
        """
        # A semaphore slot is acquired before each worker starts.
        # NOTE(review): the worker receives the semaphore and presumably
        # releases it when done - confirm in check_if_pacbio_read_spans_vntr.
        sema = Semaphore(settings.CORES)
        manager = Manager()
        length_distribution = manager.list()
        mapped_spanning_reads = manager.list()

        vntr_start = self.reference_vntr.start_point
        vntr_end = vntr_start + self.reference_vntr.get_length()

        read_mode = 'r' if alignment_file.endswith('sam') else 'rb'
        samfile = pysam.AlignmentFile(alignment_file, read_mode)
        try:
            reference = get_reference_genome_of_alignment_file(samfile)
            if reference == 'HG19':
                chromosome = self.reference_vntr.chromosome
            else:
                # Drop the first three characters of the chromosome name
                # (presumably a 'chr' prefix - verify against the reference).
                chromosome = self.reference_vntr.chromosome[3:]

            workers = []
            for read in samfile.fetch(chromosome, vntr_start, vntr_end):
                sema.acquire()
                worker = Process(target=self.check_if_pacbio_read_spans_vntr,
                                 args=(sema, read, length_distribution,
                                       mapped_spanning_reads))
                workers.append(worker)
                worker.start()

            for worker in workers:
                worker.join()
        finally:
            # Release the file handle even if fetching or spawning fails.
            samfile.close()

        logging.info('length_distribution of mapped spanning reads: %s' %
                     list(length_distribution))
        return list(mapped_spanning_reads)
Exemplo n.º 3
0
    def select_illumina_reads(self,
                              alignment_file,
                              unmapped_filtered_reads,
                              update=False,
                              hmm=None):
        """Select Illumina reads for the reference VNTR.

        Unmapped reads are handed to ``process_unmapped_read`` in separate
        processes. Reads fetched from the VNTR region of the alignment file
        are scored in this process with ``hmm.viterbi`` on the read and on its
        reverse complement; the higher-scoring orientation is kept.

        Args:
            alignment_file: Path to the alignment file. It is opened in text
                mode (``'r'``) when the name ends with ``sam`` and in binary
                mode (``'rb'``) otherwise.
            unmapped_filtered_reads: Iterable of records exposing ``.seq``.
            update: When true, the selection is passed through
                ``iteratively_update_model`` before being returned.
            hmm: Optional matcher HMM. When falsy, one is built with
                ``get_vntr_matcher_hmm`` for the observed read length.

        Returns:
            The manager-backed list of selected reads, or the result of
            ``iteratively_update_model`` when ``update`` is true.
        """
        recruitment_score = None
        sema = Semaphore(settings.CORES)
        manager = Manager()
        selected_reads = manager.list()
        vntr_bp_in_unmapped_reads = Value('d', 0.0)

        number_of_reads = 0
        # Fallback length; replaced by the length of the first read seen.
        read_length = 150

        process_list = []

        for read_segment in unmapped_filtered_reads:
            if number_of_reads == 0:
                read_length = len(str(read_segment.seq))
            number_of_reads += 1
            if not hmm:
                hmm = self.get_vntr_matcher_hmm(read_length=read_length)
            if not recruitment_score:
                recruitment_score = self.get_min_score_to_select_a_read(
                    read_length)

            # Reads shorter than the first read are skipped.
            if len(read_segment.seq) < read_length:
                continue

            sema.acquire()
            p = Process(target=self.process_unmapped_read,
                        args=(sema, str(read_segment.seq), hmm,
                              recruitment_score, vntr_bp_in_unmapped_reads,
                              selected_reads))
            process_list.append(p)
            p.start()
        for p in process_list:
            p.join()

        logging.debug('vntr base pairs in unmapped reads: %s' %
                      vntr_bp_in_unmapped_reads.value)

        vntr_bp_in_mapped_reads = 0
        vntr_start = self.reference_vntr.start_point
        vntr_end = vntr_start + self.reference_vntr.get_length()
        read_mode = 'r' if alignment_file.endswith('sam') else 'rb'
        samfile = pysam.AlignmentFile(alignment_file, read_mode)
        try:
            reference = get_reference_genome_of_alignment_file(samfile)
            if reference == 'HG19':
                chromosome = self.reference_vntr.chromosome
            else:
                # Drop the first three characters of the chromosome name
                # (presumably a 'chr' prefix - verify against the reference).
                chromosome = self.reference_vntr.chromosome[3:]

            for read in samfile.fetch(chromosome, vntr_start, vntr_end):
                # Lazily initialise from the first mapped read when the
                # unmapped pass did not already do so.
                if not recruitment_score:
                    read_length = len(read.seq)
                    recruitment_score = self.get_min_score_to_select_a_read(
                        read_length)
                if not hmm:
                    hmm = self.get_vntr_matcher_hmm(read_length=read_length)

                if read.is_unmapped:
                    continue
                if len(read.seq) < int(read_length * 0.9):
                    logging.debug('Rejecting read for short length: %s' %
                                  read.seq)
                    continue
                if read.reference_end:
                    read_end = read.reference_end
                else:
                    read_end = read.reference_start + len(read.seq)

                if not (vntr_start - read_length < read.reference_start <
                        vntr_end or vntr_start < read_end < vntr_end):
                    continue

                # Only reads without any 'N' base are scored and selected.
                if read.seq.count('N') <= 0:
                    sequence = str(read.seq).upper()
                    # Build the reverse complement once and reuse it.
                    reverse_sequence = str(
                        Seq(read.seq).reverse_complement()).upper()
                    logp, vpath = hmm.viterbi(sequence)
                    rev_logp, rev_vpath = hmm.viterbi(reverse_sequence)
                    if logp < rev_logp:
                        sequence = reverse_sequence
                        logp = rev_logp
                        vpath = rev_vpath
                    length = len(sequence)
                    # NOTE(review): a read is rejected only when it is both
                    # low quality and not recruited - confirm this is intended.
                    if is_low_quality_read(read) and not self.recruit_read(
                            logp, vpath, recruitment_score, length):
                        logging.debug('Rejected Read: %s' % sequence)
                        continue
                    selected_reads.append(
                        SelectedRead(sequence, logp, vpath, read.mapq,
                                     read.reference_start))
                # Count the bases of this read that fall inside the VNTR.
                end = min(read_end, vntr_end)
                start = max(read.reference_start, vntr_start)
                vntr_bp_in_mapped_reads += end - start
        finally:
            # Release the file handle even if scoring a read fails.
            samfile.close()
        logging.debug('vntr base pairs in mapped reads: %s' %
                      vntr_bp_in_mapped_reads)

        if update:
            selected_reads = self.iteratively_update_model(
                alignment_file, unmapped_filtered_reads, selected_reads, hmm)

        return selected_reads
Exemplo n.º 4
0
    def select_illumina_reads(self,
                              alignment_file,
                              unmapped_filtered_reads,
                              update=False,
                              hmm=None):
        """Select Illumina reads for the reference VNTR.

        Unmapped reads are processed in this process: with
        ``process_unmapped_read_with_dnn`` when a model file named after
        ``self.reference_vntr.id`` exists under ``settings.DNN_MODELS_DIR``,
        and with ``process_unmapped_read`` otherwise. Reads fetched from the
        VNTR region of the alignment file are scored with ``hmm.viterbi`` on
        the read and on its reverse complement; the higher-scoring orientation
        is kept.

        Args:
            alignment_file: Path to the alignment file; the open mode comes
                from ``get_alignment_file_read_mode``.
            unmapped_filtered_reads: Iterable of records exposing ``.seq``.
            update: When true, the selection is passed through
                ``iteratively_update_model`` before being returned.
            hmm: Optional matcher HMM. When falsy, one is built with
                ``get_vntr_matcher_hmm`` for the observed read length.

        Returns:
            The list of selected reads, or the result of
            ``iteratively_update_model`` when ``update`` is true.
        """
        recruitment_score = None
        dnn_model = None
        selected_reads = []
        vntr_bp_in_unmapped_reads = Value('d', 0.0)

        number_of_reads = 0
        # Fallback length; replaced by the length of the first read seen.
        read_length = 150

        for read_segment in unmapped_filtered_reads:
            if number_of_reads == 0:
                read_length = len(str(read_segment.seq))
            number_of_reads += 1
            if not hmm:
                hmm = self.get_vntr_matcher_hmm(read_length=read_length)
            if not recruitment_score:
                recruitment_score = self.get_min_score_to_select_a_read(
                    read_length)
                # The DNN model is optional: it is loaded only when its file
                # exists on disk.
                model_file = settings.DNN_MODELS_DIR + '/%s.hd5' % self.reference_vntr.id
                if os.path.exists(model_file):
                    dnn_model = load_model(model_file)

            # Reads shorter than the first read are skipped.
            if len(read_segment.seq) < read_length:
                continue

            if dnn_model is None:
                # No semaphore is needed here because the call is synchronous.
                self.process_unmapped_read(None, str(read_segment.seq), hmm,
                                           recruitment_score,
                                           vntr_bp_in_unmapped_reads,
                                           selected_reads)
            else:
                self.process_unmapped_read_with_dnn(str(read_segment.seq), hmm,
                                                    recruitment_score,
                                                    vntr_bp_in_unmapped_reads,
                                                    selected_reads, True,
                                                    dnn_model)

        logging.debug('vntr base pairs in unmapped reads: %s' %
                      vntr_bp_in_unmapped_reads.value)

        vntr_bp_in_mapped_reads = 0
        vntr_start = self.reference_vntr.start_point
        vntr_end = vntr_start + self.reference_vntr.get_length()
        read_mode = self.get_alignment_file_read_mode(alignment_file)
        samfile = pysam.AlignmentFile(
            alignment_file,
            read_mode,
            reference_filename=self.reference_filename)
        try:
            reference = get_reference_genome_of_alignment_file(samfile)
            if reference == 'HG19':
                chromosome = self.reference_vntr.chromosome
            else:
                # Drop the first three characters of the chromosome name
                # (presumably a 'chr' prefix - verify against the reference).
                chromosome = self.reference_vntr.chromosome[3:]

            for read in samfile.fetch(chromosome, vntr_start, vntr_end):
                # Lazily initialise from the first mapped read when the
                # unmapped pass did not already do so.
                if not recruitment_score:
                    read_length = len(read.seq)
                    recruitment_score = self.get_min_score_to_select_a_read(
                        read_length)
                if not hmm:
                    hmm = self.get_vntr_matcher_hmm(read_length=read_length)

                if read.is_unmapped:
                    continue
                if len(read.seq) < int(read_length * 0.9):
                    logging.debug('Rejecting read for short length: %s' %
                                  read.seq)
                    continue
                if read.reference_end:
                    read_end = read.reference_end
                else:
                    read_end = read.reference_start + len(read.seq)

                if not (vntr_start - read_length < read.reference_start <
                        vntr_end or vntr_start < read_end < vntr_end):
                    continue

                # Only reads without any 'N' base are scored and selected.
                if read.seq.count('N') <= 0:
                    sequence = str(read.seq).upper()
                    # Build the reverse complement once and reuse it.
                    reverse_sequence = str(
                        Seq(read.seq).reverse_complement()).upper()
                    logp, vpath = hmm.viterbi(sequence)
                    rev_logp, rev_vpath = hmm.viterbi(reverse_sequence)
                    if logp < rev_logp:
                        sequence = reverse_sequence
                        logp = rev_logp
                        vpath = rev_vpath
                    # A read is rejected when it is low quality or when it is
                    # not recruited.
                    if is_low_quality_read(read) or not self.recruit_read(
                            logp, vpath, recruitment_score, sequence):
                        logging.debug('Rejected Aligned Read: %s' % sequence)
                        continue
                    selected_reads.append(
                        SelectedRead(sequence, logp, vpath, read.mapq,
                                     read.reference_start))
                # Count the bases of this read that fall inside the VNTR.
                end = min(read_end, vntr_end)
                start = max(read.reference_start, vntr_start)
                vntr_bp_in_mapped_reads += end - start
        finally:
            # Release the file handle even if scoring a read fails.
            samfile.close()
        logging.debug('vntr base pairs in mapped reads: %s' %
                      vntr_bp_in_mapped_reads)

        if update:
            selected_reads = self.iteratively_update_model(
                alignment_file, unmapped_filtered_reads, selected_reads, hmm)

        return selected_reads