示例#1
0
 def __init__(self,
              normal_bam_filename,
              tumor_bam_filename,
              reference_genome_filename,
              input_filename_base,
              segments_bed,
              min_depth=20,
              min_bqual=10,
              min_mqual=10,
              process_num=1):
     """Store the conversion settings and create an empty ``Data`` object.

     Each argument is kept unchanged on the instance under its own name;
     this method itself opens no file and performs no validation.

     NOTE(review): judging by the names only, the first five arguments
     are file paths / a path prefix and ``min_*`` are filtering
     thresholds -- confirm against the code that consumes them.
     """
     # Paths and the output prefix.
     (self.normal_bam_filename,
      self.tumor_bam_filename,
      self.reference_genome_filename,
      self.input_filename_base,
      self.segments_bed) = (normal_bam_filename,
                            tumor_bam_filename,
                            reference_genome_filename,
                            input_filename_base,
                            segments_bed)

     # Thresholds and worker count.
     (self.min_depth,
      self.min_bqual,
      self.min_mqual,
      self.process_num) = (min_depth, min_bqual, min_mqual, process_num)

     # Container that later steps are expected to fill in.
     self.data = Data()
示例#2
0
    def __init__(self,
                 normal_bam_filename,
                 tumor_bam_filename,
                 reference_genome_filename,
                 input_filename_base,
                 segments_bed,
                 BICseq_bed_fileName_corrected,
                 pkl_path="",
                 max_copynumber=6,
                 subclone_num=1,
                 baseline_thred_LOH=0.3,
                 baseline_thred_APM=0.01,
                 min_depth=20,
                 min_bqual=10,
                 min_mqual=10,
                 process_num=1):
        """Store the conversion settings and create an empty ``Data`` object.

        Each argument is kept unchanged on the instance under its own name.
        The value of ``process_num`` is echoed to stdout just before it is
        stored.  This method itself opens no file and validates nothing.

        NOTE(review): the meaning of the individual settings (paths,
        copy-number limit, baseline thresholds, quality filters) is inferred
        from their names only -- confirm against the consuming code.
        """
        # Paths: inputs, output prefix, corrected interval file, pickle.
        (self.normal_bam_filename,
         self.tumor_bam_filename,
         self.reference_genome_filename,
         self.input_filename_base,
         self.segments_bed,
         self.BICseq_bed_fileName_corrected,
         self.pkl_path) = (normal_bam_filename,
                           tumor_bam_filename,
                           reference_genome_filename,
                           input_filename_base,
                           segments_bed,
                           BICseq_bed_fileName_corrected,
                           pkl_path)

        # Model / baseline settings.
        (self.max_copynumber,
         self.subclone_num,
         self.baseline_thred_LOH,
         self.baseline_thred_APM) = (max_copynumber,
                                     subclone_num,
                                     baseline_thred_LOH,
                                     baseline_thred_APM)

        # Filtering thresholds.
        (self.min_depth,
         self.min_bqual,
         self.min_mqual) = (min_depth, min_bqual, min_mqual)

        # Echo the worker count, then keep it.
        message = "process_num = {}".format(process_num)
        print(message)
        self.process_num = process_num

        # Container that later steps are expected to fill in.
        self.data = Data()
示例#3
0
class MixClone_Converter:
    """Drive the conversion of paired BAM input into a MixClone input pickle.

    ``convert`` is the entry point.  It either reloads a previously pickled
    ``Data`` object or loads segments, optionally GC-corrects the tumor read
    counts and collects per-segment counts in worker processes.  It then
    writes the corrected interval table, runs baseline selection and pickles
    ``self.data`` to ``<input_filename_base>.MixClone.input.pkl``.

    All prints are written as single-argument ``print(...)`` calls so that
    the module emits the same text under Python 2 and also parses under
    Python 3.
    """

    def __init__(self,
                 normal_bam_filename,
                 tumor_bam_filename,
                 reference_genome_filename,
                 input_filename_base,
                 segments_bed,
                 BICseq_bed_fileName_corrected,
                 pkl_path="",
                 max_copynumber=6,
                 subclone_num=1,
                 baseline_thred_LOH=0.3,
                 baseline_thred_APM=0.01,
                 min_depth=20,
                 min_bqual=10,
                 min_mqual=10,
                 process_num=1):
        """Store the settings and create an empty ``Data`` container.

        Args:
            normal_bam_filename: Path opened with ``pysam.Samfile`` and also
                forwarded to ``process_by_segment``.
            tumor_bam_filename: Same as above, for the tumor sample.
            reference_genome_filename: Forwarded to ``process_by_segment``.
            input_filename_base: Prefix of the output pickle file name.
            segments_bed: Passed to ``Data.load_segmentsn``.
            BICseq_bed_fileName_corrected: Path of the tab-separated
                interval table written by ``_output``.
            pkl_path: Pickle to reload in ``convert`` when ``pkl_flag`` is
                true; the empty string disables reloading.
            max_copynumber: Forwarded to ``MCMCLM`` and
                ``Data.compute_Lambda_S_LOH``.
            subclone_num: Forwarded to ``MCMCLM`` and
                ``Data.compute_Lambda_S_LOH``.
            baseline_thred_LOH: Forwarded to ``Data.get_LOH_status``.
            baseline_thred_APM: Forwarded to ``Data.get_APM_status``.
            min_depth: Forwarded to ``process_by_segment``.
            min_bqual: Forwarded to ``process_by_segment``.
            min_mqual: Forwarded to ``process_by_segment``.
            process_num: Upper bound on the number of worker processes used
                by ``_get_counts``.
        """
        self.normal_bam_filename = normal_bam_filename
        self.tumor_bam_filename = tumor_bam_filename
        self.reference_genome_filename = reference_genome_filename
        self.input_filename_base = input_filename_base
        self.segments_bed = segments_bed
        self.BICseq_bed_fileName_corrected = BICseq_bed_fileName_corrected
        self.pkl_path = pkl_path

        self.max_copynumber = max_copynumber
        self.subclone_num = subclone_num
        self.baseline_thred_LOH = baseline_thred_LOH
        self.baseline_thred_APM = baseline_thred_APM

        self.min_depth = min_depth
        self.min_bqual = min_bqual
        self.min_mqual = min_mqual
        print("process_num = {}".format(process_num))
        self.process_num = process_num

        self.data = Data()

    def convert(self, method, pkl_flag=False):
        """Run the whole conversion and write the MixClone input pickle.

        Args:
            method: ``"auto"`` selects the MCMC GC correction, ``"visual"``
                the interactive one.  Any other value skips GC correction.
            pkl_flag: When true and ``self.pkl_path`` is non-empty, reload
                ``self.data`` from that pickle instead of recomputing it.
        """
        if pkl_flag and self.pkl_path != "":
            print("load pkl from")
            print(self.pkl_path)
            # NOTE(security): unpickling can execute arbitrary code; only
            # point pkl_path at files produced by a trusted run.
            with open(self.pkl_path, 'rb') as infile:
                self.data = pkl.load(infile)
        else:
            self._load_segments()
            print("MixClone converter converting")

            if "auto" == method:
                self._MCMC_gccorrection()
            elif "visual" == method:
                self._visual_gccorrection()
                sys.stdout.flush()
            self._get_counts()

        self._output()
        self._baseline_selection()

        data_file_name = self.input_filename_base + '.MixClone.input.pkl'
        # "with" guarantees the handle is closed even if pickling fails.
        with open(data_file_name, 'wb') as outfile:
            pkl.dump(self.data, outfile, protocol=2)

    def _MCMC_gccorrection(self):
        """Estimate the GC slope/intercept with ``MCMCLM`` and apply them.

        The interception is irrelevant for correction, set as median
        MCMCLM only returns the m and c, then correct the data here
        """
        mcmclm = MCMCLM(self.data, 0, self.subclone_num, self.max_copynumber)
        m, c = mcmclm.run()
        print("MCMC slope = {}".format(m))
        self._correct(m, c)

    def _correct(self, slope, intercept):
        """Remove a linear GC trend from the tumor read counts, in place.

        For every segment the log ratio
        ``y = log(tumor + 1) - log(normal + 1)`` is replaced by
        ``y - (slope * gc + intercept) + median(y)`` and
        ``tumor_reads_num`` is recomputed from the corrected ratio, so the
        stored counts become floating-point values.

        Args:
            slope: Slope of the fitted line of ``y`` against GC content.
            intercept: Intercept of that line.
        """
        segments = self.data.segments

        # List comprehensions instead of map(): np.array(map(...)) only
        # yields a numeric array on Python 2.
        x = np.array([seg.gc for seg in segments])
        y = np.array([
            np.log(seg.tumor_reads_num + 1) - np.log(seg.normal_reads_num + 1)
            for seg in segments
        ])

        # Re-centre on the median so the overall level is preserved.
        K = np.percentile(y, 50)
        A = slope * x + intercept
        y_corrected = y - A + K

        for seg, y_i in zip(segments, y_corrected):
            seg.tumor_reads_num = np.exp(
                y_i + np.log(seg.normal_reads_num + 1)) - 1

        print("gc corrected, with slope = {0}, intercept = {1}".format(
            slope, intercept))

    def _visual_gccorrection(self):
        """Obtain slope/intercept from ``GCStripePlot`` and apply them.

        NOTE(review): ``self.sampleNumber`` is not assigned anywhere in this
        class, so this raises ``AttributeError`` unless the caller sets it
        on the instance beforehand -- confirm the intended source.
        """
        gsp = GCStripePlot(self.data.segments, self.sampleNumber)
        print("total number: {}".format(self.data.seg_num))

        #       Sampling then linear regression, poor performance
        #       gsp.sampleln([i * 1000 for i in range(1,9)], 100)

        gsp.plot()

        # TODO: trimmed x, y position
        x, y, m, c = gsp.output()

        print("x, y, m, c")
        # Same text as the Python 2 statement ``print x, y, m, c``.
        print(" ".join(str(value) for value in (x, y, m, c)))

        self._correct(m, c)

    def _baseline_selection(self):
        """Run the LOH/APM baseline steps on ``self.data`` in fixed order."""
        print("begin baseline selection..")
        self._get_LOH_frac()
        self._get_LOH_status()
        self._get_APM_frac()
        self._get_APM_status()
        self._compute_Lambda_S()

    def _get_APM_status(self):
        """Delegate to ``Data.get_APM_status`` with the APM threshold."""
        self.data.get_APM_status(self.baseline_thred_APM)

    def _get_LOH_status(self):
        """Delegate to ``Data.get_LOH_status`` with the LOH threshold."""
        self.data.get_LOH_status(self.baseline_thred_LOH,
                                 flag_runpreprocess=True)

    def _compute_Lambda_S(self):
        """Delegate to ``Data.compute_Lambda_S_LOH``."""
        print("begin compute lambda s ..")
        self.data.compute_Lambda_S_LOH(self.max_copynumber,
                                       self.subclone_num,
                                       flag_runpreprocess=True)

    def _output(self):
        """Output the parameter for THetA

        The Upper and Lower Boundaries for normal heuristic
        The GC corrected interval_count_file

        Writes one header line and one tab-separated line per segment to
        ``self.BICseq_bed_fileName_corrected``.
        """
        # "with" closes the file even if a segment lacks an attribute.
        with open(self.BICseq_bed_fileName_corrected, 'w') as \
                interval_count_file:
            interval_count_file.write(
                "ID\tchrm\tstart\tend\ttumorCount\tnormalCount\tgc\n")

            for seg in self.data.segments:
                interval_count_file.write(
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                        seg.chrom_idx, seg.chrom_name, seg.start, seg.end,
                        seg.tumor_reads_num, seg.normal_reads_num, seg.gc))

        print("GC corrected interval file generated!")
        sys.stdout.flush()

    def _load_segmentsn(self):
        """Load segments through ``Data.load_segmentsn`` with both BAMs.

        NOTE(review): this passes three arguments to ``load_segmentsn``
        while ``_load_segments`` passes one; nothing in this class calls
        this method -- confirm which signature is current.
        """
        normal_bam = pysam.Samfile(self.normal_bam_filename, 'rb')
        try:
            tumor_bam = pysam.Samfile(self.tumor_bam_filename, 'rb')
            try:
                print('Loading normalized segments by {0}...'.format(
                    self.segments_bed))
                sys.stdout.flush()
                self.data.load_segmentsn(normal_bam, tumor_bam,
                                         self.segments_bed)
            finally:
                tumor_bam.close()
        finally:
            # Close the handles even when loading raises.
            normal_bam.close()

    def _load_segments(self):
        """Load the segments (with GC) from ``self.segments_bed``.

        Both BAM files are still opened and closed although the active
        loader call does not receive them; this keeps the original
        behaviour of failing here if either file cannot be opened.
        """
        normal_bam = pysam.Samfile(self.normal_bam_filename, 'rb')
        try:
            tumor_bam = pysam.Samfile(self.tumor_bam_filename, 'rb')
            try:
                print('Loading segments with gc by {0}...'.format(
                    self.segments_bed))
                sys.stdout.flush()
                # Earlier variant, kept for reference:
                # self.data.load_segments(normal_bam, tumor_bam,
                #                         self.segments_bed)
                self.data.load_segmentsn(self.segments_bed)
            finally:
                tumor_bam.close()
        finally:
            # Close the handles even when loading raises.
            normal_bam.close()

    def _get_counts(self):
        """Collect per-segment counts in a pool of worker processes.

        ``process_by_segment`` is mapped over one argument tuple per
        segment; each result is unpacked into the segment's
        ``paired_counts`` and ``BAF_counts``.  Returns immediately when
        there are no segments.
        """
        seg_num = self.data.seg_num
        process_num = self.process_num
        print("process_num = {}".format(process_num))

        if seg_num == 0:
            # Clamping below would request a pool of 0 workers, which
            # multiprocessing rejects with ValueError.
            return

        # Never start more workers than there are segments.
        if process_num > seg_num:
            process_num = seg_num

        segments = self.data.segments
        args_list = [
            (segments[j].name, segments[j].chrom_name, segments[j].chrom_idx,
             segments[j].start, segments[j].end,
             self.normal_bam_filename, self.tumor_bam_filename,
             self.reference_genome_filename, self.min_depth,
             self.min_bqual, self.min_mqual)
            for j in range(seg_num)
        ]

        pool = Pool(processes=process_num)
        try:
            counts_tuple_list = pool.map(process_by_segment, args_list)
        finally:
            # Release the worker processes on success and on failure.
            pool.close()
            pool.join()

        for j in range(seg_num):
            paired_counts_j, BAF_counts_j = counts_tuple_list[j]

            segments[j].paired_counts = paired_counts_j
            segments[j].BAF_counts = BAF_counts_j

    def _get_LOH_frac(self):
        """Delegate to ``Data.get_LOH_frac``."""
        self.data.get_LOH_frac()

    def _get_APM_frac(self):
        """Delegate to ``Data.get_APM_frac``."""
        self.data.get_APM_frac()
示例#4
0
class BamToDataConverter:
    """Convert paired BAM input plus a segment BED into a MixClone pickle.

    ``convert`` loads the segments, collects per-segment counts in worker
    processes, computes the LOH fraction and pickles ``self.data`` to
    ``<input_filename_base>.MixClone.input.pkl``.

    Prints are written as single-argument ``print(...)`` calls so that the
    module emits the same text under Python 2 and also parses under
    Python 3.
    """

    def __init__(self, normal_bam_filename, tumor_bam_filename,
                 reference_genome_filename, input_filename_base, segments_bed,
                 min_depth=20, min_bqual=10, min_mqual=10, process_num=1):
        """Store the settings and create an empty ``Data`` container.

        Args:
            normal_bam_filename: Path opened with ``pysam.Samfile`` and also
                forwarded to ``process_by_segment``.
            tumor_bam_filename: Same as above, for the tumor sample.
            reference_genome_filename: Forwarded to ``process_by_segment``.
            input_filename_base: Prefix of the output pickle file name.
            segments_bed: Passed to ``Data.load_segments``.
            min_depth: Forwarded to ``process_by_segment``.
            min_bqual: Forwarded to ``process_by_segment``.
            min_mqual: Forwarded to ``process_by_segment``.
            process_num: Upper bound on the number of worker processes used
                by ``_get_counts``.
        """
        self.normal_bam_filename = normal_bam_filename
        self.tumor_bam_filename = tumor_bam_filename
        self.reference_genome_filename = reference_genome_filename
        self.input_filename_base = input_filename_base
        self.segments_bed = segments_bed

        self.min_depth = min_depth
        self.min_bqual = min_bqual
        self.min_mqual = min_mqual
        self.process_num = process_num

        self.data = Data()

    def convert(self):
        """Run the whole conversion and write the MixClone input pickle."""
        self._load_segments()

        self._get_counts()

        self._get_LOH_frac()

        data_file_name = self.input_filename_base + '.MixClone.input.pkl'
        # "with" guarantees the handle is closed even if pickling fails.
        with open(data_file_name, 'wb') as outfile:
            pkl.dump(self.data, outfile, protocol=2)

    def _load_segments(self):
        """Load the segments via ``Data.load_segments`` using both BAMs."""
        normal_bam = pysam.Samfile(self.normal_bam_filename, 'rb')
        try:
            tumor_bam = pysam.Samfile(self.tumor_bam_filename, 'rb')
            try:
                print('Loading segments by {0}...'.format(self.segments_bed))
                sys.stdout.flush()
                self.data.load_segments(normal_bam, tumor_bam,
                                        self.segments_bed)
            finally:
                tumor_bam.close()
        finally:
            # Close the handles even when loading raises.
            normal_bam.close()

    def _get_counts(self):
        """Collect per-segment counts in a pool of worker processes.

        ``process_by_segment`` is mapped over one argument tuple per
        segment; each result is unpacked into the segment's
        ``paired_counts`` and ``BAF_counts``.  Returns immediately when
        there are no segments.
        """
        seg_num = self.data.seg_num
        process_num = self.process_num

        if seg_num == 0:
            # Clamping below would request a pool of 0 workers, which
            # multiprocessing rejects with ValueError.
            return

        # Never start more workers than there are segments.
        if process_num > seg_num:
            process_num = seg_num

        segments = self.data.segments
        args_list = [
            (segments[j].name, segments[j].chrom_name, segments[j].chrom_idx,
             segments[j].start, segments[j].end,
             self.normal_bam_filename, self.tumor_bam_filename,
             self.reference_genome_filename,
             self.min_depth, self.min_bqual, self.min_mqual)
            for j in range(seg_num)
        ]

        pool = Pool(processes=process_num)
        try:
            counts_tuple_list = pool.map(process_by_segment, args_list)
        finally:
            # Release the worker processes on success and on failure.
            pool.close()
            pool.join()

        for j in range(seg_num):
            paired_counts_j, BAF_counts_j = counts_tuple_list[j]

            segments[j].paired_counts = paired_counts_j
            segments[j].BAF_counts = BAF_counts_j

    def _get_LOH_frac(self):
        """Delegate to ``Data.get_LOH_frac``."""
        self.data.get_LOH_frac()