def jcnt_to_cncnt(args):
    """Partition joint-count rows by segment and write them to a counts file.

    Reads a tab-delimited segment file whose first four columns are
    chromosome, start, stop and copy-number status.  For every segment on a
    chromosome present in the joint-counts file, the rows whose ``position``
    lies in the closed interval ``[start, stop]`` are appended to the output
    file under that segment's (remapped) copy-number status.

    Args:
        args: Object exposing ``jcnt_file_name`` (input joint counts),
            ``cncnt_file_name`` (output path) and ``segment_file_name``
            (tab-delimited segment table).

    Raises:
        ValueError: If a start/stop column cannot be parsed as an integer.
        IndexError: If a segment row has fewer than four columns.
    """
    # The segment file may name the sex chromosomes numerically.
    chr_aliases = {'23': 'X', '24': 'Y'}
    # Status codes 7-11 are rewritten before output; all others pass through.
    # NOTE(review): the meaning of these codes is not visible here -- confirm.
    cn_status_remap = {'7': '1', '8': '2', '9': '4', '10': '5', '11': '6'}

    reader = JointCountsReader(args.jcnt_file_name)
    cncnt_file = None
    try:
        chr_list = reader.get_chr_list()
        cncnt_file = ConanCountsFile(args.cncnt_file_name, 'w')

        # Context manager guarantees the segment file handle is released.
        with open(args.segment_file_name) as segment_file:
            for row in csv.reader(segment_file, delimiter='\t'):
                print(row)

                chr_name = chr_aliases.get(row[0], row[0])
                start = int(row[1])
                stop = int(row[2])
                cn_status = cn_status_remap.get(row[3], row[3])

                if chr_name not in chr_list:
                    continue

                rows = reader.get_rows(chr_name)
                in_segment = np.logical_and(rows['position'] >= start,
                                            rows['position'] <= stop)
                segment_rows = rows[in_segment]

                if len(segment_rows) == 0:
                    continue

                cncnt_file.add_rows(cn_status, chr_name, segment_rows)
    finally:
        # Close in the same order as before, but now also on failure.
        reader.close()
        if cncnt_file is not None:
            cncnt_file.close()
class JointModelRunner(ModelRunner):
    """Runner that fits one joint model and applies it chromosome by chromosome."""

    def run(self, args):
        """Attach the joint-counts reader and JSM writer, then run the base workflow."""
        self.reader = JointCountsReader(args.jcnt_file_name)
        self.writer = JointSnvMixWriter(args.jsm_file_name)

        ModelRunner.run(self, args)

    def _train(self, args):
        """Load priors and train the model, storing the result in ``self.parameters``.

        When ``args.subsample_size`` is positive the training counts come
        from ``self._subsample``; otherwise all counts from the reader are used.
        """
        counts = (self._subsample(args.subsample_size)
                  if args.subsample_size > 0
                  else self.reader.get_counts())

        self.priors_parser.load_from_file(args.priors_file)
        self.priors = self.priors_parser.to_dict()
        self._write_priors()

        training_data = JointData(counts)
        self.parameters = self.model.train(training_data,
                                           self.priors,
                                           args.max_iters,
                                           args.convergence_threshold)

    def _classify_chromosome(self, chr_name):
        """Classify one chromosome in fixed-size chunks and write each chunk out."""
        counts = self.reader.get_counts(chr_name)
        jcnt_rows = self.reader.get_rows(chr_name)
        chr_size = self.reader.get_chr_size(chr_name)

        # Work through the chromosome 100,000 rows at a time.
        chunk_size = int(1e5)

        for lo in range(0, chr_size, chunk_size):
            hi = min(lo + chunk_size, chr_size)

            chunk_data = JointData(counts[lo:hi])
            chunk_rows = jcnt_rows[lo:hi]

            resp = self.model.classify(chunk_data, self.parameters)
            self.writer.write_data(chr_name, chunk_rows, resp)
def run(self, args):
    """Classify every chromosome in the joint-counts file and write a TSV.

    Chromosomes are processed in sorted name order through
    ``self._classify_chromosome``, which writes via ``self.writer``.

    Args:
        args: Object exposing ``jcnt_file_name`` (input joint counts) and
            ``tsv_file_name`` (tab-delimited output path).
    """
    self.reader = JointCountsReader(args.jcnt_file_name)
    try:
        # Keep the handle so the output is flushed and closed deterministically
        # instead of depending on garbage collection of an anonymous file.
        with open(args.tsv_file_name, 'w') as tsv_file:
            self.writer = csv.writer(tsv_file, delimiter='\t')

            for chr_name in sorted(self.reader.get_chr_list()):
                self._classify_chromosome(chr_name)
    finally:
        # Release the reader even if classification fails part-way.
        self.reader.close()
class FisherRunner(object):
    """Label joint-count rows with a class name and write them to a TSV file.

    NOTE(review): ``self.model`` is used by ``_classify_chromosome`` but is
    not assigned anywhere in this class; presumably a subclass or the caller
    sets it before ``run`` is invoked -- confirm.
    """

    def __init__(self):
        self.data_class = JointData
        # Index in this tuple corresponds to the integer label from the model.
        self.classes = ('Reference', 'Germline', 'Somatic', 'LOH', 'Unknown')

    def run(self, args):
        """Classify every chromosome in sorted order and write the results.

        Args:
            args: Object exposing ``jcnt_file_name`` (input joint counts) and
                ``tsv_file_name`` (tab-delimited output path).
        """
        self.reader = JointCountsReader(args.jcnt_file_name)
        try:
            # Hold the handle so the output is flushed and closed on exit
            # rather than leaked behind the csv writer.
            with open(args.tsv_file_name, 'w') as tsv_file:
                self.writer = csv.writer(tsv_file, delimiter='\t')

                for chr_name in sorted(self.reader.get_chr_list()):
                    self._classify_chromosome(chr_name)
        finally:
            # Release the reader even if classification fails part-way.
            self.reader.close()

    def _classify_chromosome(self, chr_name):
        """Classify one chromosome in chunks of 100,000 rows and write each chunk."""
        counts = self.reader.get_counts(chr_name)
        jcnt_rows = self.reader.get_rows(chr_name)
        end = self.reader.get_chr_size(chr_name)

        chunk_size = int(1e5)

        for start in range(0, end, chunk_size):
            stop = min(start + chunk_size, end)

            data = self.data_class(counts[start:stop])
            labels = self.model.classify(data)

            self._write_rows(chr_name, jcnt_rows[start:stop], labels)

    def _write_rows(self, chr_name, rows, labels):
        """Write one output row per input row: chromosome, row fields, class name.

        Args:
            chr_name: Chromosome name placed in the first output column.
            rows: Iterable of row records; each is expanded into columns.
            labels: Sequence indexable by row position, convertible to ``int``
                indices into ``self.classes``.
        """
        for i, row in enumerate(rows):
            class_name = self.classes[int(labels[i])]

            out_row = [chr_name]
            out_row.extend(row)
            out_row.append(class_name)

            # Somatic calls are additionally echoed to standard output.
            if class_name == 'Somatic':
                print(out_row)

            self.writer.writerow(out_row)
def run(self, args):
    """Attach the joint-counts reader and JSM writer, then run the base workflow.

    Args:
        args: Object exposing ``jcnt_file_name`` (input joint counts) and
            ``jsm_file_name`` (output path), plus whatever ``ModelRunner.run``
            reads from it.
    """
    counts_reader = JointCountsReader(args.jcnt_file_name)
    self.reader = counts_reader

    jsm_writer = JointSnvMixWriter(args.jsm_file_name)
    self.writer = jsm_writer

    # Explicit base-class call: the shared workflow lives on ModelRunner.
    ModelRunner.run(self, args)
class ChromosomeModelRunner(ModelRunner):
    """Runner that trains and applies a separate parameter set per chromosome."""

    def run(self, args):
        """Attach the joint-counts reader and JSM writer, then run the base workflow."""
        self.reader = JointCountsReader(args.jcnt_file_name)
        self.writer = JointSnvMixWriter(args.jsm_file_name)

        ModelRunner.run(self, args)

    def _train(self, args):
        """Load priors, then train once per chromosome in sorted name order.

        The trained parameters are stored in ``self.parameters`` keyed by
        chromosome name.  When ``args.subsample_size`` is positive each
        chromosome is trained on a random subsample of its counts.
        """
        self.priors_parser.load_from_file(args.priors_file)
        self.priors = self.priors_parser.to_dict()
        self._write_priors()

        chromosomes = sorted(self.reader.get_chr_list())

        self.parameters = {}

        for chr_name in chromosomes:
            # Progress indicator: one line per chromosome.
            print(chr_name)

            counts = (self._chrom_subsample(chr_name, args.subsample_size)
                      if args.subsample_size > 0
                      else self.reader.get_counts(chr_name))

            training_data = self.data_class(counts)
            self.parameters[chr_name] = self.model.train(training_data,
                                                         self.priors,
                                                         args.max_iters,
                                                         args.convergence_threshold)

    def _classify_chromosome(self, chr_name):
        """Classify one chromosome in chunks using that chromosome's parameters."""
        counts = self.reader.get_counts(chr_name)
        jcnt_rows = self.reader.get_rows(chr_name)
        chr_size = self.reader.get_chr_size(chr_name)

        # Work through the chromosome 100,000 rows at a time.
        chunk_size = int(1e5)

        for lo in range(0, chr_size, chunk_size):
            hi = min(lo + chunk_size, chr_size)

            chunk_data = self.data_class(counts[lo:hi])
            chunk_rows = jcnt_rows[lo:hi]

            resp = self.model.classify(chunk_data, self.parameters[chr_name])
            self.writer.write_data(chr_name, chunk_rows, resp)

    def _chrom_subsample(self, chr_name, sample_size):
        """Return a random sample (without replacement) of a chromosome's counts.

        The sample size is capped at the chromosome size.
        """
        total = self.reader.get_chr_size(chr_name=chr_name)
        n_draws = min(total, sample_size)

        picked = random.sample(xrange(total), n_draws)

        all_counts = self.reader.get_counts(chr_name)
        return all_counts[picked]
class IndependentModelRunner(ModelRunner):
    """Runner that models each genome independently and combines the results."""

    def run(self, args):
        """Attach the joint-counts reader and JSM writer, then run the base workflow."""
        self.reader = JointCountsReader(args.jcnt_file_name)
        self.writer = JointSnvMixWriter(args.jsm_file_name)

        ModelRunner.run(self, args)

    def _train(self, args):
        """Load priors and train one parameter set per genome in ``constants.genomes``.

        When ``args.subsample_size`` is positive the training counts come
        from ``self._subsample``; otherwise all counts from the reader are used.
        """
        counts = (self._subsample(args.subsample_size)
                  if args.subsample_size > 0
                  else self.reader.get_counts())

        self.priors_parser.load_from_file(args.priors_file)
        self.priors = self.priors_parser.to_dict()
        self._write_priors()

        self.parameters = {}

        for genome in constants.genomes:
            genome_data = IndependentData(counts, genome)
            self.parameters[genome] = self.model.train(genome_data,
                                                       self.priors[genome],
                                                       args.max_iters,
                                                       args.convergence_threshold)

    def _classify_chromosome(self, chr_name):
        """Classify one chromosome in chunks, per genome, and write joint results."""
        counts = self.reader.get_counts(chr_name)
        jcnt_rows = self.reader.get_rows(chr_name)
        chr_size = self.reader.get_chr_size(chr_name)

        # Work through the chromosome 100,000 rows at a time.
        chunk_size = int(1e5)

        for lo in range(0, chr_size, chunk_size):
            hi = min(lo + chunk_size, chr_size)

            chunk_counts = counts[lo:hi]
            chunk_rows = jcnt_rows[lo:hi]

            per_genome = {}
            for genome in constants.genomes:
                genome_data = IndependentData(chunk_counts, genome)
                per_genome[genome] = self.model.classify(genome_data,
                                                         self.parameters[genome])

            joint_resp = self._get_joint_responsibilities(per_genome)
            self.writer.write_data(chr_name, chunk_rows, joint_resp)

    def _get_joint_responsibilities(self, resp):
        """Combine per-genome responsibilities into one joint matrix.

        For each normal class column, the log normal responsibility is added
        to every log tumour responsibility; the resulting blocks are stacked
        side by side (normal class varying slowest) and exponentiated.

        Args:
            resp: Mapping with 2-D arrays under the keys ``'normal'`` and
                ``'tumour'``.

        Returns:
            Array with one column per (normal class, tumour class) pair.
        """
        log_normal = np.log(resp['normal'])
        log_tumour = np.log(resp['tumour'])

        n_normal_classes = log_normal.shape[1]

        # A width-1 column slice broadcasts against every tumour column.
        blocks = [log_normal[:, k:k + 1] + log_tumour
                  for k in range(n_normal_classes)]

        return np.exp(np.hstack(blocks))