def run(self, sample_id, chr_id, win_size=1000, min_r=0.1, stride_size=200):
    """Yield per-window feature results for one chromosome.

    Gap segments are yielded as (start, end, length, 0, None); valid
    segments are sliced into win_size windows (stride_size apart) and each
    window yields whatever _get_feats_region returns.

    :param sample_id: sample identifier (logging only).
    :param chr_id: chromosome id without the 'chr' prefix.
    :param win_size: sliding-window length in bp.
    :param min_r: minimum ratio forwarded to _get_feats_region.
    :param stride_size: step between consecutive window starts.
    :return: generator of per-segment / per-window results.
    """
    ref_gap_chr = self.ref_gap_obj.loc[
        self.ref_gap_obj['CHROM'] == 'chr' + chr_id,
        ['START', 'END']] if not self.ref_gap_obj.empty else None

    # reference base
    logger.info(
        'Loading reference sequence for sample {} chr: {}...'.format(
            sample_id, chr_id))
    self.rb_base_chr = self.ref_fa_obj.fetch('chr' + chr_id)
    chr_len = len(self.rb_base_chr)

    # reference mappabillity
    logger.info(
        'Loading reference mappability for sample {} chr: {}...'.format(
            sample_id, chr_id))
    # FIX: fetch the full chromosome. The previous end-point chr_len - 1
    # dropped the last base (pyBigWig end coordinates are exclusive), and
    # the sibling implementation below already uses chr_len.
    self.rb_mappability_chr = self.ref_bw_obj.values(
        'chr' + chr_id, 0, chr_len)

    # np.int was removed in NumPy >= 1.24; plain int is equivalent here
    fil_pos = np.array([], dtype=int)
    if ref_gap_chr is not None:
        for _, i_row in ref_gap_chr.iterrows():
            # END is excluded
            fil_pos = np.concatenate(
                (fil_pos, np.arange(i_row['START'], i_row['END'])))
    rb_base_pos = np.ones(chr_len, dtype=int)
    rb_base_pos[fil_pos] = 0

    seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
    assert len(seg_values) == len(seg_starts) == len(seg_lengths)
    for val_idx, i_val in enumerate(seg_values):
        if i_val == 0:
            # gap region: coordinates only, no feature matrix
            yield (seg_starts[val_idx],
                   seg_starts[val_idx] + seg_lengths[val_idx],
                   seg_lengths[val_idx], 0, None)
        else:
            i_seg_start = seg_starts[val_idx]
            i_seg_len = seg_lengths[val_idx]
            if i_seg_len >= win_size:
                i_start_indices, remain_len = seq_slide(
                    i_seg_len, win_size, stride_size)
                for i in i_start_indices:
                    i_w_start = i + i_seg_start
                    i_w_end = i_w_start + win_size
                    yield self._get_feats_region(chr_id, i_w_start, i_w_end,
                                                 win_size, min_r)
                if remain_len > 0:
                    # BUG FIX: the tail window must be anchored at the
                    # segment start. The old code used i_seg_len - win_size,
                    # which is only correct for a segment at position 0.
                    i_w_start = i_seg_start + i_seg_len - win_size
                    i_w_end = i_seg_start + i_seg_len
                    yield self._get_feats_region(chr_id, i_w_start, i_w_end,
                                                 win_size, min_r)
            else:
                # short segment: one window covering the whole segment
                yield self._get_feats_region(chr_id, i_seg_start,
                                             i_seg_start + i_seg_len,
                                             win_size, min_r)
def main(args):
    """Merge per-window CNV calls into segments and write the final table.

    Reads the per-window call TSV named by *args*, merges consecutive
    "predictive" windows (indicator == 3) in parallel, re-attaches the
    unmerged windows, and writes a sorted tab-separated result file.

    NOTE(review): this function communicates with the worker processes
    through module-level globals (win_size, step_size, min_seg_len, cnv_df)
    that mp.Pool workers read after fork -- do not reorder the global
    assignments relative to the Pool creation.
    """
    # input cnv call result
    # window/step sizes are globals so multi_run_wrapper workers can see them
    global win_size
    win_size = args.win_size
    global step_size
    step_size = args.step_size
    in_fname = args.fname
    in_dir = args.i_root_dir
    out_dir = args.out_root_dir
    sample_id = args.sample_id
    chr_id = args.chr_id
    n_cpus = args.cpus
    out_type = args.out_type
    global min_seg_len
    min_seg_len = 5
    in_full_name = os.path.join(in_dir, in_fname)
    if not os.path.exists(in_full_name):
        raise FileNotFoundError('file not found {}'.format(in_full_name))
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    global cnv_df
    cnv_df = pd.read_csv(in_full_name, sep='\t')
    # p_neu == -1 marks windows without valid probabilities; blank them out
    cnv_df.loc[cnv_df['p_neu'].values.astype(int) == -1,
               ['p_neu', 'p_del', 'p_dup']] = np.nan
    # find predictive region
    # indicator == 3 means the window is predictable; find_seg groups runs
    # of consecutive predictable rows into segments
    cnv_df['pred_ind'] = np.where(cnv_df['indicator'] == 3, 1, 0)
    seg_values, seg_starts, seg_lengths = find_seg(cnv_df['pred_ind'].values)
    predictive_indices = np.where(seg_values == 1)[0]
    seg_start_pd_row_indices = seg_starts[predictive_indices]
    # index start at 0, end point should not include
    seg_end_pd_row_indices = seg_start_pd_row_indices + seg_lengths[
        predictive_indices]
    logger.info('segmenting and merging...')
    # (row_start, row_end) pairs, one per run of predictable windows
    pred_seg_ind_lst = list(
        zip(seg_start_pd_row_indices, seg_end_pd_row_indices))
    len_segs = len(pred_seg_ind_lst)
    out_res = dict()
    start_pos = []
    end_pos = []
    merg_p_neu = []
    merg_p_del = []
    merg_p_dup = []
    merg_pre_l = []
    locker = mp.Lock()
    # with ThreadPool(n_proc) as p, h5py.File(online_out_sample_data_fn, 'w') as h5_out:
    # merge each predictable run in a worker; imap preserves input order,
    # so index i maps back to pred_seg_ind_lst[i]
    with mp.Pool(n_cpus, initializer=mp_init, initargs=(locker, )) as p:
        results = p.imap(multi_run_wrapper, pred_seg_ind_lst)
        for i, res in enumerate(results):
            logger.info('finished at {}/{}'.format(i + 1, len_segs))
            if res is None:
                # worker could not merge this run; it is skipped entirely
                logger.info('{}:{} cannot merge'.format(
                    pred_seg_ind_lst[i][0], pred_seg_ind_lst[i][1]))
                continue
            re_start_i, re_end_i, re_merg_p_neu_i, re_merg_p_del_i, re_merg_p_dup_i, re_merg_pre_l_i = res
            # logger.info(re_start_i)
            start_pos.extend(re_start_i)
            end_pos.extend(re_end_i)
            merg_p_neu.extend(re_merg_p_neu_i)
            merg_p_del.extend(re_merg_p_del_i)
            merg_p_dup.extend(re_merg_p_dup_i)
            merg_pre_l.extend(re_merg_pre_l_i)
    # assemble the merged (predictable) part of the output table
    out_res['POS_S'] = start_pos
    out_res['POS_E'] = end_pos
    out_res['LEN'] = np.array(end_pos) - np.array(start_pos)
    out_res['P_NEU'] = merg_p_neu
    out_res['P_DEL'] = merg_p_del
    out_res['P_DUP'] = merg_p_dup
    out_res['PRED_L'] = merg_pre_l
    out_cnv_df = pd.DataFrame(data=out_res)
    # non-predictable windows are carried over unmerged
    un_pred_df = cnv_df.loc[
        cnv_df['indicator'] != 3,
        ['seg_s', 'seg_e', 'p_neu', 'p_del', 'p_dup', 'pred_l', 'indicator']]
    # NOTE(review): indicator 1/2 rows get a fixed 200 bp end -- presumably
    # the window step; confirm this matches step_size upstream.
    un_pred_df.loc[(cnv_df['indicator'] == 1) | (cnv_df['indicator'] == 2), 'seg_e'] =\
        un_pred_df.loc[(cnv_df['indicator'] == 1) | (cnv_df['indicator'] == 2), 'seg_s'] + 200
    un_pred_df['seg_len'] = un_pred_df['seg_e'] - un_pred_df['seg_s']
    un_pred_df = un_pred_df[[
        'seg_s', 'seg_e', 'seg_len', 'p_neu', 'p_del', 'p_dup', 'pred_l'
    ]]
    # concatenate merged + unmerged rows and sort by start position
    whl_cnv_re = np.concatenate((out_cnv_df.values, un_pred_df.values),
                                axis=0)
    ind = np.argsort(whl_cnv_re[:, 0])
    whl_cnv_re = whl_cnv_re[ind]
    f_out_df = pd.DataFrame(
        data=whl_cnv_re,
        columns=['POS_S', 'POS_E', 'LEN', 'P_NEU', 'P_DEL', 'P_DUP',
                 'PRED_L'])
    # np.concatenate upcast everything to object/float; restore int columns
    f_out_df['POS_S'] = f_out_df['POS_S'].astype(int)
    f_out_df['POS_E'] = f_out_df['POS_E'].astype(int)
    f_out_df['LEN'] = f_out_df['LEN'].astype(int)
    f_out_df['PRED_L'] = f_out_df['PRED_L'].astype(int)
    # f_out_df = f_out_df[(f_out_df['PRED_L'] == 1) | (f_out_df['PRED_L'] == 2)]
    out_cnv_fn = os.path.join(
        out_dir,
        'M{}_{}_{}_{}_out_cnv_{}-rbf_min5.csv'.format(sample_id, chr_id,
                                                      win_size, step_size,
                                                      out_type))
    if os.path.exists(out_cnv_fn):
        os.remove(out_cnv_fn)
    f_out_df.to_csv(out_cnv_fn, index=False, sep='\t')
    logger.info('Done, the results saved at {}'.format(out_cnv_fn))
def gen_neu_feats(sample_id, chr_id, rb_base_chr, rb_mappability_chr,
                  bam_obj_whole, gap_regions_chr, cnv_regions_chr,
                  out_chr_fname, min_reg_len=2000, n_regions=4):
    """Extract feature matrices for neutral (non-gap, non-CNV) regions.

    Positions inside assembly gaps and known CNVs are masked, the remaining
    runs of valid positions are segmented, and features for n_regions of
    the candidates are appended to *out_chr_fname*.

    :param sample_id: sample identifier (logging only).
    :param chr_id: chromosome id without the 'chr' prefix.
    :param rb_base_chr: reference base sequence for the chromosome.
    :param rb_mappability_chr: per-base mappability values.
    :param bam_obj_whole: open pysam AlignmentFile for pileup queries.
    :param gap_regions_chr: DataFrame with START/END rows, or None.
    :param cnv_regions_chr: DataFrame with POS/END rows, or None.
    :param out_chr_fname: output text file; feature blocks are appended.
    :param min_reg_len: minimum length for a candidate neutral region.
    :param n_regions: number of regions to extract features for.
    :return: status message string.
    """
    # find the regions from gap and/or cnvs to be excluded
    # gaps
    # np.int was removed in NumPy >= 1.24; plain int is equivalent here
    fil_pos = np.array([], dtype=int)
    # BUG FIX: `if gap_regions_chr:` raises ValueError for a DataFrame
    # ("truth value of a DataFrame is ambiguous"); test None/empty instead.
    if gap_regions_chr is not None and not gap_regions_chr.empty:
        for _, i_row in gap_regions_chr.iterrows():
            # END is excluded
            fil_pos = np.concatenate(
                (fil_pos, np.arange(i_row['START'], i_row['END'])))
    # cnv region
    # NEU features need to exclude the cnv region
    # but for prediction, cnv regions are not needed to be excluded.
    if cnv_regions_chr is not None and not cnv_regions_chr.empty:
        for _, i_row in cnv_regions_chr.iterrows():
            # VCF POS is started with 1, END is also excluded
            fil_pos = np.concatenate(
                (fil_pos, np.arange(i_row['POS'] - 1, i_row['END'] - 1)))
    rb_base_pos = np.ones(len(rb_base_chr), dtype=int)
    rb_base_pos[fil_pos] = 0
    logger.info(
        'finding the regions to be generated feature matrix, sample {} chr {}...'
        .format(sample_id, chr_id))
    seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
    assert len(seg_values) == len(seg_starts) == len(seg_lengths)
    # keep segments that are valid (value 1) AND at least min_reg_len long;
    # sorted() makes the candidate order deterministic (set order is not)
    fil_val_idx = np.where(seg_values == 1)[0]
    fil_len_idx = np.where(seg_lengths >= min_reg_len)[0]
    fil_idx = sorted(set(fil_val_idx).intersection(set(fil_len_idx)))
    t_seg_start = seg_starts[fil_idx]
    t_seg_len = seg_lengths[fil_idx]
    # NOTE(review): ascending argsort selects the n_regions SHORTEST
    # candidates -- confirm this is intended (the longest would need
    # np.argsort(...)[::-1][:n_regions]).
    frt_idxs = np.argsort(t_seg_len)[:n_regions]
    logger.info(
        'extracting features for neu region len={}, sample {} chr {}...'.
        format(t_seg_len[frt_idxs], sample_id, chr_id))
    for i_idx in frt_idxs:
        i_start = t_seg_start[i_idx]
        i_len = t_seg_len[i_idx]
        i_end = i_start + i_len
        i_rb_base = rb_base_chr[i_start:i_end]
        i_ref_map = rb_mappability_chr[i_start:i_end]
        i_pileup = bam_obj_whole.pileup('chr' + chr_id,
                                        start=i_start,
                                        stop=i_end,
                                        stepper='nofilter',
                                        min_base_quality=0,
                                        truncate=True)
        ref_rel_pos, f_mat = gen_feat_region(i_pileup, i_rb_base, i_ref_map,
                                             i_start, i_len)
        # header line followed by the feature matrix for this region
        with open(out_chr_fname, 'a') as f:
            f.write('#{},{},{},{},{},{},{}\n'.format(chr_id, i_start, i_end,
                                                     i_len, 'NEU', 10,
                                                     ref_rel_pos))
            np.savetxt(f, f_mat, fmt='%-10.5f')
        # release the (potentially large) matrix before the next region
        del f_mat
        del ref_rel_pos
    return 'Sample {} chr {}: neu features written to file'.format(
        sample_id, chr_id)
def cal_feat_segs(self, ref_fasta_fn, online_feat_segs_fn, win_size=1000,
                  n_features=13, min_r=0.1, stride_size=200):
    """Segment one chromosome into windows and compute per-window features.

    Gap segments are recorded in __chr_segs_unpredictable; valid segments
    are sliced into win_size windows and fed to __get_feats_region, and all
    arrays are saved to *online_feat_segs_fn* as a compressed .npz.

    :param ref_fasta_fn: path to the reference fasta file.
    :param online_feat_segs_fn: output .npz path for the segment arrays.
    :param win_size: sliding-window length in bp.
    :param n_features: number of feature channels per position.
    :param min_r: minimum ratio forwarded to __get_feats_region.
    :param stride_size: step between consecutive window starts.
    :raises FileNotFoundError: if the fasta file does not exist.
    """
    if not os.path.exists(ref_fasta_fn):
        raise FileNotFoundError(
            'Reference fasta file does not exist. {}'.format(ref_fasta_fn))
    logger.info('loading Reference fasta file...')
    ref_fa_obj = pysam.FastaFile(ref_fasta_fn)

    # reference base
    logger.info(
        'Loading reference sequence for sample {} chr: {}...'.format(
            self.sample_id, self.chr_id))
    rb_base_chr = ref_fa_obj.fetch('chr' + self.chr_id)
    chr_len = len(rb_base_chr)

    ref_gap_obj = load_gap()
    ref_gap_chr = ref_gap_obj.loc[
        ref_gap_obj['CHROM'] == 'chr' + self.chr_id,
        ['START', 'END']] if not ref_gap_obj.empty else None

    # np.int was removed in NumPy >= 1.24; plain int is equivalent here
    fil_pos = np.array([], dtype=int)
    if ref_gap_chr is not None:
        for _, i_row in ref_gap_chr.iterrows():
            # END is excluded
            fil_pos = np.concatenate(
                (fil_pos, np.arange(i_row['START'], i_row['END'])))
    rb_base_pos = np.ones(chr_len, dtype=int)
    rb_base_pos[fil_pos] = 0

    seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
    assert len(seg_values) == len(seg_starts) == len(seg_lengths)
    logger.info(
        'segmenting {} bp-long {} bp step feature maps sample {}, chr {}...'
        .format(win_size, stride_size, self.sample_id, self.chr_id))
    logger.info(
        '>>>>>>>>>>this processing will take a few minutes (almost 180 minutes for chr 1)...'
    )
    self.__chr_segs_unpredictable = np.empty((0, 4), dtype=int)
    self.__chr_segs_predictable = np.empty((0, 4), dtype=int)
    self.__chr_segs_predictable_feats = np.empty((0, win_size, n_features))
    for val_idx, i_val in enumerate(seg_values):
        if i_val == 0:
            # gap region: coordinates only, no feature matrix
            self.__chr_segs_unpredictable = np.append(
                self.__chr_segs_unpredictable,
                np.array([[
                    seg_starts[val_idx],
                    seg_starts[val_idx] + seg_lengths[val_idx],
                    seg_lengths[val_idx], 0
                ]], dtype=int),
                axis=0)
        else:
            i_seg_start = seg_starts[val_idx]
            i_seg_len = seg_lengths[val_idx]
            if i_seg_len >= win_size:
                i_start_indices, remain_len = seq_slide(
                    i_seg_len, win_size, stride_size)
                for i in i_start_indices:
                    i_w_start = int(i + i_seg_start)
                    i_w_end = int(i_w_start + win_size)
                    logger.info('processing at {}'.format(i_w_start))
                    self.__get_feats_region(i_w_start, i_w_end, win_size,
                                            min_r)
                if remain_len > 0:
                    # BUG FIX: the tail window must be anchored at the
                    # segment start. The old code used i_seg_len - win_size,
                    # which is only correct for a segment at position 0.
                    i_w_start = int(i_seg_start + i_seg_len - win_size)
                    i_w_end = int(i_seg_start + i_seg_len)
                    self.__get_feats_region(i_w_start, i_w_end, win_size,
                                            min_r)
            else:
                # short segment: one window covering the whole segment
                self.__get_feats_region(i_seg_start,
                                        i_seg_start + i_seg_len, win_size,
                                        min_r)
    logger.info(
        'saving segments of {} bp-long {} bp step feature maps... {}'.
        format(win_size, stride_size, online_feat_segs_fn))
    np.savez_compressed(
        online_feat_segs_fn,
        chr_segs_unpredictable=self.__chr_segs_unpredictable,
        chr_segs_predictable=self.__chr_segs_predictable,
        chr_segs_predictable_feats=self.__chr_segs_predictable_feats)
    logger.info(
        'Done, saving the result file at {}'.format(online_feat_segs_fn))
def cal_feat_segs(self, ref_fasta_fn, online_feat_segs_fn, n_features=13,
                  min_r=0.1, stride_size=200):
    """Vectorized window segmentation and feature extraction for one chromosome.

    Segments are partitioned into gaps, short valid segments, and segments
    long enough for at least one win_size window; the window coordinate
    list is built up-front and then fed to __get_feats_region.

    :param ref_fasta_fn: path to the reference fasta file.
    :param online_feat_segs_fn: output .npz path for the segment arrays.
    :param n_features: number of feature channels per position.
    :param min_r: minimum ratio forwarded to __get_feats_region.
    :param stride_size: step between consecutive window starts.
    :raises FileNotFoundError: if the fasta file does not exist.
    """
    if not os.path.exists(ref_fasta_fn):
        raise FileNotFoundError(
            'Reference fasta file does not exist. {}'.format(ref_fasta_fn))
    logger.info('loading Reference fasta file...')
    ref_fa_obj = pysam.FastaFile(ref_fasta_fn)

    # reference base
    logger.info(
        'Loading reference sequence for sample {} chr: {}...'.format(
            self.sample_id, self.chr_id))
    rb_base_chr = ref_fa_obj.fetch('chr' + self.chr_id)
    chr_len = len(rb_base_chr)

    ref_gap_obj = load_gap()
    ref_gap_chr = ref_gap_obj.loc[
        ref_gap_obj['CHROM'] == 'chr' + self.chr_id,
        ['START', 'END']] if not ref_gap_obj.empty else None

    # np.int was removed in NumPy >= 1.24; plain int is equivalent here
    fil_pos = np.array([], dtype=int)
    if ref_gap_chr is not None:
        for _, i_row in ref_gap_chr.iterrows():
            # END is excluded
            fil_pos = np.concatenate(
                (fil_pos, np.arange(i_row['START'], i_row['END'])))
    rb_base_pos = np.ones(chr_len, dtype=int)
    rb_base_pos[fil_pos] = 0

    seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
    assert len(seg_values) == len(seg_starts) == len(seg_lengths)

    self.__chr_segs_predictable = np.empty((0, 4), dtype=int)
    self.__chr_segs_predictable_feats = np.empty(
        (0, self.win_size, n_features))

    # partition the segments: gaps, short valid segments, and segments long
    # enough to hold at least one full window
    seg_gap_inds = np.where(seg_values == 0)[0]
    seg_val_len_less_inds = np.where((seg_values == 1)
                                     & (seg_lengths < self.win_size))[0]
    seg_val_normal_inds = np.where((seg_values == 1)
                                   & (seg_lengths >= self.win_size))[0]
    assert len(seg_val_len_less_inds) + len(seg_val_normal_inds) + len(
        seg_gap_inds) == len(seg_values)

    # BUG FIX: the old log call passed four arguments to a two-placeholder
    # format string, so win_size/stride were printed as sample/chr.
    logger.info('calculating gap segments for sample {}, chr {}...'.format(
        self.sample_id, self.chr_id))
    gap_rows = [[i_gap_start, i_gap_start + i_gap_lens, i_gap_lens, 0]
                for i_gap_start, i_gap_lens in zip(
                    seg_starts[seg_gap_inds], seg_lengths[seg_gap_inds])]
    # robustness: keep a (0, 4) shape even when the chromosome has no gaps
    # (np.array([]) would collapse to shape (0,))
    self.__chr_segs_unpredictable = (np.array(gap_rows, dtype=int)
                                     if gap_rows else np.empty((0, 4),
                                                               dtype=int))

    logger.info(
        'calculating {} bp-long {} bp step feature maps sample {}, chr {}...'
        .format(self.win_size, stride_size, self.sample_id, self.chr_id))
    # short valid segments become one window each, covering the segment
    val_seg_len_less_starts = seg_starts[seg_val_len_less_inds]
    val_seg_len_less_lens = seg_lengths[seg_val_len_less_inds]
    val_seg_len_less_end = val_seg_len_less_starts + val_seg_len_less_lens
    val_seg_poss_zip = list(
        zip(val_seg_len_less_starts, val_seg_len_less_end))

    # slice each long segment into win_size windows
    val_seg_normal_starts = seg_starts[seg_val_normal_inds]
    val_seg_normal_lens = seg_lengths[seg_val_normal_inds]
    # NOTE(review): seq_slide is unpacked here as a 3-tuple
    # (slice_starts, end_start, remain_len) while other call sites unpack
    # two values -- confirm which seq_slide version this module imports.
    val_normal_slice_starts = [
        seq_slide(i_seg_len, self.win_size, stride_size)
        for i_seg_len in val_seg_normal_lens
    ]
    assert len(val_seg_normal_starts) == len(val_normal_slice_starts)
    val_normal_slices = [
        (val_seg_normal_starts[i] + i_slice_start,
         val_seg_normal_starts[i] + i_slice_start + self.win_size,
         end_start, remain_len)
        for i, (i_slice_start, end_start,
                remain_len) in enumerate(val_normal_slice_starts)
    ]
    for i_seg_norm_starts, i_seg_norm_ends, end_start, remain_len in val_normal_slices:
        val_seg_poss_zip.extend(
            list(zip(i_seg_norm_starts, i_seg_norm_ends)))
        if remain_len > 0:
            # tail window shorter than win_size at the end of the segment
            val_seg_poss_zip.append((end_start, end_start + remain_len))

    logger.info(
        'saving segments of {} bp-long {} bp step feature maps... {}'.
        format(self.win_size, stride_size, online_feat_segs_fn))
    for i_w_start, i_w_end in val_seg_poss_zip:
        i_w_len = i_w_end - i_w_start
        self.__get_feats_region(i_w_start, i_w_end, i_w_len, min_r)
        if i_w_start < 55000:
            # NOTE(review): looks like a debugging leftover -- progress is
            # only logged for the first 55 kb of the chromosome.
            logger.info('process at {}'.format(i_w_start))
    np.savez_compressed(
        online_feat_segs_fn,
        chr_segs_unpredictable=self.__chr_segs_unpredictable,
        chr_segs_predictable=self.__chr_segs_predictable,
        chr_segs_predictable_feats=self.__chr_segs_predictable_feats)
    logger.info(
        'Done, saving the result file at {}'.format(online_feat_segs_fn))
def cal_feat_segs(self, sample_id, chr_id, win_size=1000, min_r=0.1,
                  stride_size=200, online_feat_segs_fn=None, n_proc=16):
    """Calculate feature map segments for the given window size.

    Builds the whole-chromosome feature matrix once, then slices it into
    win_size windows per valid segment; gap segments are recorded without
    features. Results are accumulated on self and optionally saved.

    :param sample_id: sample identifier used for logging.
    :param chr_id: chromosome id without the 'chr' prefix.
    :param win_size: sliding-window length in bp.
    :param min_r: minimum ratio forwarded to __get_feats_region.
    :param stride_size: step between consecutive window starts.
    :param online_feat_segs_fn: optional .npz output path; if given, the
        segment arrays are saved to it.
    :param n_proc: number of processors (currently unused in this body).
    """
    self.sample_id = sample_id
    self.chr_id = chr_id
    # assure the feature segmentation list is empty before adding the feature maps
    self.chr_segs_unpredictable = []
    self.chr_segs_predictable = []
    self.chr_segs_predictable_feats = []

    ref_gap_chr = self.ref_gap_obj.loc[
        self.ref_gap_obj['CHROM'] == 'chr' + chr_id,
        ['START', 'END']] if not self.ref_gap_obj.empty else None

    # reference base
    logger.info('Loading reference sequence for sample {} chr: {}...'.format(
        sample_id, chr_id))
    self.rb_base_chr = self.ref_fa_obj.fetch('chr' + chr_id)
    chr_len = len(self.rb_base_chr)

    # reference mappabillity
    logger.info(
        'Loading reference mappability for sample {} chr: {}...'.format(
            sample_id, chr_id))
    self.rb_mappability_chr = self.ref_bw_obj.values('chr' + chr_id, 0,
                                                     chr_len)

    # np.int was removed in NumPy >= 1.24; plain int is equivalent here
    fil_pos = np.array([], dtype=int)
    if ref_gap_chr is not None:
        for _, i_row in ref_gap_chr.iterrows():
            # END is excluded
            fil_pos = np.concatenate(
                (fil_pos, np.arange(i_row['START'], i_row['END'])))
    rb_base_pos = np.ones(chr_len, dtype=int)
    rb_base_pos[fil_pos] = 0

    # call pysam pileup() to read the whole chr
    logger.info('loading whole bam pileup for sample {} chr: {}...'.format(
        sample_id, chr_id))
    assert self.bam_obj_whole is not None
    whole_chr_pileup = self.bam_obj_whole.pileup('chr' + chr_id,
                                                 stepper='nofilter',
                                                 min_base_quality=0)

    # get the feature map for the whole chr
    # take almost 25 minutes for chr1
    logger.info(
        'calculating feature maps for whole chromosome, sample {}, chr {}...'
        .format(sample_id, chr_id))
    logger.info(
        '>>>>>>>>>>this processing will take a few minutes (almost 25 minutes for chr 1)...'
    )
    self.chr_feat_mat_whole = gen_feat_whole_chr(whole_chr_pileup,
                                                 self.rb_base_chr, chr_len,
                                                 self.rb_mappability_chr)
    # release the large per-base inputs as soon as the matrix is built
    del whole_chr_pileup
    del self.rb_base_chr
    del self.rb_mappability_chr

    seg_values, seg_starts, seg_lengths = find_seg(rb_base_pos)
    assert len(seg_values) == len(seg_starts) == len(seg_lengths)
    logger.info(
        'segmenting {} bp-long {} bp step feature maps sample {}, chr {}...'
        .format(win_size, stride_size, sample_id, chr_id))
    for val_idx, i_val in enumerate(seg_values):
        if i_val == 0:
            # gap region: coordinates only, no feature matrix
            self.chr_segs_unpredictable.append(
                np.array([
                    seg_starts[val_idx],
                    seg_starts[val_idx] + seg_lengths[val_idx],
                    seg_lengths[val_idx], 0
                ]))
        else:
            i_seg_start = seg_starts[val_idx]
            i_seg_len = seg_lengths[val_idx]
            if i_seg_len >= win_size:
                i_start_indices, remain_len = seq_slide(
                    i_seg_len, win_size, stride_size)
                for i in i_start_indices:
                    i_w_start = int(i + i_seg_start)
                    i_w_end = int(i_w_start + win_size)
                    self.__get_feats_region(i_w_start, i_w_end, win_size,
                                            min_r)
                if remain_len > 0:
                    # BUG FIX: the tail window must be anchored at the
                    # segment start. The old code used i_seg_len - win_size,
                    # which is only correct for a segment at position 0.
                    i_w_start = int(i_seg_start + i_seg_len - win_size)
                    i_w_end = int(i_seg_start + i_seg_len)
                    self.__get_feats_region(i_w_start, i_w_end, win_size,
                                            min_r)
            else:
                # short segment: one window covering the whole segment
                self.__get_feats_region(i_seg_start,
                                        i_seg_start + i_seg_len, win_size,
                                        min_r)
    # the whole-chromosome matrix is large; free it before stacking results
    del self.chr_feat_mat_whole
    gc.collect()
    self.chr_segs_unpredictable = np.vstack(self.chr_segs_unpredictable)
    self.chr_segs_predictable = np.vstack(self.chr_segs_predictable)
    self.chr_segs_predictable_feats = np.array(
        self.chr_segs_predictable_feats)
    if online_feat_segs_fn:
        logger.info(
            'saving segments of {} bp-long {} bp step feature maps... {}'.
            format(win_size, stride_size, online_feat_segs_fn))
        np.savez_compressed(
            online_feat_segs_fn,
            chr_segs_unpredictable=self.chr_segs_unpredictable,
            chr_segs_predictable=self.chr_segs_predictable,
            chr_segs_predictable_feats=self.chr_segs_predictable_feats)