def next_seqs(self):
    """Construct the array of augmented sequences for this stream chunk.

    Draws up to ``self.stream_size`` sequences from ``self.seqs_gen`` and
    expands each one with every shift in ``self.shifts``. When ``self.rc``
    is set, the reverse complement of each shifted copy is added directly
    after it.

    Returns:
        A float32 numpy array stacking the augmented sequences, ordered by
        sequence, then shift, then (optionally) reverse complement. If the
        generator yields nothing, the array is empty.
    """
    # extract next sequences from generator
    seqs_1hot = []
    stream_end = self.stream_start + self.stream_size
    for _ in range(self.stream_start, stream_end):
        try:
            seqs_1hot.append(next(self.seqs_gen))
        except StopIteration:
            # The source is exhausted; every further draw in this chunk
            # would only raise again, so stop polling it.
            break

    # build the ensemble of shifted (and reverse-complemented) copies
    seqs_1hot_ens = []
    for seq_1hot in seqs_1hot:
        for shift in self.shifts:
            seq_1hot_aug = dna_io.hot1_augment(seq_1hot, shift=shift)
            seqs_1hot_ens.append(seq_1hot_aug)
            if self.rc:
                # reverse complement of the already-shifted sequence
                seqs_1hot_ens.append(dna_io.hot1_rc(seq_1hot_aug))

    return np.array(seqs_1hot_ens, dtype='float32')
def main():
    """Regress PhyloP conservation on ISM nucleotide scores.

    Command line: ``[options] <scores_file>``, where ``scores_file`` is an
    HDF5 file read for the datasets ``chr``, ``start``, ``end``, ``strand``,
    ``seqs`` and the score statistic named by ``--stat``.

    Steps:
      1. Read the ISM scores and derive the reference-nucleotide scores and
         their min/max-shifted variant.
      2. Read PhyloP values (bigWig) and DNA (FASTA) for each scored region,
         dropping regions the bigWig cannot serve.
      3. Check that the FASTA sequence matches the scored sequence.
      4. Cross-validate a random forest (``randfor_cv``) predicting PhyloP
         from the shifted scores plus the position index.

    Outputs, written into ``options.out_dir``: ``r2.npy``, ``pcor.npy`` and
    ``stats.txt`` (mean and standard error of each metric).

    Raises:
        ValueError: if a scored sequence disagrees with the FASTA sequence
            fetched for the same coordinates.
        KeyError: if the environment variable named after the upper-cased
            genome is not set.
    """
    # local import keeps the file's import block untouched
    from contextlib import closing

    usage = 'usage: %prog [options] <scores_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='n_components',
                      default=None, type='int',
                      help='PCA n_components [Default: %default]')
    parser.add_option('-e', dest='num_estimators',
                      default=100, type='int',
                      help='Number of random forest estimators [Default: %default]')
    parser.add_option('-g', dest='genome',
                      default='ce11',
                      help='PhyloP and FASTA genome [Default: %default]')
    parser.add_option('-i', dest='iterations',
                      default=1, type='int',
                      help='Cross-validation iterations [Default: %default]')
    parser.add_option('-o', dest='out_dir',
                      default='regr_out')
    parser.add_option('-p', dest='parallel_threads',
                      default=1, type='int',
                      help='Parallel threads passed to scikit-learn n_jobs [Default: %default]')
    parser.add_option('-r', dest='random_seed',
                      default=44, type='int')
    parser.add_option('--stat', dest='sad_stat',
                      default='sum',
                      help='HDF5 key stat to consider. [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error prints the usage and exits; only the scores file is
        # a positional argument (the PhyloP bigWig is located via -g).
        parser.error('Must provide ISM scores file.')
    scores_file = args[0]

    np.random.seed(options.random_seed)
    options.genome = options.genome.lower()

    # tolerate an existing directory and create missing parents
    os.makedirs(options.out_dir, exist_ok=True)

    ################################################################
    # read ISM scores

    with h5py.File(scores_file, 'r') as h5o:
        score_chrs = [chrm.decode('UTF-8') for chrm in h5o['chr']]
        score_starts = h5o['start'][:]
        score_ends = h5o['end'][:]
        score_strands = [strand.decode('UTF-8') for strand in h5o['strand']]
        score_seqs = h5o['seqs'][:]
        nt_scores = h5o[options.sad_stat][:].astype('float16')
    num_seqs, mut_len, _, num_targets = nt_scores.shape

    # reference transform: keep only the scores selected by the sequence
    # mask (presumably the one-hot reference nucleotide at each position)
    nt_scores_ref = np.reshape(nt_scores[score_seqs],
                               (num_seqs, mut_len, num_targets))

    # min/max transform: shift positive reference scores by the minimum
    # over axis -2, and the remaining ones by the maximum over that axis
    nt_scores_min = nt_scores.min(axis=-2)
    nt_scores_max = nt_scores.max(axis=-2)
    pos_mask = (nt_scores_ref > 0)
    nt_scores_refm = nt_scores_ref.copy()
    nt_scores_refm[pos_mask] -= nt_scores_min[pos_mask]
    nt_scores_refm[~pos_mask] -= nt_scores_max[~pos_mask]

    ################################################################
    # read phylop bigwig annotations

    # genome resources are located through an environment variable named
    # after the upper-cased genome (e.g. CE11)
    genome_path = os.environ[options.genome.upper()]
    fasta_file = '%s/assembly/%s.fa' % (genome_path, options.genome)
    if options.genome == 'ce11':
        phylop_file = '%s/phylop/ce11.phyloP26way.bw' % genome_path
    else:
        print('Genome PhyloP not found', file=sys.stderr)
        sys.exit(1)

    seqs_phylop = []
    seqs_phylop_dna1 = []
    seqs_phylop_mask = np.ones(num_seqs, dtype='bool')

    # closing() guarantees both handles are released, even on error
    with closing(pysam.FastaFile(fasta_file)) as fasta_open, \
            closing(pyBigWig.open(phylop_file, 'r')) as phylop_open:
        for si in range(num_seqs):
            phylop_chr = score_chrs[si]
            if not phylop_chr.startswith('chr'):
                phylop_chr = 'chr%s' % phylop_chr

            try:
                # read values
                seq_phylop = phylop_open.values(
                    phylop_chr, score_starts[si], score_ends[si],
                    numpy=True).astype('float16')

                # read DNA (FASTA is queried with the unmodified name)
                seq_phylop_dna = fasta_open.fetch(
                    score_chrs[si], score_starts[si], score_ends[si])
                seq_phylop_dna1 = dna_io.dna_1hot(seq_phylop_dna)

                # reverse complement
                if score_strands[si] == '-':
                    seq_phylop = seq_phylop[::-1]
                    seq_phylop_dna1 = dna_io.hot1_rc(seq_phylop_dna1)

                # save
                seqs_phylop.append(seq_phylop)
                seqs_phylop_dna1.append(seq_phylop_dna1)

            except RuntimeError:
                print('Ignoring %s:%d-%d; phylop not found.' %
                      (phylop_chr, score_starts[si], score_ends[si]),
                      file=sys.stderr)
                seqs_phylop_mask[si] = False

    # filter for valid sequences
    nt_scores = nt_scores[seqs_phylop_mask]
    nt_scores_ref = nt_scores_ref[seqs_phylop_mask]
    nt_scores_refm = nt_scores_refm[seqs_phylop_mask]
    score_seqs = score_seqs[seqs_phylop_mask]
    num_seqs = len(score_seqs)

    # transform PhyloP
    seqs_phylop = np.array(seqs_phylop, dtype='float32')
    seqs_phylop = np.nan_to_num(seqs_phylop)
    seqs_phylop = np.clip(seqs_phylop, -1.5, 5)

    # verify DNA: the scored sequence must match the FASTA sequence
    seqs_phylop_dna1 = np.array(seqs_phylop_dna1)
    valid_indexes = np.flatnonzero(seqs_phylop_mask)
    for si in range(num_seqs):
        seq_diff = np.logical_xor(score_seqs[si], seqs_phylop_dna1[si])
        # a substituted nucleotide flips two one-hot entries
        nts_diff = seq_diff.sum() // 2
        if nts_diff != 0:
            # map back to the unfiltered index to report coordinates
            oi = valid_indexes[si]
            raise ValueError(
                'ISM sequence %s:%d-%d differs from the FASTA at %d nucleotides.' %
                (score_chrs[oi], score_starts[oi], score_ends[oi], nts_diff))

    ################################################################
    # regression

    # add positions: one extra feature holding the index along mut_len
    seqs_pos = np.arange(mut_len)
    seqs_pos = np.tile(seqs_pos, num_seqs)
    seqs_pos = np.reshape(seqs_pos, (num_seqs, -1, 1))

    # NOTE: options.n_components (-d) is parsed but not applied here; the
    # features are the per-target scores plus position, without PCA.
    X = np.concatenate([nt_scores_refm, seqs_pos], axis=-1)
    X = X.astype('float32')

    # regressor
    r2s, pcors = randfor_cv(X, seqs_phylop,
                            iterations=options.iterations,
                            n_estimators=options.num_estimators,
                            random_state=options.random_seed,
                            n_jobs=options.parallel_threads)

    # save
    np.save('%s/r2.npy' % options.out_dir, r2s)
    np.save('%s/pcor.npy' % options.out_dir, pcors)

    # print stats: mean and standard error over the returned scores
    iterations = len(r2s)
    with open('%s/stats.txt' % options.out_dir, 'w') as stats_out:
        print('R2 %.4f (%.4f)' %
              (r2s.mean(), r2s.std() / np.sqrt(iterations)), file=stats_out)
        print('pR %.4f (%.4f)' %
              (pcors.mean(), pcors.std() / np.sqrt(iterations)), file=stats_out)