def get_1hots(self, genome_open):
    """Return 1-hot encodings for this window: the reference sequence
    first, then one alternative sequence per SNP (first alt allele only).

    Args:
      genome_open: open pysam FASTA-like object with a fetch() method.

    Returns:
      list of 1-hot arrays, [reference, alt_1, alt_2, ...].
    """
    # fetch reference DNA, left-padding with N's when the window
    # hangs off the chromosome start
    if self.start < 0:
        ref_seq = 'N'*(-self.start) + genome_open.fetch(self.chr, 0, self.end).upper()
    else:
        ref_seq = genome_open.fetch(self.chr, self.start, self.end).upper()

    # right-pad with N's to reach the full window length
    missing = (self.end - self.start) - len(ref_seq)
    if missing > 0:
        ref_seq += 'N'*missing

    # confirm each SNP's stated reference allele matches the genome
    for snp in self.snps:
        genome_allele = ref_seq[snp.seq_pos:snp.seq_pos+len(snp.ref_allele)]
        if snp.ref_allele != genome_allele:
            print('ERROR: %s does not match reference %s' % (snp, genome_allele), file=sys.stderr)
            exit(1)

    # 1-hot code the reference
    ref_1hot = dna_io.dna_1hot(ref_seq)

    # build one alternative sequence per SNP
    # (assuming SNP is 1-based indexed)
    alt_1hots = [
        make_alt_1hot(ref_1hot, snp.seq_pos, snp.ref_allele, snp.alt_alleles[0])
        for snp in self.snps
    ]

    return [ref_1hot] + alt_1hots
def run(self):
    """Worker loop: consume (seq_dna, seq_preds, si) tuples from self.queue,
    reduce saturation-mutagenesis predictions to per-position nucleotide
    scores, and write them into the shared HDF5 file self.scores_h5.

    Runs forever; assumes the owner treats this as a daemon worker and
    joins via queue.task_done()/queue.join().
    """
    while True:
        try:
            # unload predictions
            seq_dna, seq_preds, si = self.queue.get()
            print('Writing %d' % si, flush=True)

            # seq_preds is (1 + 3*mut_len) x (target_len) x (num_targets)
            seq_preds = np.array(seq_preds)
            num_preds = seq_preds.shape[0]
            num_targets = seq_preds.shape[-1]

            # reverse engineer mutagenesis position parameters
            # (3 mutations per position, plus 1 leading reference prediction)
            mut_len = (num_preds - 1) // 3
            mut_mid = len(seq_dna) // 2
            mut_start = mut_mid - mut_len // 2
            mut_end = mut_start + mut_len

            # one hot code mutagenized DNA
            seq_dna_mut = seq_dna[mut_start:mut_end]
            seq_1hot_mut = dna_io.dna_1hot(seq_dna_mut)

            # initialize scores
            seq_scores = np.zeros((mut_len, 4, num_targets), dtype='float32')

            # sum across length
            seq_preds_sum = seq_preds.sum(axis=1, dtype='float32')

            # predictions index (starting at first mutagenesis)
            pi = 1

            # for each mutated position
            for mi in range(mut_len):
                # for each nucleotide
                for ni in range(4):
                    if seq_1hot_mut[mi, ni]:
                        # reference score
                        seq_scores[mi, ni, :] = seq_preds_sum[0, :]
                    else:
                        # mutation score
                        seq_scores[mi, ni, :] = seq_preds_sum[pi, :]
                        pi += 1

            # normalize positions: subtract the per-position mean over nucleotides
            seq_scores -= seq_scores.mean(axis=1, keepdims=True)

            # write to HDF5
            self.scores_h5['scores'][si, :, :, :] = seq_scores.astype(
                'float16')
            self.scores_h5['seqs'][si, :, :] = seq_1hot_mut

        except:
            # communicate error
            # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit,
            # and 'si' is unbound here if queue.get() itself raised — presumably a
            # deliberate best-effort loop, but worth confirming.
            print('ERROR: Sequence %d failed' % si, file=sys.stderr, flush=True)

        # communicate finished task
        self.queue.task_done()
def fasta_pwm(motif_fasta_file):
    """Compute a position weight matrix from aligned FASTA sequences.

    Args:
      motif_fasta_file: FASTA file of equal-length motif sequences.

    Returns:
      pwm: (length x 4) probability matrix with +1 pseudocounts.
      effective count: number of sequences plus the 4 pseudocounts.
    """
    # collect sequence lines, skipping FASTA headers
    seqs_dna = []
    for line in open(motif_fasta_file):
        if line[0] != '>':
            seqs_dna.append(line.rstrip())

    # 1-hot encode
    seqs_1hot = np.array(
        [dna_io.dna_1hot(sd, n_uniform=True) for sd in seqs_dna])
    num_seqs = seqs_1hot.shape[0]

    # per-position nucleotide counts, with a +1 pseudocount for each
    # of the 4 nucleotides in the denominator
    pwm = (seqs_1hot.sum(axis=0) + 1) / (num_seqs + 4)

    return pwm, num_seqs + 4
def end_align_fasta(seqlets_fasta, msa_fasta, shift=1, pwm_iter=1, epochs=2): """Align seqlets in a FASTA file, allowing only end gaps.""" # read seqlet DNA seqs_1hot = [] for line in open(seqlets_fasta): if line[0] != '>': seqlet_dna = line.rstrip() seqlet_1hot = dna_io.dna_1hot(seqlet_dna) seqs_1hot.append(seqlet_1hot) seqs_1hot = np.array(seqs_1hot) num_seqs, width, depth = seqs_1hot.shape # extend with blanks for shifts gaps = 2 * shift gap_col = np.full((num_seqs, shift, depth), np.nan) msa_1hot = np.concatenate([gap_col, seqs_1hot, gap_col], axis=1) for ei in range(epochs): for si in range(num_seqs): if si % pwm_iter == 0: # pwm = (msa_1hot[:,gaps:-gaps,:] + .1).mean(axis=0) pwm = msa_1hot[:, gaps:-gaps, :].sum(axis=0) + 0.1 pwm /= num_seqs # extract sequence seq_1hot = seqs_1hot[si] # score gap positions gap_scores = [] for gi in range(gaps + 1): g1 = gaps - gi g2 = g1 + pwm.shape[0] gap_1hot = seq_1hot[g1:g2] gscore = np.log(pwm[gap_1hot]).sum() gap_scores.append(gscore) # find best gi = np.argmax(gap_scores) gj = width + gaps - (gaps - gi) # set msa msa_1hot[si] = np.nan msa_1hot[si, gi:gj, :] = seq_1hot # write to FASTA write_msa(msa_1hot, msa_fasta) return msa_1hot
def seqs_gen():
    """Yield {'sequence': 1-hot} dicts: each reference sequence followed by
    every single-nucleotide substitution within [mut_start, mut_end)."""
    for seq_dna in seqs_dna:
        # reference sequence first
        seq_1hot = dna_io.dna_1hot(seq_dna)
        yield {'sequence': seq_1hot}

        # then every non-reference substitution at each mutated position
        for pos in range(mut_start, mut_end):
            for nt in range(4):
                if seq_1hot[pos, nt] == 0:
                    # copy and install the substitution
                    mut_1hot = np.copy(seq_1hot)
                    mut_1hot[pos, :] = 0
                    mut_1hot[pos, nt] = 1
                    yield {'sequence': mut_1hot}
def satmut_gen(seqs_dna, mut_start, mut_end):
    """Construct generator for 1 hot encoded saturation mutagenesis DNA sequences.

    Args:
      seqs_dna: iterable of DNA strings
      mut_start: first position (inclusive) to mutate
      mut_end: last position (exclusive) to mutate

    Yields:
      the 1-hot reference sequence, then each single-nucleotide mutant.
    """
    for seq_dna in seqs_dna:
        # yield the unmutated reference first
        ref_1hot = dna_io.dna_1hot(seq_dna)
        yield ref_1hot

        # then each single-nucleotide substitution in the mutation window
        for pos in range(mut_start, mut_end):
            for nt in range(4):
                # skip the reference nucleotide
                if ref_1hot[pos, nt] == 0:
                    mut_1hot = np.copy(ref_1hot)
                    mut_1hot[pos, :] = 0
                    mut_1hot[pos, nt] = 1
                    yield mut_1hot
def segments_1hot(fasta_file, segments, seq_length, stride):
    """ Read and 1-hot code sequences in their segment batches.

    Args
     fasta_file: FASTA genome
     segments: list of (chrom,start,end) genomic segments to read
     seq_length: sequence length to break them into
     stride: distance to advance each sequence

    Returns:
     seqs_1hot: array of 1-hot coded sequences
     seqs_segments: list of (chrom,start,end) sequence segments
    """
    # open fasta
    fasta = pysam.Fastafile(fasta_file)

    # initialize 1-hot coding list
    seqs_1hot = []

    # segment corresponding to each sequence
    seqs_segments = []

    for chrom, seg_start, seg_end in segments:
        # read sequence
        seg_seq = fasta.fetch(chrom, seg_start, seg_end)

        # break up into batchable sequences (as above in bigwig_batch)
        bstart = 0
        bend = bstart + seq_length
        # <= so a segment exactly seq_length long still yields a sequence;
        # the previous strict < dropped the final full-length window
        while bend <= len(seg_seq):
            # append
            seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))
            seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))

            # update
            bstart += stride
            bend += stride

    # release the FASTA file handle (was previously leaked)
    fasta.close()

    return np.array(seqs_1hot), seqs_segments
def main():
    """Collect gene TSS's from a GTF, merge nearby TSS's into fixed-length
    genomic windows, optionally extract coverage targets per TSS from
    BigWig/w5 files, and write sequences, coordinates, TSS annotations,
    and targets to an HDF5 file.
    """
    usage = "usage: %prog [options] <fasta_file> <gtf_file> <hdf5_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-g",
        dest="genome_file",
        default=None,
        help="Chromosome lengths file [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="seq_length",
        default=1024,
        type="int",
        help="Sequence length [Default: %default]",
    )
    parser.add_option(
        "-c",
        dest="center_t",
        default=0.333,
        type="float",
        help="Center proportion in which TSSs are required to be [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=1,
        type="int",
        help="Number parallel processes to load data [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="target_wigs_file",
        default=None,
        help="Store target values, extracted from this list of WIG files",
    )
    parser.add_option(
        "-w",
        dest="pool_width",
        type="int",
        default=1,
        help="Average pooling width [Default: %default]",
    )
    parser.add_option(
        "--w5",
        dest="w5",
        default=False,
        action="store_true",
        help="Coverage files are w5 rather than BigWig [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Must provide genes as GTF, genome FASTA, and output HDF5")
    else:
        fasta_file = args[0]
        gtf_file = args[1]
        hdf5_file = args[2]

    if options.target_wigs_file is not None:
        check_wigs(options.target_wigs_file)

    ################################################################
    # organize TSS's by chromosome

    # read transcripts
    transcripts = gff.read_genes(gtf_file, key_id="transcript_id")

    # read transcript --> gene mapping
    transcript_genes = gff.t2g(gtf_file, feature="exon")

    # make gene --> strand mapping
    gene_strand = {}
    for tx_id in transcripts:
        gene_strand[transcript_genes[tx_id]] = transcripts[tx_id].strand

    # cluster TSSs by gene
    gene_tss = cluster_tss(transcript_genes, transcripts, options.pool_width / 2)

    # hash TSS's by chromosome
    gene_chrom = {}
    for tx_id in transcripts:
        gene_id = transcript_genes[tx_id]
        gene_chrom[gene_id] = transcripts[tx_id].chrom

    chrom_tss = {}
    for gene_id in gene_tss:
        for tss_pos in gene_tss[gene_id]:
            chrom_tss.setdefault(gene_chrom[gene_id], []).append((tss_pos, gene_id))

    # sort TSS's by chromosome
    for chrom in chrom_tss:
        chrom_tss[chrom].sort()

    ################################################################
    # determine segments / map transcripts

    # open fasta (to verify chromosome presence)
    fasta = pysam.Fastafile(fasta_file)

    chrom_sizes = OrderedDict()
    for line in open(options.genome_file):
        a = line.split()
        if a[0] in fasta.references:
            chrom_sizes[a[0]] = int(a[1])
        elif a[0] in chrom_tss:
            # drop TSS's on chromosomes absent from the FASTA
            print("FASTA missing chromosome - %s" % a[0], file=sys.stderr)
            del chrom_tss[a[0]]

    # TSS's closer than this are merged into one sequence window
    merge_distance = options.center_t * options.seq_length

    seq_coords = []
    tss_list = []

    # ordering by options.genome_file allows for easier
    # bigwig output in downstream scripts.
    for chrom in chrom_sizes:
        ctss = chrom_tss.get(chrom, [])

        left_i = 0
        while left_i < len(ctss):
            # left TSS
            left_tss = ctss[left_i][0]

            # right TSS: extend the cluster while TSS's stay within merge_distance
            right_i = left_i
            while (
                right_i + 1 < len(ctss)
                and ctss[right_i + 1][0] - left_tss < merge_distance
            ):
                right_i += 1
            right_tss = ctss[right_i][0]

            # determine segment midpoint
            seg_mid = (left_tss + right_tss) // 2

            # extend to the full sequence length
            seg_start = seg_mid - options.seq_length // 2
            seg_end = seg_start + options.seq_length

            # rescue windows hanging off the chromosome ends
            if seg_start < 0 or seg_end >= chrom_sizes[chrom]:
                if chrom_sizes[chrom] == options.seq_length:
                    seg_start = 0
                    seg_end = options.seq_length
                elif chrom_sizes[chrom] > options.seq_length:
                    # also rescuable but not important right now
                    pass

            # save segment (off-end windows not rescued above are dropped,
            # along with their TSS's)
            if seg_start >= 0 and seg_end <= chrom_sizes[chrom]:
                seq_coords.append((chrom, seg_start, seg_end))

                # annotate TSS to indexes
                seq_index = len(seq_coords) - 1
                for i in range(left_i, right_i + 1):
                    tss_pos, gene_id = ctss[i]
                    tss = gene.TSS(
                        "TSS%d" % len(tss_list),
                        gene_id,
                        chrom,
                        tss_pos,
                        seq_index,
                        True,
                        gene_strand[gene_id],
                    )
                    tss_list.append(tss)

            # update
            left_i = right_i + 1

    ################################################################
    # extract target values

    if options.target_wigs_file:
        # NOTE(review): t0 is assigned but never used (presumably leftover timing)
        t0 = time.time()

        # get wig files and labels
        target_wigs_df = pd.read_table(options.target_wigs_file, index_col=0)
        target_wigs = OrderedDict()
        target_labels = []
        for i in range(target_wigs_df.shape[0]):
            target_wig_series = target_wigs_df.iloc[i]
            target_wigs[target_wig_series.identifier] = target_wig_series.file
            target_labels.append(target_wig_series.description)

        # initialize multiprocessing pool
        pool = multiprocessing.Pool(options.processes)

        # bigwig_read parameters
        bwt_params = [
            (wig_file, tss_list, seq_coords, options.pool_width)
            for wig_file in target_wigs.values()
        ]

        # pull the target values in parallel
        if options.w5:
            tss_targets = pool.starmap(wig5_tss_targets, bwt_params)
        else:
            tss_targets = pool.starmap(bigwig_tss_targets, bwt_params)

        # convert to array, transposed to (num_tss x num_targets)
        tss_targets = np.transpose(np.array(tss_targets))

    ################################################################
    # extract sequences

    seqs_1hot = []
    for chrom, start, end in seq_coords:
        seq = fasta.fetch(chrom, start, end)
        seqs_1hot.append(dna_io.dna_1hot(seq))
    seqs_1hot = np.array(seqs_1hot)
    fasta.close()

    ################################################################
    # save to HDF5

    # write to HDF5
    hdf5_out = h5py.File(hdf5_file, "w")

    # store pooling
    hdf5_out.create_dataset("pool_width", data=options.pool_width, dtype="int")

    # store gene sequences
    hdf5_out.create_dataset("seqs_1hot", data=seqs_1hot, dtype="bool")

    # store genesequence coordinates
    seq_chrom = np.array([sc[0] for sc in seq_coords], dtype="S")
    seq_start = np.array([sc[1] for sc in seq_coords])
    seq_end = np.array([sc[2] for sc in seq_coords])
    hdf5_out.create_dataset("seq_chrom", data=seq_chrom)
    hdf5_out.create_dataset("seq_start", data=seq_start)
    hdf5_out.create_dataset("seq_end", data=seq_end)

    # store TSSs
    tss_id = np.array([tss.identifier for tss in tss_list], dtype="S")
    tss_gene = np.array([tss.gene_id for tss in tss_list], dtype="S")
    tss_chrom = np.array([tss.chrom for tss in tss_list], dtype="S")
    tss_pos = np.array([tss.pos for tss in tss_list])
    tss_seq = np.array([tss.gene_seq for tss in tss_list])
    tss_strand = np.array([tss.strand for tss in tss_list], dtype="S")
    hdf5_out.create_dataset("tss_id", data=tss_id)
    hdf5_out.create_dataset("tss_gene", data=tss_gene)
    hdf5_out.create_dataset("tss_chrom", data=tss_chrom)
    hdf5_out.create_dataset("tss_pos", data=tss_pos)
    hdf5_out.create_dataset("tss_seq", data=tss_seq)
    hdf5_out.create_dataset("tss_strand", data=tss_strand)

    # store targets
    if options.target_wigs_file:
        # ids
        target_ids = np.array([tl for tl in target_wigs.keys()], dtype="S")
        hdf5_out.create_dataset("target_ids", data=target_ids)

        # labels
        target_labels = np.array(target_labels, dtype="S")
        hdf5_out.create_dataset("target_labels", data=target_labels)

        # values
        hdf5_out.create_dataset("tss_targets", data=tss_targets, dtype="float16")

    hdf5_out.close()
def seqs_gen():
    """Generate 1-hot encodings of the model sequences, one at a time."""
    yield from (dna_io.dna_1hot(seq_dna) for seq_dna in model_seqs_dna)
def main():
    """Regress PhyloP conservation on ISM nucleotide scores.

    Reads ISM scores from an HDF5 file, pairs each sequence with PhyloP
    values from a genome BigWig, verifies the DNA matches, then runs a
    cross-validated random forest regression and writes R2/Pearson stats.
    """
    usage = 'usage: %prog [options] <scores_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='n_components', default=None, type='int',
                      help='PCA n_components [Default: %default]')
    parser.add_option('-e', dest='num_estimators', default=100, type='int',
                      help='Number of random forest estimators [Default: %default]')
    parser.add_option('-g', dest='genome', default='ce11',
                      help='PhyloP and FASTA genome [Default: %default]')
    parser.add_option('-i', dest='iterations', default=1, type='int',
                      help='Cross-validation iterations [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='regr_out')
    parser.add_option('-p', dest='parallel_threads', default=1, type='int',
                      help='Parallel threads passed to scikit-learn n_jobs [Default: %default]')
    parser.add_option('-r', dest='random_seed', default=44, type='int')
    parser.add_option('--stat', dest='sad_stat', default='sum',
                      help='HDF5 key stat to consider. [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide ISM scores and PhyloP bigwig file.')
    else:
        scores_file = args[0]

    np.random.seed(options.random_seed)
    options.genome = options.genome.lower()

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    ################################################################
    # read ISM scores

    with h5py.File(scores_file, 'r') as h5o:
        score_chrs = [chrm.decode('UTF-8') for chrm in h5o['chr']]
        score_starts = h5o['start'][:]
        score_ends = h5o['end'][:]
        score_strands = [strand.decode('UTF-8') for strand in h5o['strand']]
        score_seqs = h5o['seqs'][:]
        nt_scores = h5o[options.sad_stat][:].astype('float16')
    num_seqs, mut_len, _, num_targets = nt_scores.shape

    # reference transform: boolean 1-hot mask selects the reference
    # nucleotide's score at each position
    nt_scores_ref = np.reshape(nt_scores[score_seqs],
                               (num_seqs, mut_len, num_targets))

    # min/max transform: re-baseline reference scores against the
    # per-position extreme over nucleotides
    nt_scores_min = nt_scores.min(axis=-2)
    nt_scores_max = nt_scores.max(axis=-2)
    pos_mask = (nt_scores_ref > 0)
    nt_scores_refm = nt_scores_ref.copy()
    nt_scores_refm[pos_mask] -= nt_scores_min[pos_mask]
    nt_scores_refm[~pos_mask] -= nt_scores_max[~pos_mask]

    ################################################################
    # read phylop bigwig annotations

    # genome path comes from an environment variable named after the genome
    genome_path = os.environ[options.genome.upper()]
    fasta_file = '%s/assembly/%s.fa' % (genome_path, options.genome)
    if options.genome == 'ce11':
        phylop_file = '%s/phylop/ce11.phyloP26way.bw' % genome_path
    else:
        print('Genome PhyloP not found', file=sys.stderr)
        exit(1)

    seqs_phylop = []
    seqs_phylop_dna1 = []
    seqs_phylop_mask = np.ones(num_seqs, dtype='bool')

    fasta_open = pysam.FastaFile(fasta_file)
    phylop_open = pyBigWig.open(phylop_file, 'r')

    for si in range(num_seqs):
        phylop_chr = score_chrs[si]
        if not phylop_chr.startswith('chr'):
            phylop_chr = 'chr%s' % phylop_chr

        # read values
        try:
            seq_phylop = phylop_open.values(phylop_chr, score_starts[si],
                                            score_ends[si],
                                            numpy=True).astype('float16')

            # read DNA
            seq_phylop_dna = fasta_open.fetch(score_chrs[si], score_starts[si],
                                              score_ends[si])
            seq_phylop_dna1 = dna_io.dna_1hot(seq_phylop_dna)

            # reverse complement to match the scored strand
            if score_strands[si] == '-':
                seq_phylop = seq_phylop[::-1]
                seq_phylop_dna1 = dna_io.hot1_rc(seq_phylop_dna1)

            # save
            seqs_phylop.append(seq_phylop)
            seqs_phylop_dna1.append(seq_phylop_dna1)

        except RuntimeError:
            print('Ignoring %s:%d-%d; phylop not found.' %
                  (phylop_chr, score_starts[si], score_ends[si]),
                  file=sys.stderr)
            seqs_phylop_mask[si] = False

    # filter for valid sequences
    nt_scores = nt_scores[seqs_phylop_mask]
    nt_scores_ref = nt_scores_ref[seqs_phylop_mask]
    nt_scores_refm = nt_scores_refm[seqs_phylop_mask]
    score_seqs = score_seqs[seqs_phylop_mask]
    num_seqs = len(score_seqs)

    # transform PhyloP: replace NaN and clip outliers
    seqs_phylop = np.array(seqs_phylop, dtype='float32')
    seqs_phylop = np.nan_to_num(seqs_phylop)
    seqs_phylop = np.clip(seqs_phylop, -1.5, 5)

    # verify DNA: ISM sequences must match the genome FASTA
    seqs_phylop_dna1 = np.array(seqs_phylop_dna1)
    for si in range(num_seqs):
        seq_diff = np.logical_xor(score_seqs[si], seqs_phylop_dna1[si])
        nts_diff = seq_diff.sum() // 2
        if nts_diff != 0:
            # drop into the debugger on mismatch (development aid)
            pdb.set_trace()

    ################################################################
    # regression

    # add positions as an extra feature column
    seqs_pos = np.arange(mut_len)
    seqs_pos = np.tile(seqs_pos, num_seqs)
    seqs_pos = np.reshape(seqs_pos, (num_seqs, -1, 1))

    # flatten everything
    # seqs_phylop_flat = seqs_phylop.flatten()
    # seqs_pos_flat = seqs_pos.flatten()
    # nt_scores_refm_flat = nt_scores_refm.reshape((-1,num_targets))
    # num_pos = nt_scores_refm_flat.shape[0]

    # form matrix
    # X_scores = nt_scores_refm_flat
    # if options.n_components is not None:
    #   options.n_components = min(options.n_components, num_targets)
    #   X_scores = PCA(options.n_components).fit_transform(nt_scores_refm_flat)
    # X_pos = seqs_pos.reshape(num_pos,1)
    # X = np.concatenate([X_scores,X_pos], axis=1)
    X = np.concatenate([nt_scores_refm, seqs_pos], axis=-1)
    X = X.astype('float32')

    # regressor
    r2s, pcors = randfor_cv(X, seqs_phylop,
                            iterations=options.iterations,
                            n_estimators=options.num_estimators,
                            random_state=options.random_seed,
                            n_jobs=options.parallel_threads)

    # save
    np.save('%s/r2.npy' % options.out_dir, r2s)
    np.save('%s/pcor.npy' % options.out_dir, pcors)

    # print stats (mean and standard error across CV iterations)
    iterations = len(r2s)
    stats_out = open('%s/stats.txt' % options.out_dir, 'w')
    print('R2 %.4f (%.4f)' % (r2s.mean(), r2s.std() / np.sqrt(iterations)),
          file=stats_out)
    print('pR %.4f (%.4f)' % (pcors.mean(), pcors.std() / np.sqrt(iterations)),
          file=stats_out)
    stats_out.close()
def seqs_gen():
    """Yield {'sequence': 1-hot} dicts, one per DNA sequence."""
    for seq_dna in seqs_dna:
        yield {'sequence': dna_io.dna_1hot(seq_dna)}
def run(self):
    """Worker loop: consume (seq_dna, (sum, center, scd) stats, si) tuples
    from self.queue and write per-position nucleotide scores for each
    summary statistic in self.sad_stats into the shared HDF5 file.

    Runs forever; assumes daemon-thread usage with queue.task_done() joins.
    """
    while True:
        try:
            # unload predictions
            seq_dna, seq_pred_stats, si = self.queue.get()
            seq_preds_sum, seq_preds_center, seq_preds_scd = seq_pred_stats
            print('Writing %d' % si, flush=True)

            # seq_preds_sum is (1 + 3*mut_len) x (num_targets)
            num_preds, num_targets = seq_preds_sum.shape
            mut_len = self.mut_end - self.mut_start

            # one hot code mutagenized DNA
            seq_dna_mut = seq_dna[self.mut_start:self.mut_end]
            seq_1hot_mut = dna_io.dna_1hot(seq_dna_mut)

            # write to HDF5
            self.scores_h5['seqs'][si, :, :] = seq_1hot_mut

            for sad_stat in self.sad_stats:
                # initialize scores
                seq_scores = np.zeros((mut_len, 4, num_targets), dtype='float32')

                # summary stat
                if sad_stat == 'sum':
                    seq_preds_stat = seq_preds_sum
                elif sad_stat == 'center':
                    seq_preds_stat = seq_preds_center
                elif sad_stat == 'scd':
                    seq_preds_stat = seq_preds_scd
                else:
                    # NOTE(review): 'options.sad_stat' reaches for a global
                    # 'options'; the local 'sad_stat' was presumably intended —
                    # this branch would NameError if 'options' is not in scope.
                    print('Unrecognized summary statistic "%s"' % options.sad_stat)
                    exit(1)

                # predictions index (starting at first mutagenesis)
                pi = 1

                # for each mutated position
                for mi in range(mut_len):
                    # for each nucleotide
                    for ni in range(4):
                        if seq_1hot_mut[mi, ni]:
                            # reference score
                            seq_scores[mi, ni, :] = seq_preds_stat[0, :]
                        else:
                            # mutation score
                            seq_scores[mi, ni, :] = seq_preds_stat[pi, :]
                            pi += 1

                # normalize positions: subtract per-position mean over nucleotides
                if sad_stat != 'sqdiff':
                    seq_scores -= seq_scores.mean(axis=1, keepdims=True)

                # write to HDF5
                self.scores_h5[sad_stat][si, :, :, :] = seq_scores.astype(
                    'float16')

        except:
            # communicate error
            # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit,
            # and 'si' is unbound here if queue.get() itself raised.
            print('ERROR: Sequence %d failed' % si, file=sys.stderr, flush=True)

        # communicate finished task
        self.queue.task_done()
def main():
    """Write one TFRecord file of (sequence, target) examples.

    Reads model sequence coordinates from a BED file, per-target coverage
    from numbered HDF5 files, optionally nulls unmappable positions, then
    1-hot codes each FASTA sequence and serializes it with its targets.
    """
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='start_i', default=0, type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e', dest='end_i', default=None, type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('-u', dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option('--umap_set', dest='umap_set', default=None, type='float',
                      help='Sequence distribution value to set unmappable positions to, eg 0.25.')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2])))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    # coverage files are numbered 0.h5, 1.h5, ... in seqs_cov_dir
    seqs_cov_files = []
    ti = 0
    seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)

    seq_pool_len = h5py.File(seqs_cov_files[0], 'r')['seqs_cov'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets), dtype='float16')

    # read each target
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], 'r')
        targets[:, :, ti] = seqs_cov_open['seqs_cov'][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # modify unmappable

    if options.umap_npy is not None and options.umap_set is not None:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value: the umap_set quantile of
            # this sequence's target distribution
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_set],
                                            axis=0)[0]

            # set unmappable positions to null (cap, don't raise)
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options (TF1-era tf.python_io API)
    tf_opts = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)

    with tf.python_io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            # example = tf.train.Example(features=tf.train.Features(feature={
            #     'sequence': _bytes_feature(seq_1hot.flatten().tostring()),
            #     'target': _float_feature(targets[si,:,:].flatten())}))

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'sequence':
                        _bytes_feature(seq_1hot.flatten().tostring()),
                    'target':
                        _bytes_feature(targets[si, :, :].flatten().tostring())
                }))

            writer.write(example.SerializeToString())

    fasta_open.close()
def main():
    """Build a TFRecord gene-expression dataset.

    Matches TSS annotations (GFF) to an expression table, filters genes
    lacking sufficient flanking sequence, splits genes into train/valid/test
    (by percentage or chromosome), and writes 1-hot sequences with
    expression targets to TFRecords, plus CSV gene lists and JSON stats.
    """
    usage = 'usage: %prog [options] <fasta> <tss_gff> <expr_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cluster_gene_distance', default=2000, type='int',
                      help='Cluster genes into the same split within this distance [Default: %default]')
    parser.add_option('-g', dest='gene_index', default='gene_name',
                      help='Key to match TSS GFF to expression table [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=65536, type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='genes_out')
    parser.add_option('-n', dest='n_allowed_pct', default=0.25, type='float',
                      help='Proportion of sequence allowed to be Ns on one side [Default: %default]')
    parser.add_option('-r', dest='seqs_per_tfr', default=256, type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option('-s', dest='sqrt', default=False, action='store_true',
                      help='Square root the expression values [Default: %default]')
    parser.add_option('-t', dest='test_pct_or_chr', default=0.1, type='str',
                      help='Proportion of the data for testing [Default: %default]')
    parser.add_option('-v', dest='valid_pct_or_chr', default=0.1, type='str',
                      help='Proportion of the data for validation [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('')
    else:
        fasta_file = args[0]
        tss_gff_file = args[1]
        expr_file = args[2]

    # refuse to overwrite an existing output directory
    if os.path.isdir(options.out_dir):
        print('Remove output directory %s.' % options.out_dir)
        exit(1)
    else:
        os.mkdir(options.out_dir)

    ################################################################
    # read genes and targets

    genes_raw_df = gff_df(tss_gff_file, options.gene_index)

    expr_raw_df = pd.read_csv(expr_file, index_col=0)
    if options.sqrt:
        expr_raw_df = np.sqrt(expr_raw_df)

    # filter for shared genes
    shared_genes = set(genes_raw_df.index) & set(expr_raw_df.index)
    shared_genes = sorted(shared_genes)
    print('Shared %d genes of %d described and %d quantified' %
          (len(shared_genes), genes_raw_df.shape[0], expr_raw_df.shape[0]))

    # align gene info and expression
    genes_df = genes_raw_df.loc[shared_genes]
    expr_df = expr_raw_df.loc[shared_genes]
    assert (genes_df.shape[0] == expr_df.shape[0])

    ################################################################
    # filter genes from chromosome ends

    gene_valid_mask = sufficient_sequence(fasta_file, genes_df,
                                          options.seq_length,
                                          options.n_allowed_pct)
    genes_df = genes_df.loc[gene_valid_mask]
    expr_df = expr_df.loc[gene_valid_mask]

    ################################################################
    # divide between train/valid/test

    # permute genes (fixed seed for reproducible splits)
    np.random.seed(44)
    permute_order = np.random.permutation(genes_df.shape[0])
    genes_df = genes_df.iloc[permute_order]
    expr_df = expr_df.iloc[permute_order]
    assert ((genes_df.index == expr_df.index).all())

    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert (0 <= valid_pct <= 1)
        assert (0 <= test_pct <= 1)

        # divide by pct
        tvt_indexes = divide_genes_pct(genes_df, test_pct, valid_pct,
                                       options.cluster_gene_distance)

    except (ValueError, AssertionError):
        # the options weren't parseable percentages: divide by chr
        valid_chrs = options.valid_pct_or_chr.split(',')
        test_chrs = options.test_pct_or_chr.split(',')
        tvt_indexes = divide_genes_chr(genes_df, test_chrs, valid_chrs)

    # write gene sets
    train_index, valid_index, test_index = tvt_indexes
    genes_df.iloc[train_index].to_csv('%s/genes_train.csv' % options.out_dir,
                                      sep='\t')
    genes_df.iloc[valid_index].to_csv('%s/genes_valid.csv' % options.out_dir,
                                      sep='\t')
    genes_df.iloc[test_index].to_csv('%s/genes_test.csv' % options.out_dir,
                                     sep='\t')

    # write targets (one target per expression column)
    targets_df = pd.DataFrame({
        'identifier': expr_df.columns,
        'description': expr_df.columns
    })
    targets_df.index.name = 'index'
    targets_df.to_csv('%s/targets.txt' % options.out_dir, sep='\t')

    ################################################################
    # write TFRecords

    tfr_dir = '%s/tfrecords' % options.out_dir
    os.mkdir(tfr_dir)

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression='ZLIB')

    tvt_tuples = [('train', train_index), ('valid', valid_index),
                  ('test', test_index)]
    for set_label, set_index in tvt_tuples:
        genes_set_df = genes_df.iloc[set_index]
        expr_set_df = expr_df.iloc[set_index]

        num_set = genes_set_df.shape[0]
        num_set_tfrs = int(np.ceil(num_set / options.seqs_per_tfr))

        # gene sequence index
        si = 0

        for tfr_i in range(num_set_tfrs):
            tfr_file = '%s/%s-%d.tfr' % (tfr_dir, set_label, tfr_i)
            print(tfr_file)
            with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
                # TFR index
                ti = 0
                while ti < options.seqs_per_tfr and si < num_set:
                    gene = genes_set_df.iloc[si]
                    seq_chrm = gene.chr
                    # center the window on the gene midpoint
                    mid_pos = (gene.start + gene.end) // 2
                    seq_start = mid_pos - options.seq_length // 2
                    seq_end = seq_start + options.seq_length

                    if seq_start < 0:
                        # fill left side first with random nucleotides
                        n_requested = -seq_start
                        seq_dna = ''.join([
                            random.choice('ACGT') for i in range(n_requested)
                        ])
                        seq_dna += fasta_open.fetch(seq_chrm, 0, seq_end)
                    else:
                        seq_dna = fasta_open.fetch(seq_chrm, seq_start, seq_end)

                    # fill out right side with random nucleotides
                    if len(seq_dna) > 0:
                        n_requested = options.seq_length - len(seq_dna)
                        seq_dna += ''.join([
                            random.choice('ACGT') for i in range(n_requested)
                        ])

                    # verify length
                    assert (len(seq_dna) == options.seq_length)

                    # orient: reverse complement minus-strand genes
                    if gene.strand == '-':
                        seq_dna = rc(seq_dna)

                    # one hot code
                    seq_1hot = dna_1hot(seq_dna)

                    # get targets
                    targets = expr_set_df.iloc[si].values
                    targets = targets.reshape((1, -1)).astype('float16')

                    # make example
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'sequence':
                                _bytes_feature(seq_1hot.flatten().tostring()),
                            'target':
                                _bytes_feature(targets.flatten().tostring())
                        }))

                    # write
                    writer.write(example.SerializeToString())

                    # advance indexes
                    ti += 1
                    si += 1

    fasta_open.close()

    ################################################################
    # stats

    stats_dict = {}
    stats_dict['num_targets'] = targets_df.shape[0]
    stats_dict['train_seqs'] = len(train_index)
    stats_dict['valid_seqs'] = len(valid_index)
    stats_dict['test_seqs'] = len(test_index)
    stats_dict['seq_length'] = options.seq_length
    stats_dict['target_length'] = 1

    with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
def main():
    """Write one TFRecord file of (genome, sequence, target) examples,
    supporting multi-genome training via -g genome_index and target-vector
    extension/offsets via --te/--ts.
    """
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='genome_index', default=None, type='int',
                      help='Genome index')
    parser.add_option('-s', dest='start_i', default=0, type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e', dest='end_i', default=None, type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('--te', dest='target_extend', default=None, type='int',
                      help='Extend targets vector [Default: %default]')
    # NOTE(review): help string below is missing its closing ']' — cosmetic,
    # but left byte-identical here since it is runtime text
    parser.add_option('--ts', dest='target_start', default=0, type='int',
                      help='Write targets into vector starting at index [Default: %default')
    parser.add_option('-u', dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option('--umap_set', dest='umap_set', default=None, type='float',
                      help='Sequence distribution value to set unmappable positions to, eg 0.25.')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    # files are numbered 0.h5,... or <genome>-0.h5,... when multi-genome
    seqs_cov_files = []
    ti = 0
    if options.genome_index is None:
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    else:
        seqs_cov_file = '%s/%d-%d.h5' % (seqs_cov_dir, options.genome_index, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        if options.genome_index is None:
            seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
        else:
            seqs_cov_file = '%s/%d-%d.h5' % (seqs_cov_dir,
                                             options.genome_index, ti)

    if len(seqs_cov_files) == 0:
        print('Sequence coverage files not found, e.g. %s' % seqs_cov_file,
              file=sys.stderr)
        exit(1)

    seq_pool_len_hic = h5py.File(seqs_cov_files[0], 'r')['targets'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets vector beyond this genome's targets if requested
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert (options.target_extend >= num_targets_tfr)
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len_hic, num_targets_tfr),
                       dtype='float16')

    # read each target, offset by target_start within the extended vector
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], 'r')
        tii = options.target_start + ti
        targets[:, :, tii] = seqs_cov_open['targets'][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression='ZLIB')

    with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            if options.genome_index is None:
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'genome':
                            _int_feature(0),
                        'sequence':
                            _bytes_feature(seq_1hot.flatten().tostring()),
                        'target':
                            _bytes_feature(
                                targets[si, :, :].flatten().tostring())
                    }))
            else:
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'genome':
                            _int_feature(options.genome_index),
                        'sequence':
                            _bytes_feature(seq_1hot.flatten().tostring()),
                        'target':
                            _bytes_feature(
                                targets[si, :, :].flatten().tostring())
                    }))

            writer.write(example.SerializeToString())

    fasta_open.close()
def parse_input(input_file, sample):
    """Parse an input file that might be FASTA or HDF5.

    Args:
      input_file: path to a FASTA file, or an HDF5 file containing
        'test_in' (1-hot sequences) and 'test_out' (targets) datasets.
      sample: if truthy, randomly subsample this many sequences.

    Returns:
      (seqs, seqs_1hot, targets) — targets is None for FASTA input.
    """
    try:
        # input_file is FASTA

        # read sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        # one hot code sequences
        seqs_1hot = []
        for seq in seqs:
            seqs_1hot.append(dna_io.dna_1hot(seq))
        seqs_1hot = np.array(seqs_1hot)

        # sample
        if sample:
            # fixed: xrange is Python 2 only (NameError under Python 3);
            # the HDF5 branch below already used range
            sample_i = np.array(
                random.sample(range(seqs_1hot.shape[0]), sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = seq_headers[sample_i]
            seqs = seqs[sample_i]

        # initialize targets variable
        targets = None

    except UnicodeDecodeError:
        # binary content: input_file is HDF5
        try:
            # load (sampled) test data from HDF5
            with h5py.File(input_file, "r") as hdf5_in:
                seqs_1hot = np.array(hdf5_in["test_in"])
                targets = np.array(hdf5_in["test_out"])

            # sample
            if sample:
                sample_i = np.array(
                    random.sample(range(seqs_1hot.shape[0]), sample))
                seqs_1hot = seqs_1hot[sample_i]
                targets = targets[sample_i]

            # convert to ACGT sequences
            seqs = dna_io.hot1_dna(seqs_1hot)

        except IOError:
            # NOTE(review): 'parser' is not defined in this function; it is
            # presumably a module-level OptionParser — confirm it exists at
            # call time, otherwise this error path raises NameError.
            parser.error("Could not parse input file as FASTA or HDF5.")

    return seqs, seqs_1hot, targets
def main():
    """Convert FASTA sequences plus per-target coverage HDF5s into one TFRecord file.

    Command line: <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>

    Reads model sequence intervals from a BED file, gathers one coverage HDF5
    per target from seqs_cov_dir, optionally overwrites unmappable positions,
    and writes ZLIB-compressed TFRecord examples with 'genome', 'sequence'
    (1-hot), and 'target' features.
    """
    usage = (
        "usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>"
    )
    parser = OptionParser(usage)
    parser.add_option("-g", dest="genome_index", default=None, type="int",
                      help="Genome index")
    parser.add_option(
        "-s",
        dest="start_i",
        default=0,
        type="int",
        help="Sequence start index [Default: %default]",
    )
    parser.add_option(
        "-e",
        dest="end_i",
        default=None,
        type="int",
        help="Sequence end index [Default: %default]",
    )
    parser.add_option(
        "--te",
        dest="target_extend",
        default=None,
        type="int",
        help="Extend targets vector [Default: %default]",
    )
    parser.add_option(
        "--ts",
        dest="target_start",
        default=0,
        type="int",
        # fixed: help string was missing its closing ']'
        help="Write targets into vector starting at index [Default: %default]",
    )
    parser.add_option("-u", dest="umap_npy", help="Unmappable array numpy file")
    parser.add_option(
        "--umap_set",
        dest="umap_set",
        default=None,
        type="float",
        help="Sequence distribution value to set unmappable positions to, eg 0.25.",
    )
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error("Must provide input arguments.")
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)
    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    def cov_path(ti):
        # coverage files are named <ti>.h5, or <genome>-<ti>.h5 when a
        # genome index was given
        if options.genome_index is None:
            return "%s/%d.h5" % (seqs_cov_dir, ti)
        return "%s/%d-%d.h5" % (seqs_cov_dir, options.genome_index, ti)

    # probe consecutive target indexes until a file is missing
    seqs_cov_files = []
    ti = 0
    seqs_cov_file = cov_path(ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = cov_path(ti)

    if len(seqs_cov_files) == 0:
        print(
            "Sequence coverage files not found, e.g. %s" % seqs_cov_file,
            file=sys.stderr,
        )
        exit(1)

    # pooled target length, taken from the first coverage file
    seq_pool_len = h5py.File(seqs_cov_files[0], "r")["seqs_cov"].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets vector beyond the files present, if requested
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert options.target_extend >= num_targets_tfr
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets_tfr), dtype="float16")

    # read each target into its (possibly offset) column
    for ti in range(num_targets):
        with h5py.File(seqs_cov_files[ti], "r") as seqs_cov_open:
            tii = options.target_start + ti
            targets[:, :, tii] = seqs_cov_open["seqs_cov"][
                options.start_i:options.end_i, :]

    ################################################################
    # modify unmappable
    if options.umap_npy is not None and options.umap_set is not None:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value: the umap_set quantile of this
            # sequence's own target distribution
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_set],
                                            axis=0)[0]

            # clip unmappable positions down to the null value
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    # NOTE(review): tf.python_io is the TF1 API; other main() variants in
    # this file use tf.io — confirm which TF version this copy targets.
    tf_opts = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)

    with tf.python_io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            # genome 0 is the default when no index was given
            genome_i = 0 if options.genome_index is None else options.genome_index

            # fixed: ndarray.tostring() is deprecated/removed; tobytes()
            # returns the identical bytes
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "genome": _int_feature(genome_i),
                    "sequence": _bytes_feature(seq_1hot.flatten().tobytes()),
                    "target": _bytes_feature(targets[si, :, :].flatten().tobytes()),
                }))

            writer.write(example.SerializeToString())

    fasta_open.close()
def main():
    """Convert FASTA sequences plus per-target coverage HDF5s into one TFRecord file.

    Command line: <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>

    Reads model sequence intervals from a BED file, gathers one coverage HDF5
    per target from seqs_cov_dir, optionally clips unmappable positions, and
    writes ZLIB-compressed TFRecord examples with 'sequence' (1-hot),
    'target', and optionally 'umap' features.
    """
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='start_i', default=0, type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e', dest='end_i', default=None, type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('--te', dest='target_extend', default=None, type='int',
                      help='Extend targets vector [Default: %default]')
    parser.add_option(
        '--ts', dest='target_start', default=0, type='int',
        # fixed: help string was missing its closing ']'
        help='Write targets into vector starting at index [Default: %default]')
    parser.add_option('-u', dest='umap_npy', help='Unmappable array numpy file')
    parser.add_option(
        '--umap_clip', dest='umap_clip', default=1, type='float',
        help=
        'Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]'
    )
    parser.add_option(
        '--umap_tfr', dest='umap_tfr', default=False, action='store_true',
        help='Save umap array into TFRecords [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)
    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files
    # coverage files are named <ti>.h5; probe consecutive target indexes
    # until a file is missing
    seqs_cov_files = []
    ti = 0
    seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)

    if len(seqs_cov_files) == 0:
        print('Sequence coverage files not found, e.g. %s' % seqs_cov_file,
              file=sys.stderr)
        exit(1)

    # pooled target length, taken from the first coverage file
    seq_pool_len = h5py.File(seqs_cov_files[0], 'r')['targets'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets vector beyond the files present, if requested
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert options.target_extend >= num_targets_tfr
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets_tfr),
                       dtype='float16')

    # read each target into its (possibly offset) column
    for ti in range(num_targets):
        with h5py.File(seqs_cov_files[ti], 'r') as seqs_cov_open:
            tii = options.target_start + ti
            targets[:, :, tii] = seqs_cov_open['targets'][
                options.start_i:options.end_i, :]

    ################################################################
    # modify unmappable
    if options.umap_npy is not None and options.umap_clip < 1:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value: the umap_clip quantile of
            # this sequence's own target distribution
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_clip],
                                            axis=0)[0]

            # clip unmappable positions down to the null value
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    elif options.umap_npy is not None and options.umap_tfr:
        # mask is only needed for writing the 'umap' feature below
        unmap_mask = np.load(options.umap_npy)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')

    with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)
            # seq_1hot = dna_1hot_index(seq_dna) # more efficient, but fighting inertia

            # hash to bytes
            features_dict = {
                'sequence': feature_bytes(seq_1hot),
                'target': feature_bytes(targets[si, :, :])
            }

            # add unmappability
            if options.umap_tfr:
                features_dict['umap'] = feature_bytes(unmap_mask[msi, :])

            # write example
            example = tf.train.Example(
                features=tf.train.Features(feature=features_dict))
            writer.write(example.SerializeToString())

    fasta_open.close()