def test_seqs(self):
    """Test that the one hot coded sequences match."""
    for gi in range(2):
        # read sequence coordinates
        seqs_bed_file = '%s/sequences%d.bed' % (self.out_dir, gi)
        seq_coords = read_seq_coords(seqs_bed_file)

        # read one hot coding from TF Records
        train_tfrs_str = '%s/tfrecords/train-%d-0.tfr' % (self.out_dir, gi)
        seqs_1hot, _, genomes = self.read_tfrecords(train_tfrs_str)

        # check genome
        self.assertEqual(len(np.unique(genomes)), 1)
        self.assertEqual(genomes[0], gi)

        # open FASTA
        fasta_open = pysam.Fastafile(self.fasta_files[gi])

        # check random sequences
        seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
        for si in seq_indexes:
            sc = seq_coords[si]

            # fetch DNA from the FASTA
            seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper()

            # decode the one hot coding
            seq_1hot_dna = hot1_dna(seqs_1hot[si])

            self.assertEqual(seq_fasta, seq_1hot_dna)
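# A quick round-trip sketch of the coding these tests exercise (hypothetical
# sequence; assumes dna_1hot is importable from basenji.dna_io like hot1_dna):
def _example_hot1_roundtrip():
    from basenji.dna_io import dna_1hot
    seq = 'ACGTACGTAC'
    # hot1_dna inverts dna_1hot for unambiguous sequences
    assert hot1_dna(dna_1hot(seq)) == seq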
def global_align(seq1_1hot, seq2_1hot):
    """Align two 1-hot encoded sequences."""
    align_opts = {
        'gap_open_penalty': 10,
        'gap_extend_penalty': 1,
        'match_score': 5,
        'mismatch_score': -4
    }

    seq1_dna = DNA(dna_io.hot1_dna(seq1_1hot))
    seq2_dna = DNA(dna_io.hot1_dna(seq2_1hot))

    # unpack the options dict as keyword arguments
    seq_align = global_pairwise_align_nucleotide(seq1_dna, seq2_dna,
                                                 **align_opts)[0]
    seq1_align = str(seq_align[0])
    seq2_align = str(seq_align[1])

    return seq1_align, seq2_align
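# A minimal sketch exercising global_align on two hypothetical short
# sequences, the second carrying a single-base deletion. Assumes
# dna_io.dna_1hot is available alongside the imports used above.
def _example_global_align():
    seq1_1hot = dna_io.dna_1hot('ACGTACGTAC')
    seq2_1hot = dna_io.dna_1hot('ACGTCGTAC')
    seq1_align, seq2_align = global_align(seq1_1hot, seq2_1hot)
    print(seq1_align)
    print(seq2_align)  # expect a gap character at the deleted base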
def test_seqs(self):
    """Test that the one hot coded sequences match."""
    # read sequence coordinates
    seqs_bed_file = '%s/sequences.bed' % self.out_dir
    seq_coords = read_seq_coords(seqs_bed_file)

    # read one hot coding from TF Records
    train_tfrs_str = '%s/tfrecords/train-0.tfr' % self.out_dir
    seqs_1hot, _ = self.read_tfrecords(train_tfrs_str)

    # open FASTA
    fasta_open = pysam.Fastafile(self.fasta_file)

    # check random sequences
    seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
    for si in seq_indexes:
        sc = seq_coords[si]

        # upper-case to match hot1_dna output on soft-masked genomes
        seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper()

        seq_1hot_dna = hot1_dna(seqs_1hot[si])
        self.assertEqual(seq_fasta, seq_1hot_dna)
def main():
    usage = "usage: %prog [options] <tfr_dir> <out_bw>"
    parser = OptionParser(usage)
    parser.add_option(
        "-f",
        dest="fasta_file",
        default="%s/assembly/ucsc/hg38.fa" % os.environ["HG38"],
    )
    parser.add_option(
        "-g",
        dest="genome_file",
        default="%s/assembly/ucsc/hg38.human.genome" % os.environ["HG38"],
    )
    parser.add_option(
        "-l",
        dest="target_length",
        default=1024,
        type="int",
        help="TFRecord target length [Default: %default]",
    )
    parser.add_option("-s", dest="data_split", default="train")
    parser.add_option(
        "-t",
        dest="target_i",
        default=0,
        type="int",
        help="Target index [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide TF Records directory and output BigWig")
    else:
        tfr_dir = args[0]
        out_bw_file = args[1]

    # initialize output BigWig
    out_bw_open = pyBigWig.open(out_bw_file, "w")

    # construct header
    header = []
    for line in open(options.genome_file):
        a = line.split()
        header.append((a[0], int(a[1])))

    # write header
    out_bw_open.addHeader(header)

    # initialize chr dictionary
    chr_values = {}
    for chrm, clen in header:
        chr_values[chrm] = np.zeros(clen, dtype="float16")

    # open sequences BED
    seq_bed_open = open("%s/../sequences0.bed" % tfr_dir)

    # open FASTA
    fasta_open = pysam.Fastafile(options.fasta_file)

    # initialize one shot iterator
    # next_op = make_next_op('%s/%s-0-0.tfr' % (tfr_dir, options.data_split))
    next_op = make_next_op("%s/%s-0-*.tfr" % (tfr_dir, options.data_split))

    # read sequence values
    with tf.Session() as sess:
        next_datum = sess.run(next_op)
        while next_datum:
            # read sequence, skipping BED lines from other data splits
            seq_bed_line = seq_bed_open.readline()
            a = seq_bed_line.rstrip().split("\t")
            while a[-1] != options.data_split:
                seq_bed_line = seq_bed_open.readline()
                a = seq_bed_line.rstrip().split("\t")

            chrm = a[0]
            start = int(a[1])
            end = int(a[2])
            target_pool = (end - start) // options.target_length

            # check sequence
            seq_1hot = next_datum["sequence"].reshape((-1, 4))
            seq_1hot_dna = hot1_dna(seq_1hot)
            seq_fasta = fasta_open.fetch(chrm, start, end).upper()
            if seq_1hot_dna != seq_fasta:
                seq_diff = [
                    seq_1hot_dna[i] != seq_fasta[i] for i in range(len(seq_fasta))
                ]
                seq_diff = np.array(seq_diff, dtype="bool")
                print(
                    "WARNING: %s:%d-%d differs by %d nts (%.4f)"
                    % (chrm, start, end, seq_diff.sum(), seq_diff.mean())
                )

            # read targets
            targets = next_datum["target"].reshape(options.target_length, -1)
            targets_ti = targets[:, options.target_i]

            # set values, expanding each target bin to nucleotide resolution
            chr_values[chrm][start:end] = np.repeat(targets_ti, target_pool)

            try:
                next_datum = sess.run(next_op)
            except tf.errors.OutOfRangeError:
                next_datum = False

    fasta_open.close()

    # write chr values
    for chrm, _ in header:
        print(chrm)
        out_bw_open.addEntries(chrm, 0, values=chr_values[chrm], span=1, step=1)

    # close files
    out_bw_open.close()
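# Hypothetical invocation (script name and paths illustrative), writing
# target 0 of the training TFRecords out as a BigWig for browser inspection:
#
#   python tfr_bigwig.py -l 1024 -s train -t 0 data_out/tfrecords train_t0.bw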
def alleles_1hot(gene_seq, seq_1hot, seq_snps):
    ''' One hot code for gene sequence alleles. '''

    # initialize one hot coding
    aseqs_1hot = []

    # add reference allele sequence
    aseqs_1hot.append(np.copy(seq_1hot))

    # set all reference alleles
    for snp in seq_snps:
        # determine SNP position wrt sequence
        snp_seq_pos = snp.pos - 1 - gene_seq.start

        # verify that the reference allele matches the reference
        seq_ref = dna_io.hot1_dna(
            aseqs_1hot[0][snp_seq_pos:snp_seq_pos + len(snp.ref_allele), :])
        if seq_ref != snp.ref_allele:
            print(
                'WARNING: %s - ref allele %s does not match reference genome %s; changing reference genome to match.'
                % (snp.rsid, snp.ref_allele, seq_ref),
                file=sys.stderr)

            if len(seq_ref) == len(snp.ref_allele):
                # SNP
                dna_io.hot1_set(aseqs_1hot[0], snp_seq_pos, snp.ref_allele)

            # not confident in these operations
            # elif len(seq_ref) > len(snp.ref_allele):
            #     # deletion
            #     delete_len = len(seq_ref) - len(snp.ref_allele)
            #     dna_io.hot1_delete(aseqs_1hot[0], snp_seq_pos + 1, delete_len)
            # else:
            #     # insertion
            #     dna_io.hot1_insert(aseqs_1hot[0], snp_seq_pos + 1, snp.ref_allele[1:])

            else:
                raise Exception(
                    'ERROR: reference mismatch indels cannot yet be handled.')

    # for each SNP
    for snp in seq_snps:
        # determine SNP position wrt sequence
        snp_seq_pos = snp.pos - 1 - gene_seq.start

        # add minor allele sequence
        aseqs_1hot.append(np.copy(aseqs_1hot[0]))

        if len(snp.ref_allele) == len(snp.alt_alleles[0]):
            # SNP
            dna_io.hot1_set(aseqs_1hot[-1], snp_seq_pos, snp.alt_alleles[0])

        elif len(snp.ref_allele) > len(snp.alt_alleles[0]):
            # deletion
            delete_len = len(snp.ref_allele) - len(snp.alt_alleles[0])
            assert (snp.ref_allele[0] == snp.alt_alleles[0][0])
            dna_io.hot1_delete(aseqs_1hot[-1], snp_seq_pos + 1, delete_len)

        else:
            # insertion
            assert (snp.ref_allele[0] == snp.alt_alleles[0][0])
            dna_io.hot1_insert(aseqs_1hot[-1], snp_seq_pos + 1,
                               snp.alt_alleles[0][1:])

    # finalize
    aseqs_1hot = np.array(aseqs_1hot)

    return aseqs_1hot
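# A minimal sketch of alleles_1hot using hypothetical stand-ins for the
# repo's gene sequence and SNP classes (only the attributes used above:
# gene_seq.start, snp.pos, snp.rsid, snp.ref_allele, snp.alt_alleles).
def _example_alleles_1hot():
    import collections
    GeneSeq = collections.namedtuple('GeneSeq', ['start'])
    SNP = collections.namedtuple('SNP', ['pos', 'rsid', 'ref_allele', 'alt_alleles'])

    gene_seq = GeneSeq(start=1000)
    seq_1hot = dna_io.dna_1hot('ACGTACGT')  # 8 bp reference sequence
    snp = SNP(pos=1006, rsid='rs_test', ref_allele='C', alt_alleles=['T'])

    aseqs_1hot = alleles_1hot(gene_seq, seq_1hot, [snp])
    print(dna_io.hot1_dna(aseqs_1hot[0]))  # reference: ACGTACGT
    print(dna_io.hot1_dna(aseqs_1hot[1]))  # alternate: ACGTATGT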
def parse_input(input_file, sample):
    """Parse an input file that might be FASTA or HDF5."""
    try:
        # input_file is FASTA

        # read sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        # one hot code sequences
        seqs_1hot = []
        for seq in seqs:
            seqs_1hot.append(dna_io.dna_1hot(seq))
        seqs_1hot = np.array(seqs_1hot)

        # sample
        if sample:
            sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = seq_headers[sample_i]
            seqs = seqs[sample_i]

        # initialize targets variable
        targets = None

    except UnicodeDecodeError:
        # input_file is HDF5
        try:
            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")
            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            hdf5_in.close()

            # sample
            if sample:
                sample_i = np.array(random.sample(range(seqs_1hot.shape[0]), sample))
                seqs_1hot = seqs_1hot[sample_i]
                targets = targets[sample_i]

            # convert to ACGT sequences
            seqs = dna_io.hot1_dna(seqs_1hot)

        except IOError:
            raise IOError("Could not parse input file as FASTA or HDF5.")

    return seqs, seqs_1hot, targets
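# A minimal sketch of parse_input on a throwaway FASTA (hypothetical file;
# the HDF5 branch expects 'test_in'/'test_out' datasets, as read above).
def _example_parse_input():
    import tempfile
    fasta_tmp = tempfile.NamedTemporaryFile("w", suffix=".fa", delete=False)
    fasta_tmp.write(">seq1\nACGTACGT\n>seq2\nTTTTACGT\n")
    fasta_tmp.close()

    seqs, seqs_1hot, targets = parse_input(fasta_tmp.name, sample=None)
    print(seqs.shape, seqs_1hot.shape, targets)  # (2,) (2, 8, 4) None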
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='act_t',
        default=0.5,
        type='float',
        help='Activation threshold (as proportion of max) to consider for PWM [Default: %default]')
    parser.add_option(
        '-d',
        dest='plot_density',
        default=False,
        action='store_true',
        help='Plot filter activation density [Default: %default]')
    parser.add_option(
        '--heat',
        dest='plot_heats',
        default=False,
        action='store_true',
        help='Plot heat maps describing filter activations in the test sequences [Default: %default]')
    parser.add_option(
        '-l',
        dest='seq_length_crop',
        default=None,
        type='int',
        help='Crop sequences to shorter length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='basenji_motifs')
    parser.add_option(
        '-m',
        dest='meme_db',
        default='%s/cisbp/Homo_sapiens.meme' % os.environ['HG38'],
        help='MEME database used to annotate motifs')
    parser.add_option(
        '-p',
        dest='parallel_threads',
        default=1,
        type='int',
        help='Generate weblogos in parallel threads [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='trim_filters',
        default=False,
        action='store_true',
        help='Trim uninformative positions off the filter ends [Default: %default]')
    parser.add_option(
        '--tfr',
        dest='tfr_pattern',
        default='test-*.tfr',
        help='TFR pattern string appended to data_dir [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide Basenji params and model files and data directory')
    else:
        params_file = args[0]
        model_file = args[1]
        data_dir = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #######################################################
    # inputs

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']
    if options.seq_length_crop is not None:
        params_model['seq_length'] = options.seq_length_crop

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # construct data ops
    tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
    eval_data = dataset.SeqDataset(tfr_pattern_path,
                                   seq_length=data_stats['seq_length'],
                                   seq_length_crop=options.seq_length_crop,
                                   target_length=data_stats['target_length'],
                                   batch_size=params_train['batch_size'],
                                   mode=tf.estimator.ModeKeys.EVAL)

    # obtain sequences
    eval_seqs_1hot = eval_data.numpy(return_inputs=True, return_outputs=False)
    eval_seqs_dna = dna_io.hot1_dna(eval_seqs_1hot)
    del eval_seqs_1hot

    #################################################################
    # model

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file)

    # first layer embedding
    seqnn_model.build_embed(0)
    _, preds_length, preds_depth = seqnn_model.embed.output.shape

    # get weights
    filter_weights = seqnn_model.get_conv_weights()
    print(filter_weights.shape)
    num_filters, _, filter_size = filter_weights.shape

    # compute filter activations
    filter_outs = seqnn_model.predict(eval_data)
    print(filter_outs.shape)

    #################################################################
    # individual filter plots

    # save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir, eval_seqs_dna)

    # plot weblogo of high scoring outputs (in parallel)
    if options.parallel_threads > 1:
        pfl_args = []
        for f in range(num_filters):
            pfl_args.append(
                (filter_outs[:, :, f], filter_size, eval_seqs_dna,
                 '%s/filter%d_logo' % (options.out_dir, f), options.act_t))
        with multiprocessing.get_context('spawn').Pool(options.parallel_threads) as pool:
            pool.starmap(plot_filter_logo, pfl_args)

    for f in range(num_filters):
        print('Filter %d' % f)

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f, :, :],
                         '%s/filter%d_heat.pdf' % (options.out_dir, f))

        if options.parallel_threads == 1:
            plot_filter_logo(filter_outs[:, :, f], filter_size, eval_seqs_dna,
                             '%s/filter%d_logo' % (options.out_dir, f),
                             options.act_t)

        # write possum motif file
        # filter_possum(filter_weights[f, :, :], 'filter%d' % f,
        #               '%s/filter%d_possum.txt' % (options.out_dir, f),
        #               options.trim_filters)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' % (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()

    #################################################################
    # annotate filters
    #################################################################

    # run tomtom
    subprocess.call(
        'tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s' %
        (options.out_dir, options.out_dir, options.meme_db),
        shell=True)

    # read in annotations
    filter_names = name_filters(num_filters,
                                '%s/tomtom/tomtom.tsv' % options.out_dir,
                                options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later panda reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print('%3s %19s %10s %5s %6s %6s' % header_cols, file=table_out)

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        f_scores = np.ravel(filter_outs[:, :, f])
        fmean, fstd = f_scores.mean(), f_scores.std()

        if options.plot_density:
            # plot density of filter output scores
            plot_score_density(f_scores,
                               '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print('%-3d %19s %10s %5.2f %6.4f %6.4f' % row_cols, file=table_out)

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################
    # these methods make less sense for longer sequences;
    # I should fragment the sequences first.
    if options.plot_heats:
        # plot filter-sequence heatmap
        plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf' % options.out_dir)

        # plot filter-segment heatmap
        plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf' % options.out_dir)
        plot_filter_seg_heat(filter_outs,
                             '%s/filter_segs_raw.pdf' % options.out_dir,
                             whiten=False)

        # plot filter-target correlation heatmap
        # NOTE: seq_targets and target_names are not defined in this script;
        # these plots require test targets to be loaded beforehand.
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_mean.pdf' % options.out_dir,
                         'mean')
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_max.pdf' % options.out_dir,
                         'max')
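# Hypothetical invocation (script name and paths illustrative), sampling
# 1000 test sequences and plotting activation densities:
#
#   python basenji_motifs.py -d -s 1000 -o motifs_out params.json model_best.h5 data_out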