def run_me(args):
    input_file = args.input_fa
    model_file_path = args.model_file_path
    n_jobs = args.n_jobs
    n_bg = args.n_bg
    nsamples = args.n_samples
    output_file_prefix = args.output_file_prefix
    tempfile = args.tempfile_prefix
    # Sequences sit on every second line of the FASTA file (headers on the others)
    sequences = [x.rstrip() for (i, x) in enumerate(open(input_file)) if i % 2 == 1]
    # Explain each sequence in parallel against dinucleotide-shuffled backgrounds
    shap_imp_scores = Parallel(n_jobs=n_jobs)(
        delayed(get_shap_explanation)(
            gkmsvm_model_path=model_file_path,
            seq=the_seq,
            background_seqs=[dinuc_shuffle.dinuc_shuffle(the_seq)
                             for j in range(n_bg)],
            tempfile=tempfile + str(i),
            nsamples=nsamples)
        for (i, the_seq) in enumerate(sequences))
    np.save(output_file_prefix, np.array(shap_imp_scores))
def run_me(args):
    input_file = args.input_fa
    model_file_path = args.model_file_path
    n_jobs = args.n_jobs
    n_bg = args.n_bg
    nsamples = args.n_samples
    output_file = args.output_file
    tempfile = args.tempfile_prefix
    # FASTA headers (even lines) carry the sequence ids; odd lines carry the sequences
    sequence_ids = [x.rstrip()[1:] for (i, x) in enumerate(open(input_file)) if i % 2 == 0]
    sequences = [x.rstrip() for (i, x) in enumerate(open(input_file)) if i % 2 == 1]
    shap_imp_scores = Parallel(n_jobs=n_jobs)(
        delayed(get_shap_explanation)(
            gkmsvm_model_path=model_file_path,
            seq=the_seq,
            background_seqs=[dinuc_shuffle.dinuc_shuffle(the_seq)
                             for j in range(n_bg)],
            tempfile=tempfile + str(i),
            nsamples=nsamples)
        for (i, the_seq) in enumerate(sequences))
    out_fh = open(output_file, 'w')
    for sequence_id, score in zip(sequence_ids, shap_imp_scores):
        assert len(score) % 2 == 1
        # Write the score at the central position, which is the SNP
        out_fh.write(sequence_id + "\t" + str(score[len(score) // 2]) + "\n")
    out_fh.close()
def create_background(inputs, bg_size=10, seed=1234):
    input_seq = inputs[0]
    if len(inputs) == 2:
        input_seq_bg = [np.empty((bg_size,) + input_seq.shape),
                        np.asarray(bg_size * [inputs[1]])]
    elif len(inputs) == 1:
        input_seq_bg = [np.empty((bg_size,) + input_seq.shape)]
    rng = np.random.RandomState(seed)
    for i in range(bg_size):
        input_seq_shuf = dinuc_shuffle(np.squeeze(input_seq), rng=rng)
        input_seq_bg[0][i] = np.expand_dims(input_seq_shuf, axis=0)
    return input_seq_bg
def test_dinuc_shuffle(self):
    for i in range(1000):
        random_sequence = "".join(
            [['A', 'C', 'G', 'T'][int(random.random() * 4)] for i in range(200)])
        shuffled_seq = dinuc_shuffle(random_sequence)
        print("sequences")
        print(random_sequence)
        print(shuffled_seq)
        orig_count = dinuc_count(random_sequence)
        shuffled_count = dinuc_count(shuffled_seq)
        print("counts")
        print(orig_count)
        print(shuffled_count)
        assert len(orig_count.keys()) == len(shuffled_count.keys())
        for key in orig_count:
            assert orig_count[key] == shuffled_count[key]
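The dinuc_count helper used by the test above is not defined in this snippet. A minimal sketch of what it is assumed to do, consistent with the test's assertions (count every overlapping dinucleotide and return a dict); the real implementation may differ:

def dinuc_count(sequence):
    # Hypothetical helper: tally every overlapping dinucleotide in the sequence
    counts = {}
    for i in range(len(sequence) - 1):
        dinuc = sequence[i:i + 2]
        counts[dinuc] = counts.get(dinuc, 0) + 1
    return counts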
from deeplift.dinuc_shuffle import dinuc_shuffle
from dragonn.utils import get_sequence_strings
import random
import numpy as np
import wget

url = "http://mitra.stanford.edu/kundaje/projects/dragonn/deep_lift_input_classification_spi1.npy"
wget.download(url)
deep_lift_input_classification_spi1 = np.load(
    "deep_lift_input_classification_spi1.npy")
print(deep_lift_input_classification_spi1.shape)
deep_lift_input_classification_spi1_strings = get_sequence_strings(
    deep_lift_input_classification_spi1)
for i in range(len(deep_lift_input_classification_spi1)):
    random.seed(1234)
    shuffled_strings = dinuc_shuffle(
        deep_lift_input_classification_spi1_strings[i])
    random.seed(1234)
    shuffled_array = dinuc_shuffle(
        deep_lift_input_classification_spi1[i].squeeze())
    # decode the array
    shuffled_array = ''.join(
        get_sequence_strings(
            np.expand_dims(np.expand_dims(shuffled_array, axis=1), axis=1)))
    # make sure shuffling the string and the numpy array gave the same shuffle output
    if shuffled_strings != shuffled_array:
        print("FAILED!")
print("TEST PASSED!")
def shuffle_several_times(s):
    numshuffles = 20
    return [np.array([dinuc_shuffle(s[0]) for i in range(numshuffles)]),
            np.array([s[1] for i in range(numshuffles)])]
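A hedged sketch of how shuffle_several_times could be wired in as a dynamic background, mirroring the callable-background pattern used with shap.DeepExplainer in the ATAC main() further below; `keras_model` (a two-input Keras model taking a one-hot sequence plus a control track) and the variable names are assumptions, not part of the original code:

# Hedged sketch, not the original code: the shap build is assumed to accept a
# callable for `data`, as it does in the ATAC example below.
explainer = shap.DeepExplainer(
    (keras_model.input, keras_model.outputs[0]),
    data=shuffle_several_times)
# shap_values = explainer.shap_values([onehot_seqs, control_tracks])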
def read_data_sets(train_pct=80, val_pct=10, test_pct=10, mode='random',
                   conv1d=False, add_shuffle=False, seq_only=False,
                   seq_file='../data/yeast_promoters.txt',
                   expr_file='../data/complete_dataset.txt',
                   reg_names_file='../data/reg_names_R.txt'):
    '''Read data from text files into numpy arrays

    Args:
        seq_file: promoter sequence file.
        expr_file: expression values file.
        reg_names_file: regulator gene names file.

    Returns:
        seq: promoter sequences (one-hot encoding).
        reg_expr: regulator gene expression (1D array).
        label: gene expression (scalar).
    '''
    # Read data:
    reg_names = pd.read_table(reg_names_file, names=["UID"])
    expr_data = pd.read_table(expr_file).fillna(0).drop("GWEIGHT", axis=1)
    expr_data['NAME'] = [x[10:21].strip() for x in expr_data['NAME']]
    expr_data[expr_data.columns[2:]] = expr_data[expr_data.columns[2:]].astype('float32')
    promoters = pd.read_table(seq_file, names=["UID", "sequence"])
    promoters['real'] = 1  # denotes real promoters
    promoters.loc[:, "one_hot_sequence"] = [one_hot(seq) for seq in promoters.loc[:, "sequence"]]

    # Create dinucleotide shuffled sequences:
    if add_shuffle:
        random.seed(a=42)
        shuffled_promoters = []
        for i in range(promoters.shape[0]):
            shuffled_promoters.append(dinuc_shuffle.dinuc_shuffle(promoters.sequence[i]))
        promoters = pd.concat([promoters,
                               pd.DataFrame({'UID': promoters.UID.tolist(),
                                             'sequence': shuffled_promoters,
                                             'real': 0})])

    if seq_only:
        data_complete = pd.merge(promoters, expr_data, on='UID', how='inner')
        # train_pct=90
        # val_pct=5
        # test_pct=5
        # mode = 'whole_gene'
        experiment_name = expr_data.drop(['UID', 'NAME'], axis=1).columns.tolist()
        train, val, test = partition(data_complete, (train_pct, val_pct, test_pct), mode=mode)
        train_data = reformat(train, conv1d, seq_only, experiment_name) if train_pct > 0 else []
        val_data = reformat(val, conv1d, seq_only, experiment_name) if val_pct > 0 else []
        test_data = reformat(test, conv1d, seq_only, experiment_name) if test_pct > 0 else []
    else:
        # Some transformation:
        target_expr_data = pd.melt(expr_data, id_vars=["UID", "NAME"],
                                   var_name="experiment", value_name="expression")
        reg_data = pd.merge(reg_names, expr_data, on="UID", how="inner").drop("UID", axis=1)
        reg = pd.DataFrame()
        for col in range(len(reg_data.columns)):
            data = np.array([exp_level for exp_level in reg_data.iloc[:, col]])
            reg = reg.append(pd.DataFrame({"experiment": reg_data.columns[col],
                                           "reg_exp": [data]}))
        data_complete = pd.merge(promoters, target_expr_data, on="UID",
                                 how="inner").merge(reg, on="experiment", how="inner")
        # Force dinucleotide shuffled sequences to have 0 expression.
        data_complete.expression = data_complete.expression * data_complete.real
        # train_pct=80
        # val_pct=10
        # test_pct=10
        # mode = 'random'
        train, val, test = partition(data_complete, (train_pct, val_pct, test_pct), mode=mode)
        train_data = reformat(train, conv1d) if train_pct > 0 else []
        val_data = reformat(val, conv1d) if val_pct > 0 else []
        test_data = reformat(test, conv1d) if test_pct > 0 else []
    return [train_data, val_data, test_data]
def main():
    args = parse_args()
    chrom_sizes = open(args.chrom_sizes, 'r').read().strip().split('\n')
    chrom_size_dict = {}
    for line in chrom_sizes:
        tokens = line.split('\t')
        chrom_size_dict[tokens[0]] = int(tokens[1])
    ref = pysam.FastaFile(args.ref_fasta)

    # load the model
    model = load_model_wrapper(args.model_hdf5)
    print("loaded model")

    # create the count & profile explainers
    model_wrapper = (model.input, model.outputs[1][:, 0:1])
    count_explainer = shap.DeepExplainer(
        model_wrapper,
        data=create_background_atac,
        combine_mult_and_diffref=combine_mult_and_diffref_atac)
    prof_explainer = create_explainer(model, ischip=False, task_index=0)
    print("made explainers")

    # read in the peaks
    peaks = pd.read_csv(args.peak_file, header=None, sep='\t')
    nrow = peaks.shape[0]
    tosample = round(int(args.npeaks_to_sample) / nrow, 2)
    peaks = peaks.sample(frac=tosample).reset_index(drop=True)
    nrow = peaks.shape[0]
    print("sampled peaks:" + str(nrow))

    # allocate space for numpy arrays for modisco
    hypothetical_profile_scores = np.empty((nrow, 2 * args.flank_size, 4))
    hypothetical_count_scores = np.empty((nrow, 2 * args.flank_size, 4))
    observed_profile_scores = np.empty((nrow, 2 * args.flank_size, 4))
    observed_count_scores = np.empty((nrow, 2 * args.flank_size, 4))
    seq = np.empty((nrow, 2 * args.flank_size, 4))
    print("pre-allocated output arrays")

    # generate one-hot-encoded inputs
    start_index = 0
    while start_index < nrow:
        cur_batch_size = min(args.batch_size, nrow - start_index)
        print(str(start_index) + ":" + str(start_index + cur_batch_size))
        batch_chroms = peaks[0][start_index:start_index + cur_batch_size].tolist()
        # peak start = interval start + summit offset - input flank, sliced to the current batch
        batch_start_pos = (peaks[1] + peaks[9] - args.seq_input_flank_size)[
            start_index:start_index + cur_batch_size].tolist()
        batch_start_pos = [max(0, i) for i in batch_start_pos]
        batch_start_pos = [
            min(batch_start_pos[i],
                chrom_size_dict[batch_chroms[i]] - 2 * args.seq_input_flank_size)
            for i in range(cur_batch_size)]
        seq_batch = [
            ref.fetch(batch_chroms[i],
                      batch_start_pos[i],
                      batch_start_pos[i] + 2 * args.seq_input_flank_size)
            for i in range(cur_batch_size)]
        if args.dinuc_shuffle_input is True:
            seq_batch = [dinuc_shuffle(i) for i in seq_batch]
        seq_batch = one_hot_encode(seq_batch)
        # central window of the full input flank, used for all outputs below
        center = slice(args.seq_input_flank_size - args.flank_size,
                       args.seq_input_flank_size + args.flank_size)
        seq[start_index:start_index + cur_batch_size, :, :] = seq_batch[:, center, :]

        # get the hypothetical scores for the batch
        hypothetical_profile_scores[start_index:start_index + cur_batch_size, :, :] = \
            prof_explainer(seq_batch, None)[:, center, :]
        observed_profile_scores[start_index:start_index + cur_batch_size, :, :] = \
            hypothetical_profile_scores[start_index:start_index + cur_batch_size, :, :] * seq_batch[:, center, :]
        hypothetical_count_scores[start_index:start_index + cur_batch_size, :, :] = \
            np.squeeze(count_explainer.shap_values(seq_batch)[0])[:, center, :]
        observed_count_scores[start_index:start_index + cur_batch_size, :, :] = \
            hypothetical_count_scores[start_index:start_index + cur_batch_size, :, :] * seq_batch[:, center, :]
        start_index += args.batch_size

    # save
    print("saving outputs")
    np.save(args.out_prefix + '.hyp.profile.npy', hypothetical_profile_scores)
    np.save(args.out_prefix + '.observed.profile.npy', observed_profile_scores)
    np.save(args.out_prefix + '.hyp.count.npy', hypothetical_count_scores)
    np.save(args.out_prefix + '.observed.count.npy', observed_count_scores)
    np.save(args.out_prefix + '.seq.npy', seq)
prom_seq = []
prom_shuf = []
label = []
for genome in os.listdir('/nam-99/ablage/nam/peleke/snp_promoters'):
    genome_path = '/nam-99/ablage/nam/peleke/snp_promoters/' + genome
    ecotype_id = genome.split('.')[0]
    genome_key = 'X' + ecotype_id
    if genome_key in genomekeys_to_normcounts:
        for rec in SeqIO.parse(genome_path, 'fasta'):
            ID = rec.id.split(':')[1]
            seq = str(rec.seq)
            if ID == 'AT1G01140':
                prom_seq.append(seq)
                prom_shuf.append(dinuc_shuffle(seq, rng=np.random.RandomState(seed=42)))
                if norm_counts[genome_key][ID] == 0:
                    label.append(0)
                else:
                    label.append(1)

print('Encoding sequences')
encoded_seq = np.array([one_hot(prom) for prom in prom_seq])
encoded_seq = np.expand_dims(encoded_seq, 3)
encoded_shuf_seq = np.array([one_hot(prom) for prom in prom_shuf])
encoded_shuf_seq = np.expand_dims(encoded_shuf_seq, 3)
categories = np_utils.to_categorical(label, 2)
model = models.load_model('/nam-99/ablage/nam/peleke/Thesis_models/model2020-10-06073328.h5')
predictions = np.argmax(model.predict(encoded_seq), axis=1)
def one_hot_encoder(seq):
    # `codes` (defined elsewhere) maps each nucleotide to a length-4 one-hot column
    one_hot_encoding = np.zeros(shape=(4, len(seq)))
    for i, nt in enumerate(seq):
        one_hot_encoding[:, i] = codes[nt]
    return one_hot_encoding

encoded_proms = []
encoded_shuff_proms = []
label = [0 if x < 1 else 1 for x in exp_count]
classes = np_utils.to_categorical(label, num_classes=2)
for promoter in proms_all_genomes:
    shuf_prom = dinuc_shuffle(promoter)
    encoded_proms.append(one_hot_encoder(promoter))
    encoded_shuff_proms.append(one_hot_encoder(shuf_prom))
prepared_proms = np.expand_dims(np.array(encoded_proms, dtype=np.float32), axis=3)
prepared_shuff_proms = np.expand_dims(np.array(encoded_shuff_proms, dtype=np.float32), axis=3)
print(prepared_proms.shape)
print(prepared_shuff_proms.shape)
print(classes.shape)
model = models.load_model('/nam-99/ablage/nam/peleke/Models/model2020-07-30150217.h5')
predictions = np.argmax(model.predict(prepared_proms), axis=1)
print(predictions)
actual = np.argmax(classes, axis=1)
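The `codes` lookup used by one_hot_encoder above is not defined in this snippet. A plausible definition, assuming a column-wise one-hot encoding over A/C/G/T; the actual table may differ (for example in how ambiguous bases such as N are handled):

# Hypothetical nucleotide-to-column lookup assumed by one_hot_encoder
codes = {
    'A': [1, 0, 0, 0],
    'C': [0, 1, 0, 0],
    'G': [0, 0, 1, 0],
    'T': [0, 0, 0, 1],
}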
tensorflow.set_random_seed(42)
data = pd.read_csv('Data.csv')
data.set_index('Gene_id', inplace=True)
geneIDs = []
categories = []
seqs = []
dinuc_shuf_seqs = []
for prec, trec in zip(SeqIO.parse('promoter.fa', 'fasta'),
                      SeqIO.parse('terminators.fa', 'fasta')):
    ID = prec.id
    pseq = str(prec.seq)
    pshuff = dinuc_shuffle(pseq)
    tseq = str(trec.seq)
    tshuff = dinuc_shuffle(tseq)
    seq = pseq + tseq
    shuff_seq = pshuff + tshuff
    if ID in data.index:
        seqs.append(seq)
        dinuc_shuf_seqs.append(shuff_seq)
        geneIDs.append(ID)
        category = data['label'][ID]
        if category == 'expressed':
            categories.append(1)