Example #1
def run_me(args):
    input_file = args.input_fa
    model_file_path = args.model_file_path
    n_jobs = args.n_jobs
    n_bg = args.n_bg
    nsamples = args.n_samples
    output_file_prefix = args.output_file_prefix
    tempfile = args.tempfile_prefix

    # Sequences are on the odd lines of a two-line-per-record FASTA
    sequences = [
        x.rstrip() for (i, x) in enumerate(open(input_file)) if i % 2 == 1
    ]

    shap_imp_scores = Parallel(n_jobs=n_jobs)(
        delayed(get_shap_explanation)(gkmsvm_model_path=model_file_path,
                                      seq=the_seq,
                                      background_seqs=[
                                          dinuc_shuffle.dinuc_shuffle(the_seq)
                                          for j in range(n_bg)
                                      ],
                                      tempfile=tempfile + str(i),
                                      nsamples=nsamples)
        for (i, the_seq) in enumerate(sequences))

    np.save(output_file_prefix, np.array(shap_imp_scores))
Example #2
def run_me(args):
    input_file = args.input_fa
    model_file_path = args.model_file_path
    n_jobs = args.n_jobs
    n_bg = args.n_bg 
    nsamples = args.n_samples
    output_file = args.output_file
    tempfile = args.tempfile_prefix

    # Headers (IDs) are on the even lines, sequences on the odd lines
    sequence_ids = [x.rstrip()[1:] for (i, x) in enumerate(open(input_file))
                    if i % 2 == 0]
    sequences = [x.rstrip() for (i, x) in enumerate(open(input_file))
                 if i % 2 == 1]

    shap_imp_scores = Parallel(n_jobs=n_jobs)(
        delayed(get_shap_explanation)(
            gkmsvm_model_path=model_file_path,
            seq=the_seq,
            background_seqs=[
                dinuc_shuffle.dinuc_shuffle(the_seq)
                for j in range(n_bg)],
            tempfile=tempfile + str(i),
            nsamples=nsamples)
        for (i, the_seq) in enumerate(sequences))

    out_fh = open(output_file, 'w')
    for sequence_id, score in zip(sequence_ids, shap_imp_scores):
        # Scores must have odd length so there is a unique central position
        assert len(score) % 2 == 1
        # Write the score at the central position, which is the SNP
        out_fh.write(sequence_id + "\t"
                     + str(score[len(score) // 2]) + "\n")
    out_fh.close()
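Both run_me variants above parse the FASTA purely by line parity, so they assume strictly two-line records with each full sequence on a single line. A minimal one-pass sketch of that parsing assumption (the file name here is hypothetical):

# Hedged sketch of the two-line-FASTA assumption; "example.fa" is a
# hypothetical file name.
sequence_ids, sequences = [], []
with open("example.fa") as fh:
    for i, line in enumerate(fh):
        if i % 2 == 0:
            sequence_ids.append(line.rstrip()[1:])  # even lines: ">" headers
        else:
            sequences.append(line.rstrip())         # odd lines: sequences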
Example #3
def create_background(inputs, bg_size=10, seed=1234):
    input_seq = inputs[0]
    if len(inputs) == 2:
        # Sequence input plus an auxiliary input that is tiled unchanged
        input_seq_bg = [
            np.empty((bg_size, ) + input_seq.shape),
            np.asarray(bg_size * [inputs[1]])
        ]
    elif len(inputs) == 1:
        input_seq_bg = [np.empty((bg_size, ) + input_seq.shape)]
    else:
        raise ValueError("expected one or two model inputs")
    rng = np.random.RandomState(seed)
    # Fill the background with dinucleotide-shuffled copies of the sequence
    for i in range(bg_size):
        input_seq_shuf = dinuc_shuffle(np.squeeze(input_seq), rng=rng)
        input_seq_bg[0][i] = np.expand_dims(input_seq_shuf, axis=0)
    return input_seq_bg
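create_background is designed to be handed to a SHAP explainer as a background generator; Example #9 below uses the same pattern for its count explainer. A minimal hook-up sketch, assuming `model` is an already-built Keras model and that the kundajelab fork of shap (which accepts a callable for `data`) is installed:

import shap

# Hedged sketch: `model` is an assumed Keras model; passing a callable as
# `data` relies on the kundajelab/shap fork used in these pipelines,
# not stock shap.
explainer = shap.DeepExplainer(model, data=create_background)
# shap_values = explainer.shap_values(one_hot_batch)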
Example #4
def test_dinuc_shuffle(self):
    for i in range(1000):
        random_sequence = "".join([['A', 'C', 'G', 'T'][int(random.random() * 4)]
                                   for _ in range(200)])
        shuffled_seq = dinuc_shuffle(random_sequence)
        print("sequences")
        print(random_sequence)
        print(shuffled_seq)
        orig_count = dinuc_count(random_sequence)
        shuffled_count = dinuc_count(shuffled_seq)
        print("counts")
        print(orig_count)
        print(shuffled_count)
        # A correct dinucleotide shuffle preserves every dinucleotide count
        assert len(orig_count.keys()) == len(shuffled_count.keys())
        for key in orig_count:
            assert orig_count[key] == shuffled_count[key]
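The test above relies on a dinuc_count helper that the snippet does not show; a minimal sketch consistent with how it is used (a dict of overlapping dinucleotide counts) might be:

def dinuc_count(seq):
    # Count overlapping dinucleotides, e.g. "ACGT" -> {"AC": 1, "CG": 1, "GT": 1}
    counts = {}
    for i in range(len(seq) - 1):
        dinuc = seq[i:i + 2]
        counts[dinuc] = counts.get(dinuc, 0) + 1
    return counts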
Example #6
from deeplift.dinuc_shuffle import dinuc_shuffle
from dragonn.utils import get_sequence_strings
import random
import numpy as np

import wget
url = "http://mitra.stanford.edu/kundaje/projects/dragonn/deep_lift_input_classification_spi1.npy"
wget.download(url)
deep_lift_input_classification_spi1 = np.load(
    "deep_lift_input_classification_spi1.npy")
print(deep_lift_input_classification_spi1.shape)
deep_lift_input_classification_spi1_strings = get_sequence_strings(
    deep_lift_input_classification_spi1)

all_matched = True
for i in range(len(deep_lift_input_classification_spi1)):
    random.seed(1234)
    shuffled_strings = dinuc_shuffle(
        deep_lift_input_classification_spi1_strings[i])
    random.seed(1234)
    shuffled_array = dinuc_shuffle(
        deep_lift_input_classification_spi1[i].squeeze())
    #decode the array
    shuffled_array = ''.join(
        get_sequence_strings(
            np.expand_dims(np.expand_dims(shuffled_array, axis=1), axis=1)))
    #make sure shuffling the string and numpy array gave the same output
    if shuffled_strings != shuffled_array:
        all_matched = False
        print("FAILED!")
if all_matched:
    print("TEST PASSED!")
Example #7
def shuffle_several_times(s):
    # Build a background of 20 dinucleotide shuffles of the sequence input,
    # tiling the second model input unchanged
    numshuffles = 20
    return [
        np.array([dinuc_shuffle(s[0]) for _ in range(numshuffles)]),
        np.array([s[1] for _ in range(numshuffles)])
    ]
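shuffle_several_times expects a two-element input list (the sequence plus an auxiliary model input) and returns a matched background for each. A self-contained usage sketch with placeholder inputs:

import numpy as np

# Hedged usage sketch: a random one-hot sequence and a zero auxiliary input
# stand in for real model inputs; dinuc_shuffle here is assumed to accept
# one-hot arrays, as demonstrated in Example #6.
rng = np.random.RandomState(0)
onehot_seq = np.eye(4)[rng.randint(4, size=100)]  # shape (100, 4)
bias_input = np.zeros(10)                         # hypothetical second input
background = shuffle_several_times([onehot_seq, bias_input])
print(background[0].shape)  # (20, 100, 4)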
Example #8
def read_data_sets(train_pct=80, val_pct=10, test_pct=10, mode='random',
                   conv1d=False, add_shuffle=False, seq_only=False,
                   seq_file='../data/yeast_promoters.txt',
                   expr_file='../data/complete_dataset.txt',
                   reg_names_file='../data/reg_names_R.txt'):
	'''Read data from text files into numpy arrays.

		Args:
		train_pct, val_pct, test_pct: percentage split between train/validation/test.
		mode: partitioning mode ('random' or 'whole_gene').
		conv1d: reshape sequences for 1D convolution.
		add_shuffle: add dinucleotide-shuffled sequences as negative examples.
		seq_only: use promoter sequence as the only input.
		seq_file: promoter sequence file.
		expr_file: expression values file.
		reg_names_file: regulator gene names file.

		Returns:
		seq: promoter sequences (one-hot encoding).
		reg_expr: regulator gene expression (1D array).
		label: gene expression (scalar).
	'''
	# Read data:
	reg_names = pd.read_table(reg_names_file, names=["UID"])
	expr_data = pd.read_table(expr_file).fillna(0).drop("GWEIGHT", axis=1)
	expr_data['NAME'] = [x[10:21].strip() for x in expr_data['NAME']]
	expr_data[expr_data.columns[2:]] = expr_data[expr_data.columns[2:]].astype('float32')
	promoters = pd.read_table(seq_file, names=["UID", "sequence"])
	promoters['real'] = 1  # denotes real promoters
	promoters.loc[:, "one_hot_sequence"] = [one_hot(seq) for seq in promoters.loc[:, "sequence"]]


	# Create dinucleotide shuffled sequences:
	if add_shuffle:
		random.seed(a=42)
		shuffled_promoters = []
		for i in range(promoters.shape[0]):
			shuffled_promoters.append(dinuc_shuffle.dinuc_shuffle(promoters.sequence[i]))
		promoters = pd.concat([promoters, pd.DataFrame(
			{'UID': promoters.UID.tolist(), 'sequence': shuffled_promoters, 'real': 0})])


	if seq_only:
		data_complete = pd.merge(promoters, expr_data, on='UID', how='inner')

		experiment_name = expr_data.drop(['UID', 'NAME'], axis=1).columns.tolist()

		train, val, test = partition(data_complete, (train_pct, val_pct, test_pct), mode=mode)

		train_data = reformat(train, conv1d, seq_only, experiment_name) if train_pct > 0 else []
		val_data = reformat(val, conv1d, seq_only, experiment_name) if val_pct > 0 else []
		test_data = reformat(test, conv1d, seq_only, experiment_name) if test_pct > 0 else []

	else:
		# Some transformation:
		target_expr_data = pd.melt(expr_data, id_vars=["UID", "NAME"],
			var_name="experiment", value_name="expression")
		reg_data = pd.merge(reg_names, expr_data, on="UID", how="inner").drop("UID", axis=1)

		reg = pd.DataFrame()
		for col in range(len(reg_data.columns)):
			data = np.array([exp_level for exp_level in reg_data.iloc[:, col]])
			reg = pd.concat([reg, pd.DataFrame(
				{"experiment": reg_data.columns[col], "reg_exp": [data]})])

		data_complete = pd.merge(promoters, target_expr_data, on="UID", how="inner").merge(
			reg, on="experiment", how="inner")
		# Force dinucleotide-shuffled sequences to have 0 expression
		data_complete.expression = data_complete.expression * data_complete.real

		train, val, test = partition(data_complete, (train_pct, val_pct, test_pct), mode=mode)

		train_data = reformat(train, conv1d) if train_pct > 0 else []
		val_data = reformat(val, conv1d) if val_pct > 0 else []
		test_data = reformat(test, conv1d) if test_pct > 0 else []

	return [train_data, val_data, test_data]
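A hedged usage sketch, assuming the default ../data/ files listed in the signature are present:

# Hedged usage sketch: relies on the default data files being available.
train_data, val_data, test_data = read_data_sets(
    train_pct=80, val_pct=10, test_pct=10, mode='random', add_shuffle=True)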
Example #9
def main():
    args = parse_args()
    chrom_sizes = open(args.chrom_sizes, 'r').read().strip().split('\n')
    chrom_size_dict = {}
    for line in chrom_sizes:
        tokens = line.split('\t')
        chrom_size_dict[tokens[0]] = int(tokens[1])

    ref = pysam.FastaFile(args.ref_fasta)
    # load the model
    model = load_model_wrapper(args.model_hdf5)
    print("loaded model")
    # create the count & profile explainers
    model_wrapper = (model.input, model.outputs[1][:, 0:1])
    count_explainer = shap.DeepExplainer(
        model_wrapper,
        data=create_background_atac,
        combine_mult_and_diffref=combine_mult_and_diffref_atac)
    prof_explainer = create_explainer(model, ischip=False, task_index=0)
    print("made explainers")
    #read in the peaks
    peaks = pd.read_csv(args.peak_file, header=None, sep='\t')
    nrow = peaks.shape[0]
    tosample = round(int(args.npeaks_to_sample) / nrow, 2)
    peaks = peaks.sample(frac=tosample).reset_index(drop=True)
    nrow = peaks.shape[0]
    print("sampled peaks:" + str(nrow))

    #allocate space for numpy arrays for modisco
    hypothetical_profile_scores = np.empty((nrow, 2 * args.flank_size, 4))
    hypothetical_count_scores = np.empty((nrow, 2 * args.flank_size, 4))
    observed_profile_scores = np.empty((nrow, 2 * args.flank_size, 4))
    observed_count_scores = np.empty((nrow, 2 * args.flank_size, 4))
    seq = np.empty((nrow, 2 * args.flank_size, 4))
    print("pre-allocted output arrays")

    #generate one-hot-encoded inputs
    # Slice bounds for trimming model-input flanks down to output flanks
    lo = args.seq_input_flank_size - args.flank_size
    hi = args.seq_input_flank_size + args.flank_size
    start_index = 0
    while start_index < nrow:
        cur_batch_size = min(args.batch_size, nrow - start_index)
        end_index = start_index + cur_batch_size
        print(str(start_index) + ":" + str(end_index))

        batch_chroms = peaks[0][start_index:end_index].tolist()
        # Center each window on the peak summit (column 9 holds the summit
        # offset relative to the peak start in column 1)
        batch_start_pos = (peaks[1] + peaks[9] -
                           args.seq_input_flank_size)[start_index:end_index].tolist()
        # Clip windows that would run off either end of the chromosome
        batch_start_pos = [max(0, p) for p in batch_start_pos]
        batch_start_pos = [
            min(batch_start_pos[i],
                chrom_size_dict[batch_chroms[i]] - 2 * args.seq_input_flank_size)
            for i in range(cur_batch_size)
        ]
        seq_batch = [
            ref.fetch(batch_chroms[i], batch_start_pos[i],
                      batch_start_pos[i] + 2 * args.seq_input_flank_size)
            for i in range(cur_batch_size)
        ]
        if args.dinuc_shuffle_input:
            seq_batch = [dinuc_shuffle(s) for s in seq_batch]
        seq_batch = one_hot_encode(seq_batch)

        trimmed_seq = seq_batch[:, lo:hi, :]
        seq[start_index:end_index, :, :] = trimmed_seq
        #get the hypothetical scores for the batch
        hypothetical_profile_scores[start_index:end_index, :, :] = \
            prof_explainer(seq_batch, None)[:, lo:hi, :]
        observed_profile_scores[start_index:end_index, :, :] = \
            hypothetical_profile_scores[start_index:end_index, :, :] * trimmed_seq
        hypothetical_count_scores[start_index:end_index, :, :] = \
            np.squeeze(count_explainer.shap_values(seq_batch)[0])[:, lo:hi, :]
        observed_count_scores[start_index:end_index, :, :] = \
            hypothetical_count_scores[start_index:end_index, :, :] * trimmed_seq
        start_index += args.batch_size
    #save
    print("saving outputs")
    np.save(args.out_prefix + '.hyp.profile.npy', hypothetical_profile_scores)
    np.save(args.out_prefix + '.observed.profile.npy', observed_profile_scores)
    np.save(args.out_prefix + '.hyp.count.npy', hypothetical_count_scores)
    np.save(args.out_prefix + '.observed.count.npy', observed_count_scores)
    np.save(args.out_prefix + '.seq.npy', seq)
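parse_args() is not shown, so the exact flag names are unknown; main() only requires that the parsed namespace expose the attributes read above. A hedged sketch of those attributes with hypothetical values:

from argparse import Namespace

# Hedged sketch: attribute names are the ones main() reads; all values are
# hypothetical placeholders.
args = Namespace(
    chrom_sizes="hg38.chrom.sizes", ref_fasta="hg38.fa",
    model_hdf5="model.h5", peak_file="peaks.bed",
    npeaks_to_sample="5000", flank_size=500,
    seq_input_flank_size=673, batch_size=100,
    dinuc_shuffle_input=False, out_prefix="shap_out")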
Example #10
prom_seq = []
prom_shuf = []
label = []
for genome in os.listdir('/nam-99/ablage/nam/peleke/snp_promoters'):
    genome_path = '/nam-99/ablage/nam/peleke/snp_promoters/' + genome
    ecotype_id = genome.split('.')[0]
    genome_key = 'X' + ecotype_id

    if genome_key in genomekeys_to_normcounts:
        for rec in SeqIO.parse(genome_path, 'fasta'):
            ID = rec.id.split(':')[1]
            seq = str(rec.seq)

            if ID == 'AT1G01140':
                prom_seq.append(seq)
                prom_shuf.append(dinuc_shuffle(seq, rng=np.random.RandomState(seed=42)))
                if norm_counts[genome_key][ID] == 0:
                    label.append(0)
                else:
                    label.append(1)


print('Encoding sequences')
encoded_seq = np.array([one_hot(prom) for prom in prom_seq])
encoded_seq = np.expand_dims(encoded_seq, 3)
encoded_shuf_seq = np.array([one_hot(prom) for prom in prom_shuf])
encoded_shuf_seq = np.expand_dims(encoded_shuf_seq, 3)
categories = np_utils.to_categorical(label, 2)

model = models.load_model('/nam-99/ablage/nam/peleke/Thesis_models/model2020-10-06073328.h5')
predictions = np.argmax(model.predict(encoded_seq), axis=1)
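The one_hot helper used above is not shown; since the result is expanded on axis 3, one_hot must return a length-by-4 array. A minimal sketch consistent with that (the helper itself is an assumption):

import numpy as np

def one_hot(seq):
    # Hedged sketch of the assumed helper: (len(seq), 4) with A, C, G, T
    # columns; unrecognized bases stay all-zero.
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    arr = np.zeros((len(seq), 4), dtype=np.float32)
    for i, base in enumerate(seq.upper()):
        if base in mapping:
            arr[i, mapping[base]] = 1.0
    return arr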
Example #11

# Assumed nucleotide code table (not shown in the original snippet)
codes = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0],
         'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}


def one_hot_encoder(seq):
    one_hot_encoding = np.zeros(shape=(4, len(seq)))
    for i, nt in enumerate(seq):
        one_hot_encoding[:, i] = codes[nt]
    return one_hot_encoding


encoded_proms = []
encoded_shuff_proms = []
label = [0 if x < 1 else 1 for x in exp_count]
classes = np_utils.to_categorical(label, num_classes=2)

for promoter in proms_all_genomes:
    shuf_prom = dinuc_shuffle(promoter)
    encoded_proms.append(one_hot_encoder(promoter))
    encoded_shuff_proms.append(one_hot_encoder(shuf_prom))

prepared_proms = np.expand_dims(np.array(encoded_proms, dtype=np.float32), axis=3)
prepared_shuff_proms = np.expand_dims(np.array(encoded_shuff_proms, dtype=np.float32), axis=3)
print(prepared_proms.shape)
print(prepared_shuff_proms.shape)
print(classes.shape)


model = models.load_model('/nam-99/ablage/nam/peleke/Models/model2020-07-30150217.h5')

predictions = np.argmax(model.predict(prepared_proms), axis=1)
print(predictions)
actual = np.argmax(classes, axis=1)
tensorflow.set_random_seed(42)

data = pd.read_csv('Data.csv')
data.set_index('Gene_id', inplace=True)

geneIDs = []
categories = []
seqs = []
dinuc_shuf_seqs = []

for prec, trec in zip(SeqIO.parse('promoter.fa', 'fasta'),
                      SeqIO.parse('terminators.fa', 'fasta')):
    ID = prec.id

    pseq = str(prec.seq)
    pshuff = dinuc_shuffle(pseq)

    tseq = str(trec.seq)
    tshuff = dinuc_shuffle(tseq)

    seq = pseq + tseq
    shuff_seq = pshuff + tshuff

    if ID in data.index:
        seqs.append(seq)
        dinuc_shuf_seqs.append(shuff_seq)
        geneIDs.append(ID)

        category = data['label'][ID]
        if category == 'expressed':
            categories.append(1)
        else:
            # Non-expressed genes get label 0, matching the binary labels above
            categories.append(0)