def simulate_heterodimer_grammar(motif1, motif2, seq_length, min_spacing,
                                 max_spacing, num_pos, num_neg, GC_fraction):
    """
    Simulates two classes of sequences that both contain motif1 and motif2.

    - Positive class: motif1 and motif2 are embedded as a pair whose
      inter-motif spacing is drawn between min_spacing and max_spacing.
    - Negative class: sequences obtained from
      simulate_multi_motif_embedding for the same two motifs, i.e. the
      motifs are placed independently rather than as a heterodimer grammar.

    Parameters
    ----------
    motif1 : str
        encode motif name
    motif2 : str
        encode motif name
    seq_length : int
        length of sequence
    min_spacing : int
        minimum inter motif spacing
    max_spacing : int
        maximum inter motif spacing
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings, positives first and negatives after.
    y : ndarray
        Boolean class labels built from one single-element row per
        sequence (True for positives, False for negatives).
    embedding_arr : list
        List of embedding objects, in the same order as sequence_arr.
    """
    import simdna
    from simdna import synthetic

    encode_motifs = synthetic.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)

    # Each motif is sampled from its PWM through a reverse-complement wrapper.
    first_sampler = synthetic.ReverseComplementWrapper(
        synthetic.PwmSamplerFromLoadedMotifs(encode_motifs, motif1))
    second_sampler = synthetic.ReverseComplementWrapper(
        synthetic.PwmSamplerFromLoadedMotifs(encode_motifs, motif2))
    spacing_sampler = synthetic.UniformIntegerGenerator(
        min_spacing, max_spacing)

    pair_generator = synthetic.PairEmbeddableGenerator(
        first_sampler, second_sampler, spacing_sampler)
    pair_embedder = synthetic.EmbeddableEmbedder(pair_generator)
    background = synthetic.ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = synthetic.EmbedInABackground(
        background, [pair_embedder])

    # Positives are generated before the negatives are requested.
    positives = tuple(
        synthetic.GenerateSequenceNTimes(
            sequence_source, num_pos).generateSequences())
    grammar_sequence_arr = np.array([item.seq for item in positives])
    positive_embeddings = [item.embeddings for item in positives]

    # The middle element of the returned triple is not used here.
    (nongrammar_sequence_arr,
     _unused,
     negative_embeddings) = simulate_multi_motif_embedding(
         [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)

    sequence_arr = np.concatenate(
        (grammar_sequence_arr, nongrammar_sequence_arr))
    label_rows = [[True]] * num_pos + [[False]] * num_neg
    y = np.array(label_rows)
    embedding_arr = positive_embeddings + negative_embeddings
    return sequence_arr, y, embedding_arr
def test_simple_motif_grammar(self):
    """Check placement and spacing of a motif pair embedded as a grammar.

    Generates sequences carrying a SIX5_known5 / ZNF143_known2 pair with a
    separation drawn by UniformIntegerGenerator(min_sep, max_sep), then
    verifies every embedding and the empirical separation distribution.
    """
    seq_len = 100
    min_sep = 2
    max_sep = 6
    random.seed(1234)
    np.random.seed(1234)
    num_sequences = 4000

    loaded_motifs = sn.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
    first_sampler = sn.PwmSamplerFromLoadedMotifs(
        loaded_motifs, "SIX5_known5")
    second_sampler = sn.PwmSamplerFromLoadedMotifs(
        loaded_motifs, "ZNF143_known2")
    spacing_sampler = sn.UniformIntegerGenerator(min_sep, max_sep)
    pair_embedder = sn.EmbeddableEmbedder(
        sn.PairEmbeddableGenerator(
            first_sampler, second_sampler, spacing_sampler))
    sequence_source = sn.EmbedInABackground(
        sn.ZeroOrderBackgroundGenerator(seq_len), [pair_embedder])
    generated_seqs = list(
        sn.GenerateSequenceNTimes(
            sequence_source, num_sequences).generateSequences())

    separation_counts = defaultdict(int)
    for generated in generated_seqs:
        assert len(generated.seq) == seq_len

        # The first two embeddings are read as the individual motifs and
        # the third as the pair object carrying the separation.
        first = generated.embeddings[0]
        second = generated.embeddings[1]
        pair = generated.embeddings[2]
        separation = pair.what.separation

        assert len(first.what) == len(first.what.string)
        assert len(second.what) == len(second.what.string)
        assert len(pair.what) == (
            len(first.what) + len(second.what) + separation)

        # the string of the first motif is placed correctly
        first_end = first.startPos + len(first.what)
        assert generated.seq[first.startPos:first_end] == first.what.string
        # the string of the second motif is placed correctly
        second_end = second.startPos + len(second.what)
        assert (generated.seq[second.startPos:second_end]
                == second.what.string)

        # the gap between the two motifs equals the recorded separation
        gap = second.startPos - (first.startPos + len(first.what.string))
        assert gap == separation

        # separation is within the requested limits (inclusive)
        assert min_sep <= separation <= max_sep

        # log the separation; the distribution is tested below
        separation_counts[separation] += 1

    expected_fraction = 1.0 / (max_sep - min_sep + 1)
    for possible_sep in range(min_sep, max_sep + 1):
        observed_fraction = (
            separation_counts[possible_sep] / float(num_sequences))
        np.testing.assert_almost_equal(
            observed_fraction, expected_fraction, 2)
def test_multi_motif_embedding(self):
    """Check RandomSubsetOfEmbedders over five motifs.

    Verifies that each embedded string matches the sequence, that no motif
    is embedded twice in one sequence, that the labels agree with the
    embedded motifs, and that both the subset size and the motif choice
    look uniform.
    """
    motif_names = ["CTCF_known1", "IRF_known1", "SPI1_known1",
                   "CTCF_known2", "CTCF_disc1"]
    min_selected_motifs = 1
    max_selected_motifs = 4

    loaded_motifs = sn.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
    position_generator = sn.UniformPositionGenerator()
    embedders = []
    for motif_name in motif_names:
        sampler = sn.PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
        embedders.append(
            sn.SubstringEmbedder(
                sampler, position_generator, name=motif_name))

    quantity_generator = sn.UniformIntegerGenerator(
        min_selected_motifs, max_selected_motifs)
    combined_embedder = [
        sn.RandomSubsetOfEmbedders(quantity_generator, embedders)]
    background = sn.ZeroOrderBackgroundGenerator(
        300,
        discreteDistribution={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
    sequence_source = sn.EmbedInABackground(background, combined_embedder)
    generated_sequences = tuple(
        sn.GenerateSequenceNTimes(
            sequence_source, 8000).generateSequences())

    sequence_arr = np.array([item.seq for item in generated_sequences])
    label_generator = sn.IsInTraceLabelGenerator(np.array(motif_names))
    y = np.array([label_generator.generateLabels(item)
                  for item in generated_sequences]).astype(bool)
    embedding_arr = [item.embeddings for item in generated_sequences]

    num_embeddings_count = defaultdict(int)
    for seq, labels, embeddings in zip(sequence_arr, y, embedding_arr):
        num_embeddings_count[len(embeddings)] += 1
        motifs_embedded = set()
        for embedding in embeddings:
            # the embedded string must appear at its recorded position
            stop = embedding.startPos + len(embedding.what.string)
            assert embedding.what.string == seq[embedding.startPos:stop]
            motifs_embedded.add(embedding.what.getDescription())
        # non-redundant: every embedding has a distinct description
        assert len(motifs_embedded) == len(embeddings)
        # each label is set exactly when its motif was embedded
        for motif_idx, motif_name in enumerate(motif_names):
            assert labels[motif_idx] == (motif_name in motifs_embedded)

    # the number selected should be drawn from a uniform distribution
    expected_size_fraction = (
        1.0 / (max_selected_motifs - min_selected_motifs + 1))
    total_sequences = float(len(sequence_arr))
    for num_selected_motifs in range(min_selected_motifs,
                                     max_selected_motifs + 1):
        np.testing.assert_almost_equal(
            num_embeddings_count[num_selected_motifs] / total_sequences,
            expected_size_fraction, 2)

    # there also shouldn't be a preference for any one motif over others
    np.testing.assert_almost_equal(
        np.sum(y, axis=0) / float(np.sum(y)),
        1.0 / len(motif_names), 2)
def simple_motif_embedding(motif_name, seq_length, num_seqs, GC_fraction):
    """
    Simulates sequences with a motif embedded anywhere in the sequence.

    When motif_name is None no embedder is used, so only background
    sequence is generated.

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    num_seqs: int
        number of sequences
    GC_fraction : float
        GC basepair fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    embedding_arr: list
        One entry per sequence holding that sequence's embedding objects.
    """
    import simdna
    from simdna import synthetic

    embedders = []
    if motif_name is not None:
        motif_collection = synthetic.LoadedEncodeMotifs(
            simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
        pwm_sampler = synthetic.PwmSamplerFromLoadedMotifs(
            motif_collection, motif_name)
        embedders.append(
            synthetic.SubstringEmbedder(
                synthetic.ReverseComplementWrapper(pwm_sampler)))

    background = synthetic.ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = synthetic.EmbedInABackground(background, embedders)
    generated = tuple(
        synthetic.GenerateSequenceNTimes(
            sequence_source, num_seqs).generateSequences())

    sequence_arr = np.array([item.seq for item in generated])
    embedding_arr = [item.embeddings for item in generated]
    return sequence_arr, embedding_arr
def test_central_positions(self):
    """Check that InsideCentralBp(30) places motifs only in the centre.

    For a 50bp background the test expects start positions from index 10
    up to (40 - motif_length) inclusive, each equally likely.
    """
    pseudocount_prob = 0.001
    pwm_name = "CTCF_known1"
    num_sequences = 10000
    sequence_length = 50
    central_bp = 30

    loaded_motifs = sn.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=pseudocount_prob)
    pwm_sampler = sn.PwmSamplerFromLoadedMotifs(loaded_motifs, pwm_name)
    central_positions = sn.InsideCentralBp(central_bp)
    embedders = [sn.SubstringEmbedder(pwm_sampler, central_positions)]
    background = sn.ZeroOrderBackgroundGenerator(
        sequence_length,
        discreteDistribution={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
    sequence_source = sn.EmbedInABackground(background, embedders)
    generated_sequences = list(
        sn.GenerateSequenceNTimes(
            sequence_source, num_sequences).generateSequences())

    motif_length = len(loaded_motifs.getPwm(pwm_name).getRows())
    # one counter per start position at which the motif still fits
    start_pos_count = np.zeros(sequence_length - motif_length + 1)
    for generated in generated_sequences:
        assert len(generated.seq) == sequence_length
        for embedding in generated.embeddings:
            begin = embedding.startPos
            motif_string = embedding.what.string
            assert (motif_string
                    == generated.seq[begin:begin + len(motif_string)])
            start_pos_count[begin] += 1
    # turn the counts into per-position frequencies
    start_pos_count = start_pos_count / float(len(generated_sequences))

    expected_start_pos_count = np.zeros_like(start_pos_count).astype(
        "float32")
    # expect the motif to start only where it lies fully inside the
    # central 30bp window, i.e. the slice [10, 40 - motif_length]
    uniform_mass = 1.0 / (30.0 - motif_length + 1)
    expected_start_pos_count[10:(40 - motif_length + 1)] = uniform_mass
    np.testing.assert_almost_equal(
        start_pos_count, expected_start_pos_count, 2)
def do(options):
    """Generate the density-embedding simulation described by `options`.

    Seeds numpy and simdna's random module when options.seed is given,
    builds one RepeatedEmbedder per name in options.motifNames, and hands
    the resulting sequence set to synthetic.printSequences with an output
    name of the form "<core>.simdata", where <core> is produced by
    util.addArguments from the option values.
    """
    seed = options.seed
    if seed is not None:
        import numpy as np
        np.random.seed(seed)
        from simdna import random
        random.seed(seed)

    # The option values are folded into the output file name.
    name_arguments = [
        util.ArgumentToAdd(options.prefix, "prefix"),
        util.ArrArgument(options.motifNames, "motifs"),
        util.ArgumentToAdd(options.min_motifs, "min"),
        util.ArgumentToAdd(options.max_motifs, "max"),
        util.ArgumentToAdd(options.mean_motifs, "mean"),
        util.FloatArgument(options.zero_prob, "zeroProb"),
        util.ArgumentToAdd(options.seqLength, "seqLength"),
        util.ArgumentToAdd(options.posSdevInBp, "posSdevInBp"),
        util.ArgumentToAdd(options.numSeqs, "numSeqs"),
    ]
    output_core = util.addArguments("DensityEmbedding", name_arguments)

    loadedMotifs = synthetic.LoadedEncodeMotifs(
        options.pathToMotifs, pseudocountProb=0.001)

    def repeated_embedder_for(motifName):
        # Motif sampled from its PWM, wrapped with
        # reverseComplementProb=0.5.
        strand_sampler = synthetic.ReverseComplementWrapper(
            substringGenerator=synthetic.PwmSamplerFromLoadedMotifs(
                loadedMotifs=loadedMotifs, motifName=motifName),
            reverseComplementProb=0.5)
        position_sampler = (
            synthetic.positiongen.NormalDistributionPositionGenerator(
                stdInBp=options.posSdevInBp))
        single_embedder = synthetic.SubstringEmbedder(
            strand_sampler, positionGenerator=position_sampler)
        # Poisson count wrapped by MinMaxWrapper, then by ZeroInflater.
        count_sampler = synthetic.ZeroInflater(
            synthetic.MinMaxWrapper(
                synthetic.PoissonQuantityGenerator(options.mean_motifs),
                theMax=options.max_motifs,
                theMin=options.min_motifs),
            zeroProb=options.zero_prob)
        return synthetic.RepeatedEmbedder(
            single_embedder, quantityGenerator=count_sampler)

    background = synthetic.ZeroOrderBackgroundGenerator(
        seqLength=options.seqLength)
    motif_embedders = [
        repeated_embedder_for(motifName)
        for motifName in options.motifNames
    ]
    embedInBackground = synthetic.EmbedInABackground(
        backgroundGenerator=background, embedders=motif_embedders)

    sequenceSet = synthetic.GenerateSequenceNTimes(
        embedInBackground, options.numSeqs)
    synthetic.printSequences(
        output_core + ".simdata", sequenceSet,
        includeFasta=True, includeEmbeddings=True,
        prefix=options.prefix)
def motif_density(motif_name, seq_length, num_seqs, min_counts, max_counts,
                  GC_fraction, central_bp=None):
    """
    Returns sequences with motif density, along with embeddings array.

    The motif is embedded repeatedly in each sequence; the number of
    repeats comes from UniformIntegerGenerator(min_counts, max_counts).
    When central_bp is given, positions are drawn with
    InsideCentralBp(central_bp); otherwise UniformPositionGenerator is
    used.

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    embedding_arr : list
        One entry per sequence holding that sequence's embedding objects.
    """
    import simdna
    from simdna import synthetic

    motif_collection = synthetic.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
    pwm_sampler = synthetic.PwmSamplerFromLoadedMotifs(
        motif_collection, motif_name)
    position_sampler = (
        synthetic.UniformPositionGenerator() if central_bp is None
        else synthetic.InsideCentralBp(central_bp))
    count_sampler = synthetic.UniformIntegerGenerator(
        min_counts, max_counts)

    single_embedder = synthetic.SubstringEmbedder(
        synthetic.ReverseComplementWrapper(pwm_sampler), position_sampler)
    embedders = [synthetic.RepeatedEmbedder(single_embedder, count_sampler)]

    background = synthetic.ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = synthetic.EmbedInABackground(background, embedders)
    generated = tuple(
        synthetic.GenerateSequenceNTimes(
            sequence_source, num_seqs).generateSequences())

    sequence_arr = np.array([item.seq for item in generated])
    embedding_arr = [item.embeddings for item in generated]
    return sequence_arr, embedding_arr
def test_uniform_positions(self):
    """Check that UniformPositionGenerator spreads start positions evenly.

    Embeds CTCF_known1 once per 50bp sequence and compares the observed
    start-position frequencies with a flat distribution over every start
    position at which the motif fits.
    """
    pseudocount_prob = 0.001
    pwm_name = "CTCF_known1"
    num_sequences = 10000
    sequence_length = 50

    loaded_motifs = sn.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=pseudocount_prob)
    pwm_sampler = sn.PwmSamplerFromLoadedMotifs(loaded_motifs, pwm_name)
    uniform_positions = sn.UniformPositionGenerator()
    embedders = [sn.SubstringEmbedder(pwm_sampler, uniform_positions)]
    background = sn.ZeroOrderBackgroundGenerator(
        sequence_length,
        discreteDistribution={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
    sequence_source = sn.EmbedInABackground(background, embedders)
    generated_sequences = list(
        sn.GenerateSequenceNTimes(
            sequence_source, num_sequences).generateSequences())

    motif_length = len(loaded_motifs.getPwm(pwm_name).getRows())
    # one counter per start position at which the motif still fits
    start_pos_count = np.zeros(sequence_length - motif_length + 1)
    for generated in generated_sequences:
        assert len(generated.seq) == sequence_length
        for embedding in generated.embeddings:
            begin = embedding.startPos
            motif_string = embedding.what.string
            assert (motif_string
                    == generated.seq[begin:begin + len(motif_string)])
            start_pos_count[begin] += 1
    # turn the counts into per-position frequencies
    start_pos_count = start_pos_count / float(len(generated_sequences))

    flat_frequency = 1.0 / len(start_pos_count)
    np.testing.assert_almost_equal(start_pos_count, flat_frequency, 2)
def test_density_motif_embedding(self):
    """Check repeated embedding of CTCF_known1 with strand flipping.

    Verifies the loaded PWM against hard-coded reference values, then
    rebuilds forward and reverse-complement PWMs from the embedded strings
    and checks them, the forward/reverse ratio, and the distribution of
    the number of embeddings per sequence.
    """
    random.seed(1234)
    np.random.seed(1234)
    min_counts = 2
    max_counts = 5
    pseudocount_prob = 0.001
    pwm_name = "CTCF_known1"
    num_sequences = 5000

    loaded_motifs = sn.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=pseudocount_prob)
    pwm_sampler = sn.PwmSamplerFromLoadedMotifs(loaded_motifs, pwm_name)
    position_generator = sn.UniformPositionGenerator()
    quantity_generator = sn.UniformIntegerGenerator(min_counts, max_counts)
    single_embedder = sn.SubstringEmbedder(
        sn.ReverseComplementWrapper(pwm_sampler), position_generator)
    embedders = [sn.RepeatedEmbedder(single_embedder, quantity_generator)]
    background = sn.ZeroOrderBackgroundGenerator(
        500,
        discreteDistribution={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
    sequence_source = sn.EmbedInABackground(background, embedders)
    generated_sequences = list(
        sn.GenerateSequenceNTimes(
            sequence_source, num_sequences).generateSequences())
    assert len(generated_sequences) == num_sequences

    # Reference probabilities, one row per motif position; columns follow
    # the A, C, G, T order of letter_to_index below.
    actual_pwm = np.array([
        [0.095290, 0.318729, 0.083242, 0.502738],
        [0.182913, 0.158817, 0.453450, 0.204819],
        [0.307777, 0.053669, 0.491785, 0.146769],
        [0.061336, 0.876232, 0.023001, 0.039430],
        [0.008762, 0.989047, 0.000000, 0.002191],
        [0.814896, 0.014239, 0.071194, 0.099671],
        [0.043812, 0.578313, 0.365827, 0.012048],
        [0.117325, 0.474781, 0.052632, 0.355263],
        [0.933114, 0.012061, 0.035088, 0.019737],
        [0.005488, 0.000000, 0.991218, 0.003293],
        [0.365532, 0.003293, 0.621295, 0.009879],
        [0.059276, 0.013172, 0.553238, 0.374314],
        [0.013187, 0.000000, 0.978022, 0.008791],
        [0.061538, 0.008791, 0.851648, 0.078022],
        [0.114411, 0.806381, 0.005501, 0.073707],
        [0.409241, 0.014301, 0.557756, 0.018702],
        [0.090308, 0.530837, 0.338106, 0.040749],
        [0.128855, 0.354626, 0.080396, 0.436123],
        [0.442731, 0.199339, 0.292952, 0.064978],
    ])
    # apply the pseudocount mixing before comparing with the loaded PWM
    actual_pwm = actual_pwm * (1 - pseudocount_prob) + pseudocount_prob / 4
    np.testing.assert_almost_equal(np.sum(actual_pwm, axis=-1), 1.0, 6)
    np.testing.assert_almost_equal(
        actual_pwm, np.array(loaded_motifs.getPwm(pwm_name).getRows()))

    letter_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    reconstructed_pwm_fwd = np.zeros_like(actual_pwm)
    reconstructed_pwm_rev = np.zeros_like(actual_pwm)
    quantity_distribution = defaultdict(int)
    total_fwd_embeddings = 0.0
    total_rev_embeddings = 0.0
    for generated in generated_sequences:
        embeddings = generated.embeddings
        quantity_distribution[len(embeddings)] += 1
        for embedding in embeddings:
            motif_string = embedding.what.string
            begin = embedding.startPos
            assert (motif_string
                    == generated.seq[begin:begin + len(motif_string)])
            # the description marks reverse-complemented embeddings
            if 'revComp' in embedding.what.getDescription():
                total_rev_embeddings += 1
                tally = reconstructed_pwm_rev
            else:
                total_fwd_embeddings += 1
                tally = reconstructed_pwm_fwd
            for char_idx, char in enumerate(motif_string):
                tally[char_idx][letter_to_index[char]] += 1

    total_embeddings = total_fwd_embeddings + total_rev_embeddings
    np.testing.assert_almost_equal(
        total_fwd_embeddings / total_embeddings, 0.5, 2)

    # turn the per-position letter counts into frequencies
    reconstructed_pwm_fwd = reconstructed_pwm_fwd / total_fwd_embeddings
    reconstructed_pwm_rev = reconstructed_pwm_rev / total_rev_embeddings
    np.testing.assert_almost_equal(actual_pwm, reconstructed_pwm_fwd, 2)
    # reversing both axes maps the reverse-strand counts back onto the
    # forward PWM (position order and A<->T / C<->G columns flipped)
    np.testing.assert_almost_equal(
        actual_pwm, reconstructed_pwm_rev[::-1, ::-1], 2)

    # test the quantities of motifs were sampled uniformly
    expected_fraction = 1.0 / (max_counts - min_counts + 1)
    for quantity in range(min_counts, max_counts + 1):
        np.testing.assert_almost_equal(
            quantity_distribution[quantity] / float(num_sequences),
            expected_fraction, 2)
def get_embedder(motif_name):
    """Return a SubstringEmbedder named after `motif_name`.

    The embedded substring is sampled from the motif's PWM (looked up in
    `loaded_motifs`, which must be provided by the surrounding scope) and
    passed through a ReverseComplementWrapper.
    """
    return synthetic.SubstringEmbedder(
        synthetic.ReverseComplementWrapper(
            synthetic.PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)),
        name=motif_name)