Exemplo n.º 1
0
def simulate_heterodimer_grammar(motif1, motif2, seq_length, min_spacing,
                                 max_spacing, num_pos, num_neg, GC_fraction):
  """
    Simulates a two-class dataset built around the motif1/motif2 pair:
        - Positive class sequences embed motif1 and motif2 as a pair whose
          separation comes from UniformIntegerGenerator(min_spacing,
          max_spacing)
        - Negative class sequences come from simulate_multi_motif_embedding
          for the same two motifs, i.e. without the paired (heterodimer)
          grammar

    Parameters
    ----------
    motif1 : str, encode motif name
    motif2 : str, encode motif name
    seq_length : int, length of sequence
    min_spacing : int, minimum inter motif spacing
    max_spacing : int, maximum inter motif spacing
    num_pos : int, number of positive class sequences
    num_neg : int, number of negative class sequences
    GC_fraction : float, GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings, positives first, then negatives.
    y : ndarray
        Boolean labels, one single-element row per sequence
        (True for positives, False for negatives).
    embedding_arr: list
        List of embedding objects, in the same order as sequence_arr.
    """
  import simdna
  from simdna import synthetic

  motifs = synthetic.LoadedEncodeMotifs(
      simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
  # Each motif sampler is wrapped in ReverseComplementWrapper.
  first_sampler = synthetic.ReverseComplementWrapper(
      synthetic.PwmSamplerFromLoadedMotifs(motifs, motif1))
  second_sampler = synthetic.ReverseComplementWrapper(
      synthetic.PwmSamplerFromLoadedMotifs(motifs, motif2))
  spacing_generator = synthetic.UniformIntegerGenerator(
      min_spacing, max_spacing)
  pair_generator = synthetic.PairEmbeddableGenerator(
      first_sampler, second_sampler, spacing_generator)
  pair_embedder = synthetic.EmbeddableEmbedder(pair_generator)
  background = synthetic.ZeroOrderBackgroundGenerator(
      seq_length, discreteDistribution=get_distribution(GC_fraction))
  grammar_simulator = synthetic.EmbedInABackground(background,
                                                   [pair_embedder])

  # Positive class: materialise the generator once so sequences and
  # embeddings are read from the same objects.
  positives = tuple(
      synthetic.GenerateSequenceNTimes(grammar_simulator,
                                       num_pos).generateSequences())
  positive_seqs = np.array([item.seq for item in positives])
  positive_embeddings = [item.embeddings for item in positives]

  # Negative class: same two motifs, generated by the sibling helper.
  negative_seqs, _, negative_embeddings = simulate_multi_motif_embedding(
      [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)

  sequence_arr = np.concatenate((positive_seqs, negative_seqs))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embeddings + negative_embeddings
  return sequence_arr, y, embedding_arr
Exemplo n.º 2
0
    def test_run(self):
        """Smoke-test DnaseSimulation end to end.

        Writes a small tab-separated simulation file (a header row plus
        four sequences, each with a comma-separated motif column), builds
        a DnaseSimulation over the ENCODE motifs combined with the
        HOCOMOCO motifs and a DinucleotideShuffler, and prints the result
        with sn.printSequences. There are no assertions: the test passes
        when nothing raises.
        """
        dnaseSimulationFileName = "temp_dnaseSimulationFile.txt"
        rows = [
            "sequenceName\tsequence\tmotifs\n",
            "seq1\tACGTgaTATGATAGCACATGTCGTCAGTACCATGGTCGCCGCTTGCATAGGCAAACATAATTGG\tGATA4_HUMAN.H10MO.B-10,TAL1_known1-30,GATA4_HUMAN.H10MO.B-60\n",
            "seq2\tACGTGAtaTGATAGCACATGTCGTCAGTACCATGGTCGCCGCTTGCATAGGCAAACATAATTGG\tGATA4_HUMAN.H10MO.B-5,TAL1_known1-35\n",
            # last TAL1 won't get embedded
            "seq3\tACGTGAtaTGATAGCACATGTCGTCAGTACCATGGTCGCCGCTTGCATAGGCAAACATAATTGG\t"
            + "GATA_disc1-5,GATA_known1-5,TAL1_known1-5," +
            "GATA_disc1-55,GATA_known1-55,TAL1_known1-55\n",
            "seq4\tACGTGAtaTGATAGCACATGTCGTCAGTACCATGGTCGCCGCTTGCATAGGCAAACATAATTGG\t"
            + "GATA_disc1-30,GATA_known1-30,TAL1_known1-30,TAL1_known1-30\n",
        ]
        dnaseSimFh = fp.getFileHandle(dnaseSimulationFileName, 'w')
        try:
            for row in rows:
                dnaseSimFh.write(row)
        finally:
            # Always release the handle, even if a write fails part-way.
            dnaseSimFh.close()

        dnaseSimulation = sn.DnaseSimulation(
            dnaseSimulationFile=dnaseSimulationFileName,
            loadedMotifs=sn.LoadedEncodeMotifs(
                simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001).addMotifs(
                    sn.LoadedHomerMotifs(simdna.HOCOMOCO_MOTIFS_PATH,
                                         pseudocountProb=0.000)),
            shuffler=sn.DinucleotideShuffler())
        sn.printSequences("temp_dnaseSimulation.simdata",
                          dnaseSimulation,
                          includeFasta=False,
                          includeEmbeddings=True,
                          prefix=None)
Exemplo n.º 3
0
    def test_simple_motif_grammar(self):
        """Check a two-motif grammar with a uniformly drawn separation.

        With both RNGs seeded, 4000 sequences of length 100 are generated,
        each embedding a SIX5_known5 / ZNF143_known2 pair. For every
        sequence the test verifies the recorded embeddings against the
        sequence text, checks the separation lies in [min_sep, max_sep],
        and finally checks that each separation value occurs with the
        expected uniform frequency (to 2 decimals).
        """
        seq_len = 100
        min_sep = 2
        max_sep = 6
        random.seed(1234)
        np.random.seed(1234)
        num_sequences = 4000

        loaded_motifs = sn.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                              pseudocountProb=0.001)
        motif1_generator = sn.PwmSamplerFromLoadedMotifs(loaded_motifs,
                                                         "SIX5_known5")
        motif2_generator = sn.PwmSamplerFromLoadedMotifs(loaded_motifs,
                                                         "ZNF143_known2")
        separation_generator = sn.UniformIntegerGenerator(min_sep, max_sep)
        pair_generator = sn.PairEmbeddableGenerator(
            motif1_generator, motif2_generator, separation_generator)
        embedder = sn.EmbeddableEmbedder(pair_generator)
        embed_in_background = sn.EmbedInABackground(
            sn.ZeroOrderBackgroundGenerator(seq_len), [embedder])
        generated_seqs = list(
            sn.GenerateSequenceNTimes(embed_in_background,
                                      num_sequences).generateSequences())

        separations = defaultdict(int)
        for generated in generated_seqs:
            assert len(generated.seq) == seq_len
            # The test reads embeddings 0 and 1 as the two motifs and
            # embedding 2 as the pair that carries the separation.
            first = generated.embeddings[0]
            second = generated.embeddings[1]
            pair = generated.embeddings[2]
            separation = pair.what.separation

            assert len(first.what) == len(first.what.string)
            assert len(second.what) == len(second.what.string)
            assert len(pair.what) == (len(first.what) + len(second.what) +
                                      pair.what.separation)

            # the string of the first motif is placed correctly
            first_end = first.startPos + len(first.what)
            assert generated.seq[first.startPos:first_end] == first.what.string
            # the string of the second motif is placed correctly
            second_end = second.startPos + len(second.what)
            assert (generated.seq[second.startPos:second_end] ==
                    second.what.string)
            # the gap between the motifs matches the recorded separation
            gap = second.startPos - (first.startPos + len(first.what.string))
            assert gap == pair.what.separation
            # separation is within the right limits
            assert pair.what.separation >= min_sep
            assert pair.what.separation <= max_sep
            # log the separation; the distribution is tested below
            separations[separation] += 1

        expected_fraction = 1.0 / (max_sep - min_sep + 1)
        for possible_sep in range(min_sep, max_sep + 1):
            observed_fraction = separations[possible_sep] / float(num_sequences)
            np.testing.assert_almost_equal(observed_fraction,
                                           expected_fraction, 2)
             
Exemplo n.º 4
0
    def test_multi_motif_embedding(self):
        """Check random-subset embedding of several motifs.

        8000 sequences of length 300 are generated; each embeds a random
        subset of five motifs whose size is drawn by
        UniformIntegerGenerator(1, 4). The test verifies every embedding
        against the sequence text, that no motif is embedded twice in one
        sequence, that the IsInTraceLabelGenerator labels agree with the
        embedded motifs, and that subset sizes and motif choices are
        (approximately, to 2 decimals) uniformly distributed.
        """
        motif_names = ["CTCF_known1", "IRF_known1",
                       "SPI1_known1", "CTCF_known2", "CTCF_disc1"]
        loaded_motifs = sn.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                              pseudocountProb=0.001)
        position_generator = sn.UniformPositionGenerator()
        embedders = []
        for motif_name in motif_names:
            sampler = sn.PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
            embedders.append(
                sn.SubstringEmbedder(sampler, position_generator,
                                     name=motif_name))

        min_selected_motifs = 1
        max_selected_motifs = 4
        quantity_generator = sn.UniformIntegerGenerator(min_selected_motifs,
                                                        max_selected_motifs)
        combined_embedder = [
            sn.RandomSubsetOfEmbedders(quantity_generator, embedders)
        ]
        background = sn.ZeroOrderBackgroundGenerator(
            300, discreteDistribution={'A':0.3, 'C':0.2, 'G':0.2, 'T':0.3})
        embed_in_background = sn.EmbedInABackground(background,
                                                    combined_embedder)
        generated_sequences = tuple(
            sn.GenerateSequenceNTimes(embed_in_background,
                                      8000).generateSequences())

        sequence_arr = np.array([item.seq for item in generated_sequences])
        label_generator = sn.IsInTraceLabelGenerator(np.array(motif_names))
        y = np.array([label_generator.generateLabels(item)
                      for item in generated_sequences]).astype(bool)
        embedding_arr = [item.embeddings for item in generated_sequences]

        num_embeddings_count = defaultdict(int)
        for seq, labels, embeddings in zip(sequence_arr, y, embedding_arr):
            num_embeddings_count[len(embeddings)] += 1
            motifs_embedded = set()
            for embedding in embeddings:
                # the recorded string must appear at the recorded position
                motif_string = embedding.what.string
                start = embedding.startPos
                assert motif_string == seq[start:start + len(motif_string)]
                motifs_embedded.add(embedding.what.getDescription())
            # non-redundant: every embedding has a distinct description
            assert len(motifs_embedded) == len(embeddings)
            # each label is True exactly when its motif was embedded
            for motif_idx, motif_name in enumerate(motif_names):
                assert labels[motif_idx] == (motif_name in motifs_embedded)

        # the number of selected motifs is drawn from a uniform distribution
        expected_quantity_fraction = 1.0 / (max_selected_motifs -
                                            min_selected_motifs + 1)
        for num_selected_motifs in range(min_selected_motifs,
                                         max_selected_motifs + 1):
            observed_fraction = (num_embeddings_count[num_selected_motifs] /
                                 float(len(sequence_arr)))
            np.testing.assert_almost_equal(observed_fraction,
                                           expected_quantity_fraction, 2)
        # there also shouldn't be a preference for any one motif over others
        motif_share = np.sum(y, axis=0) / float(np.sum(y))
        np.testing.assert_almost_equal(motif_share, 1.0 / len(motif_names), 2)
Exemplo n.º 5
0
def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4):
    """Computes pwm log odds.

  For every motif, the sequences are scored with get_pssm_scores against
  the log of the motif PWM and against the log of a GC-content background
  of the same width; the reported score is the difference of the two.

  Parameters
  ----------
  encoded_sequences : 4darray
       (N_sequences, N_letters, sequence_length, 1) array
  motif_names : list of strings
       Names looked up in the simdna ENCODE motif collection.
  max_scores : int, optional
       If given, keep only the `max_scores` highest scores per motif.
  return_positions : boolean, optional
       If True (and max_scores is given), also return the positions of
       the kept scores.
  GC_fraction : float, optional
       GC fraction used to build the background model.

  Returns
  -------
  (N_sequences, num_motifs, seq_length) complete score array by default.
  If max_scores, (N_sequences, num_motifs*max_scores) max score array.
  If max_scores and return_positions, (N_sequences, 2*num_motifs*max_scores)
  array with max scores and their positions.
  """
    import simdna
    from simdna import synthetic

    motifs = synthetic.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                          pseudocountProb=0.001)
    n_seqs, _, n_positions, _ = encoded_sequences.shape
    n_motifs = len(motif_names)
    scores = np.ones((n_seqs, n_motifs, n_positions))

    # One background column: the four entries are (1-GC, GC, GC, 1-GC),
    # halved below so that they sum to 1.
    background_column = [
        1 - GC_fraction, GC_fraction, GC_fraction, 1 - GC_fraction
    ]
    for motif_idx, name in enumerate(motif_names):
        pwm = motifs.getPwm(name).getRows().T
        motif_log_pwm = np.log(pwm)
        background_log_pwm = np.log(
            0.5 * np.array([background_column] * len(pwm[0])).T)
        motif_ll = get_pssm_scores(encoded_sequences, motif_log_pwm)
        background_ll = get_pssm_scores(encoded_sequences, background_log_pwm)
        scores[:, motif_idx, :] = motif_ll - background_ll

    if max_scores is None:
        return scores

    flat_shape = (n_seqs, n_motifs * max_scores)
    # Sort ascending along the position axis, flip, keep the top entries.
    top_scores = np.sort(scores)[:, :, ::-1][:, :, :max_scores]
    if not return_positions:
        return top_scores.reshape(flat_shape)
    top_positions = scores.argsort()[:, :, ::-1][:, :, :max_scores]
    return np.concatenate(
        (top_scores.reshape(flat_shape), top_positions.reshape(flat_shape)),
        axis=1)
Exemplo n.º 6
0
def do(options):
    """Run the density-embedding simulation described by `options`.

    Optionally seeds numpy and `random` from options.seed, derives the
    output file name from the options via util.addArguments, builds one
    repeated embedder per entry of options.motifNames, and writes the
    generated sequences to "<name>.simdata" through
    synthetic.printSequences (with fasta and embeddings included).
    """
    seed = options.seed
    if seed is not None:
        import numpy as np
        np.random.seed(seed)
        import random
        random.seed(seed)

    name_arguments = [
        util.ArgumentToAdd(options.prefix, "prefix"),
        util.BooleanArgument(options.bestHit, "bestHit"),
        util.ArrArgument(options.motifNames, "motifs"),
        util.ArgumentToAdd(options.min_motifs, "min"),
        util.ArgumentToAdd(options.max_motifs, "max"),
        util.ArgumentToAdd(options.mean_motifs, "mean"),
        util.FloatArgument(options.zero_prob, "zeroProb"),
        util.ArgumentToAdd(options.seqLength, "seqLength"),
        util.ArgumentToAdd(options.numSeqs, "numSeqs"),
    ]
    outputFileName_core = util.addArguments("DensityEmbedding",
                                            name_arguments)

    loadedMotifs = synthetic.LoadedEncodeMotifs(options.pathToMotifs,
                                                pseudocountProb=0.001)
    # options.bestHit selects which motif-string generator class is used.
    if options.bestHit:
        Constructor = synthetic.BestHitPwmFromLoadedMotifs
    else:
        Constructor = synthetic.PwmSamplerFromLoadedMotifs

    background = synthetic.ZeroOrderBackgroundGenerator(
        seqLength=options.seqLength)

    embedders = []
    for motifName in options.motifNames:
        motif_source = synthetic.ReverseComplementWrapper(
            substringGenerator=Constructor(loadedMotifs=loadedMotifs,
                                           motifName=motifName),
            reverseComplementProb=options.rc_prob)
        single_embedder = synthetic.SubstringEmbedder(
            motif_source,
            positionGenerator=synthetic.UniformPositionGenerator())
        # Count per sequence: Poisson(mean_motifs) wrapped by MinMaxWrapper
        # and then by ZeroInflater with zeroProb=options.zero_prob.
        bounded_count = synthetic.MinMaxWrapper(
            synthetic.PoissonQuantityGenerator(options.mean_motifs),
            theMax=options.max_motifs,
            theMin=options.min_motifs)
        count_generator = synthetic.ZeroInflater(bounded_count,
                                                 zeroProb=options.zero_prob)
        embedders.append(
            synthetic.RepeatedEmbedder(single_embedder,
                                       quantityGenerator=count_generator))

    embedInBackground = synthetic.EmbedInABackground(
        backgroundGenerator=background, embedders=embedders)
    sequenceSet = synthetic.GenerateSequenceNTimes(embedInBackground,
                                                   options.numSeqs)
    synthetic.printSequences(outputFileName_core + ".simdata",
                             sequenceSet,
                             includeFasta=True,
                             includeEmbeddings=True,
                             prefix=options.prefix)
Exemplo n.º 7
0
def simple_motif_embedding(motif_name, seq_length, num_seqs, GC_fraction):
    """
  Simulates sequences with a single motif embedded in a random background.

  When `motif_name` is None no embedder is configured, so only background
  sequences are produced.

  Parameters
  ----------
  motif_name : str or None
      encode motif name
  seq_length : int
      length of sequence
  num_seqs: int
      number of sequences
  GC_fraction : float
      GC basepair fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
      Array with sequence strings.
  embedding_arr: list
      One `embeddings` entry per generated sequence.
  """
    import simdna
    from simdna import synthetic

    embedders = []
    if motif_name is not None:
        motifs = synthetic.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                              pseudocountProb=0.001)
        sampler = synthetic.PwmSamplerFromLoadedMotifs(motifs, motif_name)
        embedders.append(
            synthetic.SubstringEmbedder(
                synthetic.ReverseComplementWrapper(sampler)))

    background = synthetic.ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    simulator = synthetic.EmbedInABackground(background, embedders)
    # Materialise the generator once; both outputs read the same objects.
    generated = tuple(
        synthetic.GenerateSequenceNTimes(simulator,
                                         num_seqs).generateSequences())
    sequence_arr = np.array([item.seq for item in generated])
    embedding_arr = [item.embeddings for item in generated]
    return sequence_arr, embedding_arr
    def test_central_positions(self):
        """Check that InsideCentralBp restricts where the motif is embedded.

        10000 sequences of length 50 are generated with one CTCF_known1
        motif each, positioned by InsideCentralBp(central_bp). The observed
        start-position frequencies must match (to 2 decimals) a
        distribution that is uniform over the starts keeping the whole
        motif inside the central window, and zero elsewhere.
        """
        pseudocount_prob = 0.001
        pwm_name = "CTCF_known1"
        num_sequences = 10000
        sequence_length = 50
        central_bp = 30
        loaded_motifs = sn.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                              pseudocountProb=pseudocount_prob)
        substring_generator = sn.PwmSamplerFromLoadedMotifs(
            loaded_motifs, pwm_name)
        position_generator = sn.InsideCentralBp(central_bp)
        embedders = [
            sn.SubstringEmbedder(substring_generator, position_generator)
        ]
        embed_in_background = sn.EmbedInABackground(
            sn.ZeroOrderBackgroundGenerator(sequence_length,
                                            discreteDistribution={
                                                'A': 0.3,
                                                'C': 0.2,
                                                'G': 0.2,
                                                'T': 0.3
                                            }), embedders)
        generated_sequences = list(
            sn.GenerateSequenceNTimes(embed_in_background,
                                      num_sequences).generateSequences())

        motif_length = len(loaded_motifs.getPwm(pwm_name).getRows())
        start_pos_count = np.zeros(sequence_length - motif_length + 1)

        for seq in generated_sequences:
            assert len(seq.seq) == sequence_length
            embeddings = seq.embeddings
            for embedding in embeddings:
                # the recorded string must appear at the recorded position
                assert (embedding.what.string ==
                        seq.seq[embedding.startPos:embedding.startPos +
                                len(embedding.what.string)])
                start_pos_count[embedding.startPos] += 1

        # convert raw counts into observed frequencies
        start_pos_count = start_pos_count / float(len(generated_sequences))
        expected_start_pos_count = np.zeros_like(start_pos_count).astype(
            "float32")
        # Expect the motif to be embedded only inside the central
        # `central_bp` window. The window is taken to be centred in the
        # sequence; with the values above it spans positions [10, 40).
        window_start = (sequence_length - central_bp) // 2
        window_end = window_start + central_bp
        num_valid_starts = central_bp - motif_length + 1
        expected_start_pos_count[window_start:(window_end - motif_length +
                                               1)] = 1.0 / num_valid_starts
        np.testing.assert_almost_equal(start_pos_count,
                                       expected_start_pos_count, 2)
def variableSpacingGrammar(options):
    """Simulate sequences containing a motif pair with variable spacing.

    Loads the motifs at options.pathToMotifs, pairs samplers for
    options.motifName1 and options.motifName2 with a separation generator
    built from PoissonQuantityGenerator(options.meanSpacing) wrapped in
    MinMaxWrapper(theMin=options.minSpacing, theMax=options.maxSpacing),
    embeds that pair in a zero-order background of length
    options.seqLength, and writes options.numSeq sequences to a
    ".simdata" file whose name encodes the prefix, motif names, sequence
    length and sequence count.
    """
    pc = 0.001
    pathToMotifs = options.pathToMotifs
    loadedMotifs = synthetic.LoadedEncodeMotifs(pathToMotifs, pseudocountProb=pc)
    motifName1 = options.motifName1
    motifName2 = options.motifName2
    seqLength = options.seqLength
    numSeq = options.numSeq
    outputFileName = ("variableSpacingGrammarSimulation_"
                      +"prefix-"+options.prefix
                      +"_motif1-"+motifName1+"_motif2-"+motifName2
                      +"_seqLength"+str(seqLength)+"_numSeq"
                      +str(numSeq)+".simdata")

    # Only the motif generators are needed: the pair generator consumes
    # them directly, so no stand-alone per-motif embedders are built.
    kwargs={'loadedMotifs':loadedMotifs}
    theClass=synthetic.PwmSamplerFromLoadedMotifs
    motif1Generator=theClass(motifName=motifName1,**kwargs)
    motif2Generator=theClass(motifName=motifName2,**kwargs)

    separationGenerator=synthetic.MinMaxWrapper(
        synthetic.PoissonQuantityGenerator(options.meanSpacing),
        theMin=options.minSpacing,
        theMax=options.maxSpacing)
    embedders = [
        synthetic.EmbeddableEmbedder(
            embeddableGenerator=synthetic.PairEmbeddableGenerator(
                embeddableGenerator1=motif1Generator,
                embeddableGenerator2=motif2Generator,
                separationGenerator=separationGenerator))
    ]

    embedInBackground = synthetic.EmbedInABackground(
        backgroundGenerator=synthetic.ZeroOrderBackgroundGenerator(seqLength),
        embedders=embedders)

    sequenceSet = synthetic.GenerateSequenceNTimes(embedInBackground, numSeq)
    synthetic.printSequences(outputFileName, sequenceSet,
                             includeFasta=True, includeEmbeddings=True,
                             prefix=options.prefix)
Exemplo n.º 10
0
def motif_density(motif_name,
                  seq_length,
                  num_seqs,
                  min_counts,
                  max_counts,
                  GC_fraction,
                  central_bp=None):
    """
  Returns sequences with motif density, along with embeddings array.

  Each sequence embeds the motif repeatedly; the repeat count comes from
  UniformIntegerGenerator(min_counts, max_counts). Positions come from
  InsideCentralBp(central_bp) when `central_bp` is given and from
  UniformPositionGenerator() otherwise.

  Parameters
  ----------
  motif_name : str
      encode motif name
  seq_length : int
      length of sequence
  num_seqs : int
      number of sequences
  min_counts : int
      first argument of the count generator
  max_counts : int
      second argument of the count generator
  GC_fraction : float
      GC fraction passed to get_distribution for the background
  central_bp : int, optional
      size passed to InsideCentralBp; None selects uniform positions

  Returns
  -------
  sequence_arr : 1darray
      Array with sequence strings.
  embedding_arr : list
      One `embeddings` entry per generated sequence.
  """
    import simdna
    from simdna import synthetic

    motifs = synthetic.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                          pseudocountProb=0.001)
    sampler = synthetic.PwmSamplerFromLoadedMotifs(motifs, motif_name)
    if central_bp is None:
        position_generator = synthetic.UniformPositionGenerator()
    else:
        position_generator = synthetic.InsideCentralBp(central_bp)
    count_generator = synthetic.UniformIntegerGenerator(min_counts,
                                                        max_counts)
    single_embedder = synthetic.SubstringEmbedder(
        synthetic.ReverseComplementWrapper(sampler), position_generator)
    embedders = [synthetic.RepeatedEmbedder(single_embedder, count_generator)]

    background = synthetic.ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    simulator = synthetic.EmbedInABackground(background, embedders)
    # Materialise the generator once; both outputs read the same objects.
    generated = tuple(
        synthetic.GenerateSequenceNTimes(simulator,
                                         num_seqs).generateSequences())
    sequence_arr = np.array([item.seq for item in generated])
    embedding_arr = [item.embeddings for item in generated]
    return sequence_arr, embedding_arr
Exemplo n.º 11
0
    def test_uniform_positions(self):
        """Check that UniformPositionGenerator spreads motif starts evenly.

        10000 sequences of length 50 are generated with one CTCF_known1
        motif each. Every embedding is verified against the sequence text,
        and the observed start-position frequencies must equal (to 2
        decimals) a uniform distribution over all possible start
        positions.
        """
        pseudocount_prob = 0.001
        pwm_name = "CTCF_known1"
        num_sequences = 10000
        sequence_length = 50

        loaded_motifs = sn.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                              pseudocountProb=pseudocount_prob)
        substring_generator = sn.PwmSamplerFromLoadedMotifs(
            loaded_motifs, pwm_name)
        position_generator = sn.UniformPositionGenerator()
        embedders = [
            sn.SubstringEmbedder(substring_generator, position_generator)
        ]
        background = sn.ZeroOrderBackgroundGenerator(
            sequence_length,
            discreteDistribution={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
        embed_in_background = sn.EmbedInABackground(background, embedders)
        generated_sequences = list(
            sn.GenerateSequenceNTimes(embed_in_background,
                                      num_sequences).generateSequences())

        motif_length = len(loaded_motifs.getPwm(pwm_name).getRows())
        # one counter per start position at which the motif still fits
        num_start_positions = sequence_length - motif_length + 1
        start_pos_count = np.zeros(num_start_positions)

        for generated in generated_sequences:
            assert len(generated.seq) == sequence_length
            for embedding in generated.embeddings:
                motif_string = embedding.what.string
                start = embedding.startPos
                # the recorded string must appear at the recorded position
                assert (motif_string ==
                        generated.seq[start:start + len(motif_string)])
                start_pos_count[start] += 1

        start_pos_frequency = start_pos_count / float(len(generated_sequences))
        np.testing.assert_almost_equal(start_pos_frequency,
                                       1.0 / len(start_pos_frequency), 2)
    def test_density_motif_embedding(self):
        """Check repeated, strand-randomised embedding of one motif.

        With both RNGs seeded, 5000 sequences of length 500 are generated,
        each embedding CTCF_known1 a number of times drawn by
        UniformIntegerGenerator(2, 5) through a ReverseComplementWrapper.
        The test checks that the loaded PWM equals the hard-coded reference
        (after pseudocount adjustment), that each embedding matches the
        sequence text, that about half of the embeddings are described as
        'revComp', that the PWMs reconstructed from forward and from
        reverse-complement embeddings match the reference, and that the
        embedding counts per sequence are uniformly distributed.
        """
        random.seed(1234)
        np.random.seed(1234)
        min_counts = 2
        max_counts = 5
        pseudocount_prob = 0.001
        pwm_name = "CTCF_known1"
        num_sequences = 5000

        loaded_motifs = sn.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                              pseudocountProb=pseudocount_prob)
        substring_generator = sn.PwmSamplerFromLoadedMotifs(
            loaded_motifs, pwm_name)
        position_generator = sn.UniformPositionGenerator()
        quantity_generator = sn.UniformIntegerGenerator(min_counts, max_counts)
        single_embedder = sn.SubstringEmbedder(
            sn.ReverseComplementWrapper(substring_generator),
            position_generator)
        embedders = [sn.RepeatedEmbedder(single_embedder, quantity_generator)]
        background = sn.ZeroOrderBackgroundGenerator(
            500, discreteDistribution={'A':0.3,'C':0.2,
                                       'G':0.2,'T':0.3})
        embed_in_background = sn.EmbedInABackground(background, embedders)
        generated_sequences = list(sn.GenerateSequenceNTimes(
            embed_in_background, num_sequences).generateSequences())
        assert len(generated_sequences) == num_sequences

        # Reference PWM, one row per motif position; the column order
        # follows letter_to_index below (A, C, G, T).
        actual_pwm = np.array([[0.095290, 0.318729, 0.083242, 0.502738],
                               [0.182913, 0.158817, 0.453450, 0.204819],
                               [0.307777, 0.053669, 0.491785, 0.146769],
                               [0.061336, 0.876232, 0.023001, 0.039430],
                               [0.008762, 0.989047, 0.000000, 0.002191],
                               [0.814896, 0.014239, 0.071194, 0.099671],
                               [0.043812, 0.578313, 0.365827, 0.012048],
                               [0.117325, 0.474781, 0.052632, 0.355263],
                               [0.933114, 0.012061, 0.035088, 0.019737],
                               [0.005488, 0.000000, 0.991218, 0.003293],
                               [0.365532, 0.003293, 0.621295, 0.009879],
                               [0.059276, 0.013172, 0.553238, 0.374314],
                               [0.013187, 0.000000, 0.978022, 0.008791],
                               [0.061538, 0.008791, 0.851648, 0.078022],
                               [0.114411, 0.806381, 0.005501, 0.073707],
                               [0.409241, 0.014301, 0.557756, 0.018702],
                               [0.090308, 0.530837, 0.338106, 0.040749],
                               [0.128855, 0.354626, 0.080396, 0.436123],
                               [0.442731, 0.199339, 0.292952, 0.064978]])

        # apply the same pseudocount the loader was configured with
        actual_pwm = actual_pwm*(1-pseudocount_prob) + pseudocount_prob/4
        np.testing.assert_almost_equal(np.sum(actual_pwm,axis=-1),1.0,6)
        np.testing.assert_almost_equal(
            actual_pwm,
            np.array(loaded_motifs.getPwm(pwm_name).getRows()))

        letter_to_index = {'A':0, 'C':1, 'G':2, 'T':3}
        reconstructed_pwm_fwd = np.zeros_like(actual_pwm)
        reconstructed_pwm_rev = np.zeros_like(actual_pwm)
        quantity_distribution = defaultdict(int)
        total_fwd_embeddings = 0.0
        total_rev_embeddings = 0.0

        for generated in generated_sequences:
            embeddings = generated.embeddings
            quantity_distribution[len(embeddings)] += 1
            for embedding in embeddings:
                motif_string = embedding.what.string
                start = embedding.startPos
                # the recorded string must appear at the recorded position
                assert (motif_string ==
                        generated.seq[start:start + len(motif_string)])
                # pick the strand-specific tally once per embedding
                if 'revComp' in embedding.what.getDescription():
                    total_rev_embeddings += 1
                    letter_counts = reconstructed_pwm_rev
                else:
                    total_fwd_embeddings += 1
                    letter_counts = reconstructed_pwm_fwd
                for char_idx, char in enumerate(motif_string):
                    letter_counts[char_idx][letter_to_index[char]] += 1

        total_embeddings = total_fwd_embeddings + total_rev_embeddings
        np.testing.assert_almost_equal(
            total_fwd_embeddings/total_embeddings, 0.5, 2)

        # normalize the letter counts into per-position frequencies
        reconstructed_pwm_fwd = reconstructed_pwm_fwd/total_fwd_embeddings
        reconstructed_pwm_rev = reconstructed_pwm_rev/total_rev_embeddings
        np.testing.assert_almost_equal(actual_pwm, reconstructed_pwm_fwd, 2)
        # flipping both axes maps the reverse-complement tally back onto
        # the forward PWM layout
        np.testing.assert_almost_equal(actual_pwm,
                                       reconstructed_pwm_rev[::-1,::-1], 2)

        # test the quantities of motifs were sampled uniformly
        expected_fraction = 1.0/(max_counts-min_counts+1)
        for quantity in range(min_counts, max_counts+1):
            observed_fraction = (quantity_distribution[quantity] /
                                 float(num_sequences))
            np.testing.assert_almost_equal(observed_fraction,
                                           expected_fraction, 2)
Exemplo n.º 13
0
def get_motif_scores(encoded_sequences: np.ndarray,
                     motif_names: List[str],
                     max_scores: Optional[int] = None,
                     return_positions: bool = False,
                     GC_fraction: float = 0.4) -> np.ndarray:
    """Computes pwm log odds.

  For every motif, the sequences are scored with `get_pssm_scores` against
  the log of the motif PWM and against the log of a GC-content background
  of the same width; the reported score is the difference of the two.

  Parameters
  ----------
  encoded_sequences: np.ndarray
    A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`.
  motif_names: List[str]
    Names of the motifs to look up in the simdna ENCODE motif collection.
  max_scores: int, optional
    Get top `max_scores` scores.
  return_positions: bool, default False
    Whether to return positions or not. Only used when `max_scores` is set.
  GC_fraction: float, default 0.4
    GC fraction in background sequence.

  Returns
  -------
  np.ndarray
    The score array. Its shape is `(N_sequences, num_motifs, seq_length)`
    by default. If `max_scores` is given, the shape is
    `(N_sequences, num_motifs*max_scores)`. If `max_scores` is given and
    `return_positions` is True, the top scores are followed by their
    positions and the shape is `(N_sequences, 2*num_motifs*max_scores)`.

  Raises
  ------
  ValueError
    If simdna is not installed. The original import error is attached as
    the exception's cause.

  Notes
  -----
  This method requires simdna to be installed.
  """
    try:
        import simdna
        from simdna import synthetic
    except ModuleNotFoundError as err:
        # ValueError is kept for backward compatibility; chaining keeps
        # the underlying import failure visible in the traceback.
        raise ValueError(
            "This function requires simdna to be installed.") from err

    loaded_motifs = synthetic.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                                 pseudocountProb=0.001)
    num_samples, _, seq_length, _ = encoded_sequences.shape
    num_motifs = len(motif_names)
    scores = np.ones((num_samples, num_motifs, seq_length))
    for j, motif_name in enumerate(motif_names):
        pwm = loaded_motifs.getPwm(motif_name).getRows().T
        log_pwm = np.log(pwm)
        # Background with the same width as the motif: every column is
        # 0.5 * (1-GC, GC, GC, 1-GC), so each column sums to 1.
        gc_pwm = 0.5 * np.array(
            [[1 - GC_fraction, GC_fraction, GC_fraction, 1 - GC_fraction]] *
            len(pwm[0])).T
        gc_log_pwm = np.log(gc_pwm)
        log_scores = get_pssm_scores(encoded_sequences, log_pwm)
        gc_log_scores = get_pssm_scores(encoded_sequences, gc_log_pwm)
        scores[:, j, :] = log_scores - gc_log_scores

    if max_scores is None:
        return scores

    # A single argsort gives the positions of the highest scores; the
    # scores themselves are gathered from those positions instead of
    # sorting the array a second time.
    top_positions = scores.argsort()[:, :, ::-1][:, :, :max_scores]
    top_scores = np.take_along_axis(scores, top_positions, axis=2)
    flat_shape = (num_samples, num_motifs * max_scores)
    if return_positions:
        return np.concatenate(
            (top_scores.reshape(flat_shape),
             top_positions.reshape(flat_shape)),
            axis=1)
    return top_scores.reshape(flat_shape)
Exemplo n.º 14
0
def motifGrammarSimulation(options):
    """
    Simulate sequences for one motif-grammar setting and write them to a file.

    The sequences are built by embedding zero, one or two motifs (depending on
    ``options.generationSetting``) in a zero-order background and are written
    with ``synthetic.printSequences`` to a file whose name is derived from the
    options.

    Parameters
    ----------
    options : object
        Must expose the attributes ``bestHit``, ``pathToMotifs``,
        ``motifName1``, ``motifName2``, ``seqLength``, ``numSeq`` and
        ``generationSetting``. For the two spacing settings it must also
        expose ``fixedSpacingOrMinSpacing`` and, for variable spacing,
        ``maxSpacing``.

    Raises
    ------
    RuntimeError
        If ``options.generationSetting`` is not one of the settings handled
        below.
    """
    pseudocount = 0.001
    bestHit = options.bestHit
    loadedMotifs = synthetic.LoadedEncodeMotifs(options.pathToMotifs,
                                                pseudocountProb=pseudocount)
    motifName1 = options.motifName1
    motifName2 = options.motifName2
    seqLength = options.seqLength
    numSeq = options.numSeq
    generationSetting = options.generationSetting

    # The setting is a string (it is concatenated into the file name), so it
    # must be compared by value. An identity test (`is not`) only works when
    # both strings happen to be the same interned object.
    outputFileName = "motifGrammarSimulation_" + generationSetting
    if bestHit:
        outputFileName += "_bestHit"
    if generationSetting != generationSettings.singleMotif2:
        outputFileName += "_motif1-" + motifName1
    if generationSetting != generationSettings.singleMotif1:
        outputFileName += "_motif2-" + motifName2
    outputFileName += ("_seqLength" + str(seqLength) + "_numSeq" +
                       str(numSeq) + ".simdata")

    # Either always emit the best-scoring hit of the PWM or sample from it.
    if bestHit:
        generatorClass = synthetic.BestHitPwmFromLoadedMotifs
    else:
        generatorClass = synthetic.PwmSamplerFromLoadedMotifs
    motif1Generator = generatorClass(motifName=motifName1,
                                     loadedMotifs=loadedMotifs)
    motif2Generator = generatorClass(motifName=motifName2,
                                     loadedMotifs=loadedMotifs)
    motif1Embedder = synthetic.SubstringEmbedder(
        substringGenerator=motif1Generator)
    motif2Embedder = synthetic.SubstringEmbedder(
        substringGenerator=motif2Generator)

    # Pure background and two independently placed motifs are named as the
    # negative set; every other setting is named as the positive set.
    if generationSetting in (generationSettings.allBackground,
                             generationSettings.twoMotifs):
        namePrefix = "synthNeg"
    else:
        namePrefix = "synthPos"

    if generationSetting == generationSettings.allBackground:
        embedders = []
    elif generationSetting == generationSettings.singleMotif1:
        embedders = [motif1Embedder]
    elif generationSetting == generationSettings.singleMotif2:
        embedders = [motif2Embedder]
    elif generationSetting == generationSettings.twoMotifs:
        embedders = [motif1Embedder, motif2Embedder]
    elif generationSetting in (generationSettings.twoMotifsFixedSpacing,
                               generationSettings.twoMotifsVariableSpacing):
        if generationSetting == generationSettings.twoMotifsFixedSpacing:
            separationGenerator = synthetic.FixedQuantityGenerator(
                options.fixedSpacingOrMinSpacing)
        else:
            separationGenerator = synthetic.UniformIntegerGenerator(
                minVal=options.fixedSpacingOrMinSpacing,
                maxVal=options.maxSpacing)
        # Both motifs are embedded as one pair with the chosen separation.
        pairGenerator = synthetic.PairEmbeddableGenerator(
            embeddableGenerator1=motif1Generator,
            embeddableGenerator2=motif2Generator,
            separationGenerator=separationGenerator)
        embedders = [
            synthetic.EmbeddableEmbedder(embeddableGenerator=pairGenerator)
        ]
    else:
        raise RuntimeError("unsupported generationSetting:" +
                           generationSetting)

    embedInBackground = synthetic.EmbedInABackground(
        backgroundGenerator=synthetic.ZeroOrderBackgroundGenerator(seqLength),
        embedders=embedders,
        namePrefix=namePrefix)

    sequenceSet = synthetic.GenerateSequenceNTimes(embedInBackground, numSeq)
    synthetic.printSequences(outputFileName,
                             sequenceSet,
                             includeFasta=True,
                             includeEmbeddings=True)
Exemplo n.º 15
0
def simulate_multi_motif_embedding(motif_names, seq_length, min_num_motifs,
                                   max_num_motifs, num_seqs, GC_fraction):
    """
  Simulates sequences for a multi motif recognition task.

  Each sequence is a zero-order background into which a random subset of
  the given motifs is embedded; the subset size is drawn by a
  UniformIntegerGenerator built from min_num_motifs and max_num_motifs.

  Parameters
  ----------
  motif_names : list
      List of motif name strings.
  seq_length : int
      Passed to the background generator.
  min_num_motifs : int
  max_num_motifs : int
  num_seqs : int
      Number of sequences to generate.
  GC_fraction : float
      Passed to get_distribution to build the background distribution.

  Returns
  -------
  sequence_arr : ndarray
      Built from the `seq` attribute of every generated sequence.
  y : ndarray
      Boolean labels produced by IsInTraceLabelGenerator for each sequence.
  embedding_arr : list
      The `embeddings` attribute of every generated sequence.
  """
    import simdna
    from simdna import synthetic

    motifs = synthetic.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                          pseudocountProb=0.001)

    # One embedder per motif; each samples from the motif PWM through a
    # ReverseComplementWrapper.
    embedders = []
    for name in motif_names:
        sampler = synthetic.PwmSamplerFromLoadedMotifs(motifs, name)
        wrapped_sampler = synthetic.ReverseComplementWrapper(sampler)
        embedders.append(
            synthetic.SubstringEmbedder(wrapped_sampler, name=name))

    count_generator = synthetic.UniformIntegerGenerator(
        min_num_motifs, max_num_motifs)
    subset_embedder = synthetic.RandomSubsetOfEmbedders(
        count_generator, embedders)

    background = synthetic.ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = synthetic.EmbedInABackground(background,
                                                   [subset_embedder])

    # Materialise the generator once so the three outputs stay aligned.
    repeated_source = synthetic.GenerateSequenceNTimes(sequence_source,
                                                       num_seqs)
    generated = tuple(repeated_source.generateSequences())

    sequence_arr = np.array([item.seq for item in generated])

    labeler = synthetic.IsInTraceLabelGenerator(np.asarray(motif_names))
    label_rows = [labeler.generateLabels(item) for item in generated]
    y = np.array(label_rows, dtype=bool)

    embedding_arr = [item.embeddings for item in generated]
    return sequence_arr, y, embedding_arr