Пример #1
0
            set1[e] = set1[e][:17] + set1[e]
            set2[e] = set2[e][:17] + set2[e]
        e = e + 1

    #compute softmax scores for sequences in dataset
    scrs1 = MGlib.esfmax_score_seqs(set1, pssm, rpssm)
    scrs2 = MGlib.esfmax_score_seqs(set2, pssm, rpssm)
    if verbose: print "varied"
    if verbose:
        for s in scrs1:
            print s[0]
    if verbose: print "loners"
    if verbose:
        for s in scrs2:
            print s[0]
    #get log-likelihoods for sequences in dataset
    llrs1 = MGlib.ll_ratios(scrs1, n_g, n_m, alpha)
    llrs2 = MGlib.ll_ratios(scrs2, n_g, n_m, alpha)

    #get overall posterior for the sequences in dataset
    fposts1 = MGlib.PostP(llrs1, PPR, 0)
    fposts2 = MGlib.PostP(llrs2, PPR, 0)

    #write results to file
    out_file.write(str(fposts1))
    out_file.write(',')
    out_file.write(str(fposts2))
    out_file.write('\n')

out_file.close()
def main():
    ###############################################################################
    #set default parameters
    motif_filename = "CsoR.txt"  #input file
    out_filename = "cog_exp_sym2_c"  #prefix for output
    verbose = 0  #verbose mode
    alpha = 1.0 / 300.0  #mixing ratio for regulated model
    rproms = 3.0  #number of regulated promoters [prior]
    tproms = 1811.0  #total number of promoters in genome [prior]

    # control number of cogs and number of permutations
    num_cogs = 10000
    neg_cutoff = 9900  # Cog #'s less than this are negative
    num_perms = 100
    cog_sample_size = 1000

    #verbose
    if verbose: print "Using: ", motif_filename, " as input"
    if verbose:        print "Writing to (suffix): ", "[void]" if out_filename==""\
else out_filename

    #open file for ouput
    try:
        out_file = open(
            out_filename + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" +
            str(num_perms) + ".csv", "w")
    except (IOError, OSError) as file_open_exception:
        print "*** Something went wrong while opening the output file"
        print "*** Error: ", file_open_exception.errno, " - ",\
                             file_open_exception.strerror
        sys.exit()

    #compute priors
    PR = rproms / tproms  #prior probability of regulation
    PB = 1.0 - PR  #prior probability of non-regulation
    PPR = PB / PR  #prior probability ratio

    # read motif and assign 0.25 pseudocounts to PSWM
    # also assign background uniform distribution for the PSSM (default)
    mot = MGlib.read_motif(motif_filename)
    mot.pseudocounts = 1
    mot.background = None

    # save the pssm for the motif and the reverse complement
    #(so that they are not recalculated everytime we invoke motif.pssm)
    pssm = mot.pssm
    rpssm = pssm.reverse_complement()

    # Save the motif itself as a list of strings for later permuting
    motif_sites = []
    num_motif_sites = len(mot.instances)
    for i in range(num_motif_sites):
        motif_sites.append(str(mot.instances[i]))

    random.seed(None)

    # Create the COGS
    all_cogs = []
    the_neg_seqs = []
    neg_cog_nums = [i for i in range(0, neg_cutoff)]
    ran_neg_cog_nums = sample(cog_sample_size, neg_cog_nums, replace=False)
    cog_file = open(
        "seqs_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" +
        str(num_perms) + ".csv", "w")
    for i in range(0, num_cogs):
        label = get_cog_type(i, neg_cutoff)
        #print "Create cog #", i, label
        cur_cog = create_COG(label, mot)
        all_cogs.append(cur_cog)
        if i in ran_neg_cog_nums:
            # A negatively regulated cog
            for s in cur_cog:
                the_neg_seqs.append(s)
                cog_file.write("%d,%s\n" % (i, s))
        else:
            for s in cur_cog:
                cog_file.write("%d,%s\n" % (i, s))
    cog_file.close()

    # compute softmax scores for sampled background sequences
    gscr = MGlib.esfmax_score_seqs(the_neg_seqs, pssm, rpssm)
    # compute softmax scores for motif sequences
    mscr = MGlib.esfmax_score_seqs(mot.instances, pssm, rpssm)

    # get normal distributions for background and motif
    mean_gscr = mean(gscr)
    std_gscr = std(gscr)
    n_g = norm(mean_gscr, std_gscr)
    mean_mscr = mean(mscr)
    std_mscr = std(mscr)
    n_m = norm(mean(mscr), std(mscr))

    smeans_file = open(
        "smeans_stds_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) +
        "_p" + str(num_perms) + ".csv", "w")
    smeans_file.write("PSSM n_g,%13.10f,%13.10f\n" % (mean_gscr, std_gscr))
    smeans_file.write("PSSM n_m,%13.10f,%13.10f\n" % (mean_mscr, std_mscr))

    # Create the permuted pssm and n_m and n_g for the permutation tests
    new_pssm_list = []
    rnew_pssm_list = []
    n_m_perms = []
    n_g_perms = []

    for j in range(0, num_perms):
        #print "\n***************** Create permutation #", j
        # permute the columns of the motif
        new_mot = sym_permute_motif(motif_sites)
        new_pssm = new_mot.pssm  #
        rnew_pssm = new_pssm.reverse_complement()
        new_pssm_list.append(new_pssm)
        rnew_pssm_list.append(rnew_pssm)
        # compute score for the negative sequences
        gscr = MGlib.esfmax_score_seqs(the_neg_seqs, new_pssm, rnew_pssm)
        mean_gscr = mean(gscr)
        std_gscr = std(gscr)
        # compute softmax scores for new motif sequences
        mscr = MGlib.esfmax_score_seqs(new_mot.instances, new_pssm, rnew_pssm)
        mean_mscr = mean(mscr)
        std_mscr = std(mscr)
        smeans_file.write("PermPSSM n_g,%13.10f,%13.10f\n" %
                          (mean_gscr, std_gscr))
        smeans_file.write("PermPSSM n_m,%13.10f,%13.10f\n" %
                          (mean_mscr, std_mscr))

        # get normal distributions for background and motif
        n_g_temp = norm(mean_gscr, std_gscr)
        n_g_perms.append(n_g_temp)
        n_m_temp = norm(mean_mscr, std_mscr)
        n_m_perms.append(n_m_temp)
    smeans_file.close()

    # write csv header
    out_file.write(
        'COG Num,Pos/Neg Regulated,Posterior,LogLikelihood,True Model LL,LL Pval\n'
    )

    # For each cog, do the posterior calculation and the permutation tests
    for i in range(0, num_cogs):
        label = get_cog_type(i, neg_cutoff)
        #print "Test Cog:", i,label
        # The original posterior computation
        #compute softmax scores for sequences in dataset
        scrs = MGlib.esfmax_score_seqs(all_cogs[i], pssm, rpssm)
        #print np.min(scrs[0]), np.max(scrs[0])
        # Compute posterior
        # get log-likelihoods for sequences in dataset
        llrs = MGlib.ll_ratios(scrs, n_g, n_m, alpha)
        # get per-sequence posterior for the sequences in dataset
        fpost = MGlib.PostP(llrs, PPR, 0)

        true_model_ll = compute_log_l(scrs, n_g, n_m, alpha)

        #####################################
        # Permutation test
        log_ls = []
        for j in range(0, num_perms):
            #print " ... perm test", j
            # Compute score and log likelihood for each permutation.
            scrs = MGlib.esfmax_score_seqs(all_cogs[i], new_pssm_list[j],
                                           rnew_pssm_list[j])
            log_l = compute_log_l(scrs, n_g_perms[j], n_m_perms[j], alpha)
            log_ls.append(log_l)

        rev_pval = compute_p_val(log_ls, true_model_ll)
        pval = 1.0 - rev_pval
        out_file.write("%d,%s,%10.7f,%10.7f,%10.7f,%10.7f\n" %
                       (i, label, fpost, rev_pval, true_model_ll, pval))

    out_file.close()
Пример #3
0
    n_g = norm(mean(gscr), std(gscr))
    n_m = norm(mean(mscr), std(mscr))

    #create motif instances
    pmot1 = MGlib.sample_motif(mot, 100)

    #insert sites in sequences
    e = 0
    while (e < len(set1)):
        set1[e] = pmot1[e] + set1[e]
        e = e + 1

    #compute softmax scores for sequences in dataset
    scrs = MGlib.esfmax_score_seqs(set1, pssm, rpssm)
    #get log-likelihoods for sequences in dataset
    llrs = MGlib.ll_ratios(scrs, n_g, n_m, alpha)

    #get per-sequence posterior for the sequences in dataset
    fposts = MGlib.PostP(llrs, PPR, 1)

    #write results to file
    e = 0
    while (e < len(set1)):
        out_file.write(str(scrs[e][0]))
        out_file.write(',')
        out_file.write(str(fposts[e]))
        out_file.write('\n')
        e = e + 1

out_file.close()
def main():
    ###############################################################################
    # set default parameters
    motif_filename = "CsoR.txt"  # input file
    out_filename = "cog_exp_sym2_c"  # prefix for output
    verbose = 0  # verbose mode
    alpha = 1.0 / 300.0  # mixing ratio for regulated model
    rproms = 3.0  # number of regulated promoters [prior]
    tproms = 1811.0  # total number of promoters in genome [prior]

    # control number of cogs and number of permutations
    num_cogs = 10000
    neg_cutoff = 9900  # Cog #'s less than this are negative
    num_perms = 100
    cog_sample_size = 1000

    # verbose
    if verbose:
        print "Using: ", motif_filename, " as input"
    if verbose:
        print "Writing to (suffix): ", "[void]" if out_filename == "" else out_filename

    # open file for ouput
    try:
        out_file = open(
            out_filename + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w"
        )
    except (IOError, OSError) as file_open_exception:
        print "*** Something went wrong while opening the output file"
        print "*** Error: ", file_open_exception.errno, " - ", file_open_exception.strerror
        sys.exit()

    # compute priors
    PR = rproms / tproms  # prior probability of regulation
    PB = 1.0 - PR  # prior probability of non-regulation
    PPR = PB / PR  # prior probability ratio

    # read motif and assign 0.25 pseudocounts to PSWM
    # also assign background uniform distribution for the PSSM (default)
    mot = MGlib.read_motif(motif_filename)
    mot.pseudocounts = 1
    mot.background = None

    # save the pssm for the motif and the reverse complement
    # (so that they are not recalculated everytime we invoke motif.pssm)
    pssm = mot.pssm
    rpssm = pssm.reverse_complement()

    # Save the motif itself as a list of strings for later permuting
    motif_sites = []
    num_motif_sites = len(mot.instances)
    for i in range(num_motif_sites):
        motif_sites.append(str(mot.instances[i]))

    random.seed(None)

    # Create the COGS
    all_cogs = []
    the_neg_seqs = []
    neg_cog_nums = [i for i in range(0, neg_cutoff)]
    ran_neg_cog_nums = sample(cog_sample_size, neg_cog_nums, replace=False)
    cog_file = open("seqs_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w")
    for i in range(0, num_cogs):
        label = get_cog_type(i, neg_cutoff)
        # print "Create cog #", i, label
        cur_cog = create_COG(label, mot)
        all_cogs.append(cur_cog)
        if i in ran_neg_cog_nums:
            # A negatively regulated cog
            for s in cur_cog:
                the_neg_seqs.append(s)
                cog_file.write("%d,%s\n" % (i, s))
        else:
            for s in cur_cog:
                cog_file.write("%d,%s\n" % (i, s))
    cog_file.close()

    # compute softmax scores for sampled background sequences
    gscr = MGlib.esfmax_score_seqs(the_neg_seqs, pssm, rpssm)
    # compute softmax scores for motif sequences
    mscr = MGlib.esfmax_score_seqs(mot.instances, pssm, rpssm)

    # get normal distributions for background and motif
    mean_gscr = mean(gscr)
    std_gscr = std(gscr)
    n_g = norm(mean_gscr, std_gscr)
    mean_mscr = mean(mscr)
    std_mscr = std(mscr)
    n_m = norm(mean(mscr), std(mscr))

    smeans_file = open(
        "smeans_stds_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w"
    )
    smeans_file.write("PSSM n_g,%13.10f,%13.10f\n" % (mean_gscr, std_gscr))
    smeans_file.write("PSSM n_m,%13.10f,%13.10f\n" % (mean_mscr, std_mscr))

    # Create the permuted pssm and n_m and n_g for the permutation tests
    new_pssm_list = []
    rnew_pssm_list = []
    n_m_perms = []
    n_g_perms = []

    for j in range(0, num_perms):
        # print "\n***************** Create permutation #", j
        # permute the columns of the motif
        new_mot = sym_permute_motif(motif_sites)
        new_pssm = new_mot.pssm  #
        rnew_pssm = new_pssm.reverse_complement()
        new_pssm_list.append(new_pssm)
        rnew_pssm_list.append(rnew_pssm)
        # compute score for the negative sequences
        gscr = MGlib.esfmax_score_seqs(the_neg_seqs, new_pssm, rnew_pssm)
        mean_gscr = mean(gscr)
        std_gscr = std(gscr)
        # compute softmax scores for new motif sequences
        mscr = MGlib.esfmax_score_seqs(new_mot.instances, new_pssm, rnew_pssm)
        mean_mscr = mean(mscr)
        std_mscr = std(mscr)
        smeans_file.write("PermPSSM n_g,%13.10f,%13.10f\n" % (mean_gscr, std_gscr))
        smeans_file.write("PermPSSM n_m,%13.10f,%13.10f\n" % (mean_mscr, std_mscr))

        # get normal distributions for background and motif
        n_g_temp = norm(mean_gscr, std_gscr)
        n_g_perms.append(n_g_temp)
        n_m_temp = norm(mean_mscr, std_mscr)
        n_m_perms.append(n_m_temp)
    smeans_file.close()

    # write csv header
    out_file.write("COG Num,Pos/Neg Regulated,Posterior,LogLikelihood,True Model LL,LL Pval\n")

    # For each cog, do the posterior calculation and the permutation tests
    for i in range(0, num_cogs):
        label = get_cog_type(i, neg_cutoff)
        # print "Test Cog:", i,label
        # The original posterior computation
        # compute softmax scores for sequences in dataset
        scrs = MGlib.esfmax_score_seqs(all_cogs[i], pssm, rpssm)
        # print np.min(scrs[0]), np.max(scrs[0])
        # Compute posterior
        # get log-likelihoods for sequences in dataset
        llrs = MGlib.ll_ratios(scrs, n_g, n_m, alpha)
        # get per-sequence posterior for the sequences in dataset
        fpost = MGlib.PostP(llrs, PPR, 0)

        true_model_ll = compute_log_l(scrs, n_g, n_m, alpha)

        #####################################
        # Permutation test
        log_ls = []
        for j in range(0, num_perms):
            # print " ... perm test", j
            # Compute score and log likelihood for each permutation.
            scrs = MGlib.esfmax_score_seqs(all_cogs[i], new_pssm_list[j], rnew_pssm_list[j])
            log_l = compute_log_l(scrs, n_g_perms[j], n_m_perms[j], alpha)
            log_ls.append(log_l)

        rev_pval = compute_p_val(log_ls, true_model_ll)
        pval = 1.0 - rev_pval
        out_file.write("%d,%s,%10.7f,%10.7f,%10.7f,%10.7f\n" % (i, label, fpost, rev_pval, true_model_ll, pval))

    out_file.close()
Пример #5
0
    n_g=norm(mean(gscr), std(gscr))
    n_m=norm(mean(mscr), std(mscr))
       
    #create motif instances
    pmot1 = MGlib.sample_motif(mot,100)
    
    #insert sites in sequences
    e=0
    while (e<len(set1)):
        set1[e] = pmot1[e] + set1[e]
        e = e+1
    
    #compute softmax scores for sequences in dataset
    scrs=MGlib.esfmax_score_seqs(set1,pssm,rpssm)
    #get log-likelihoods for sequences in dataset        
    llrs=MGlib.ll_ratios(scrs,n_g,n_m,alpha)
    
    #get per-sequence posterior for the sequences in dataset
    fposts=MGlib.PostP(llrs,PPR,1)
    
    #write results to file
    e=0
    while (e<len(set1)):
        out_file.write(str(scrs[e][0]))
        out_file.write(',')
        out_file.write(str(fposts[e]))
        out_file.write('\n')
        e=e+1

out_file.close()
Пример #6
0
            sz=3
        elif (cnt2==1):
		sz=6
        elif (cnt2==2):
            sz=9
        elif (cnt2==3):
            sz=12
        elif (cnt2==4):
            sz=15
        else:
            sz=18
            
        #create mixed dataset score
        curr_scores=scrs1[0:sz]+scrs2[sz:len(scrs1)]

        #get log-likelihoods for sequences in dataset        
        llrs1=MGlib.ll_ratios(curr_scores,n_g,n_m,alpha)
        
        #get overall normalized posterior for the sequences in dataset
        posts1=MGlib.PostP(llrs1,PPR)

        if verbose: print sz, " - ", posts1

        #write results to file
        out_file.write(str(sz))
        out_file.write(',')
        out_file.write(str(posts1))
        out_file.write('\n')
            
        
out_file.close()
Пример #7
0
        if (cnt2 == 0):
            sz = 3
        elif (cnt2 == 1):
            sz = 6
        elif (cnt2 == 2):
            sz = 9
        elif (cnt2 == 3):
            sz = 12
        elif (cnt2 == 4):
            sz = 15
        else:
            sz = 18

        #create mixed dataset score
        curr_scores = scrs1[0:sz] + scrs2[sz:len(scrs1)]

        #get log-likelihoods for sequences in dataset
        llrs1 = MGlib.ll_ratios(curr_scores, n_g, n_m, alpha)

        #get overall normalized posterior for the sequences in dataset
        posts1 = MGlib.PostP(llrs1, PPR)

        if verbose: print sz, " - ", posts1

        #write results to file
        out_file.write(str(sz))
        out_file.write(',')
        out_file.write(str(posts1))
        out_file.write('\n')

out_file.close()
Пример #8
0
            
            #compute revised priors (assuming 300 bp average length)
            aPR, aPB = MGlib.NormPriors(th, n_g, n_m, alpha, rproms,\
                                        tproms, promlen=300.0)
            #get revised prior ratio
            aPPR = aPB/aPR


            #FILTER sequences not matching theta
            #get list of sequences with min score > th
            Nscrs1 = [x for x in scrs1 if max(x)>=th]

            if verbose: print "Length of post-theta bckg seqs: ", len(Nscrs1)

            #get log-likelihoods for sequences in dataset        
            Nllrs1=MGlib.ll_ratios(Nscrs1,n_g,n_m,alpha)
            
            #get normalization factors
            Nnormfs1=MGlib.lNormFactor(Nscrs1, th, n_g, n_m, alpha)

            #get overall normalized posterior for the sequences in dataset
            Nposts1=MGlib.NormPostP(Nllrs1,aPPR,Nnormfs1,0)

            if verbose: print theta, " - ", sz, " - ", Nposts1, " - ", \
                              len(Nscrs1), 1/(1+aPPR)
            #write results to file
            out_file.write(str(theta))
            out_file.write(',')
            out_file.write(str(sz))
            out_file.write(',')
            out_file.write(str(Nposts1))
def main():
    """Gets a motif from file and reads it. It then generates synthetic data
       to represent a set of promoters (100) mapping to a particular eggNOG/COG,
       inserts into these sequences pseudosites (generated from the 
       distribution implicit in the PSSM). It then calls the PSSM evaluation
       function to score the sites using the softmax function and then the
       different functions to compute the likelihoods and the posterior
       probabilities.
       
       Usage:
       
       MG_synth -M <Motif file> -O <out file prefix> -E <experiment> \
                -A <alpha mix ratio> -P <Regulation prior> -T <theta> \
                -V <verbose mode>
       
       
       Note: motifs are assumed to be in FASTA or 1-per-line text format
    """
     
    #set default parameters
    motif_filename="CsoR.txt"   #input file
    out_filename="_o"           #o prefix for output
    verbose=0                   #verbose mode
    alpha=1.0/300.0             #mixing ratio for regulated model
    rproms=3.0                  #number of regulated promoters [prior]
    tproms=1811.0               #total number of promoters in genome [prior]
    experiment=2                #the experiment number
    
    #get cmd parameters
    try:
        opts, args=getopt.getopt(sys.argv[1:],"I:O:V")
    except getopt.GetoptError:
        print 'MG_synth -M <Motif file> -O <out file prefix> -E <experiment> \
                -A <alpha mix ratio> -P <Regulation prior> -T <theta> \
                -V <verbose mode>'
        sys.exit(2) 
        
    #assign parameters
    for opt, arg in opts:
        if opt == '-M':
            motif_filename=arg
        elif opt == '-O':
            out_filename=arg
        elif opt == '-E':
            experiment=int(arg)
        elif opt == '-A':
            alpha=float(arg)
        elif opt == '-P':
            PR=float(arg)
        elif opt == '-T':
            theta=float(arg)
        elif opt == '-V':
            verbose=arg
        elif opt == '-askme':
            motif_filename = raw_input('Enter the motif file name\n')
            out_filename = raw_input('Enter the output file name prefix\n')
            experiment = raw_input('Enter the experiment number\n')
            alpha = raw_input('Enter the alpha mixing ratio\n')
            PR = raw_input('Enter the prior probability for regulation\n')
            theta = raw_input('Enter the theta sensitivity threshold\n')
            verbose = raw_input('Enter verbose mode (1/0)\n')
            
    out_filename=motif_filename.split(".")[0] + out_filename + str(experiment)
    
    #verbose
    if verbose: print "Using: ", motif_filename, " as input"
    if verbose: print "Writing to (suffix): ", "[void]" if out_filename==""\
    else out_filename
    
    #open files for ouput
    try:
        out_file = open(out_filename + ".csv","w")
    except (IOError, OSError) as file_open_exception:
        print "*** Something went wrong while opening the output file"
        print "*** Error: ", file_open_exception.errno, " - ",\
                             file_open_exception.strerror        
        sys.exit()    
    
    #open file for error recording
    try:
        err_file = open(out_filename+".err","w")
    except (IOError, OSError) as file_open_exception:
        print "Something went wrong while opening the error file"
        sys.exit()

    #compute priors
    PR=rproms/tproms               #prior probability of regulation
    PB=1.0-PR                      #prior probability of non-regulation
    PPR=PB/PR                      #prior probability ratio
        
    #read motif and assing 0.25 pseudocounts to PSWM
    #also assign background uniform distribution for the PSSM (default)
    mot = MGlib.read_motif(motif_filename)
    mot.pseudocounts=0.25
    mot.background=None
    
    #save the pssm for the motif and the reverse complement
    #(so that they are not recalculated everytime we invoke motif.pssm)
    pssm = mot.pssm
    rpssm = pssm.reverse_complement()

    if (experiment==0):
        #-------------------------------------------------------------------------
        #Experiment 0:
        #10000 sequences, with 100% on average having random pseudo-sites inserted
        #all sites are inserted at the first position of the sequence
        #get individual sequence posteriors and write them together with the
        #score of the site inserted
        random.seed(None)
        
        #create background sequence set: 100 seqs 283 bp long
        set1 = MGlib.random_DNA(283,{'A': 0.3,'C': 0.2,'G': 0.2,'T': 0.3},10000)
        #compute softmax scores for background sequences in dataset
        gscr = MGlib.sfmax_score_seqs(set1,pssm,rpssm)
     
        #get normal distributions for background and motif
        n_g=norm(mean(gscr), std(gscr))
        n_m=norm(pssm.mean(), pssm.std())
       
        #create motif instances
        pmot1 = MGlib.sample_motif(mot,1000)

        #insert sites in sequences
        e=0
        while (e<len(set1)):
            r = random.random()
            #determine if site is to be inserted and insert random site
            if (r<1): 
                set1[e] = random.choice(pmot1) + set1[e]
            #otherwise insert random sequence from own sequence start
            else :
                set1[e] = set1[e][:17] + set1[e]
            e = e+1

        #compute softmax scores for sequences in dataset
        scrs=MGlib.sfmax_score_seqs(set1,pssm,rpssm)
        #get log-likelihoods for sequences in dataset        
        llrs=MGlib.ll_ratios(scrs,n_g,n_m,alpha)
        
        #get per-sequence posterior for the sequences in dataset
        fposts=MGlib.PostP(llrs,PPR,1)

        #write results to file
        e=0
        while (e<len(set1)):
            out_file.write(str(scrs[e][0]))
            out_file.write(',')
            out_file.write(str(fposts[e]))
            out_file.write('\n')
            e=e+1

        out_file.close()
        return 0
    
    elif (experiment==1):
        #-------------------------------------------------------------------------
        #Experiment 1:
        #2x100 sequences, with the first 10 having random pseudo-sites inserted
        #all sites are inserted at the first position of the sequence
        #get individual sequence posteriors and write them together with the
        #score of the site inserted
        #first sequence set includes randomly distributed sites
        #second sequence set includes only one site
        #experiment is repeated 100 times
        random.seed(None)
        
        for cnt in range(0,100):
            #create background sequence set: 100 seqs 283 bp long
            set1 = MGlib.random_DNA(283,{'A': 0.3,'C': 0.2,'G': 0.2,'T': 0.3},100)
            set2 = set1[:]
            #compute softmax scores for background sequences in dataset
            gscr = MGlib.sfmax_score_seqs(set1,pssm,rpssm)
         
            #get normal distributions for background and motif
            n_g=norm(mean(gscr), std(gscr))
            n_m=norm(pssm.mean(), pssm.std())
           
            #create motif instances
            pmot1 = MGlib.sample_motif(mot,100)
            pmot2 = MGlib.sample_motif(mot,1)
            
            print cnt
            #insert sites in sequences
            e=0
            while (e<len(set1)):
                r = random.random()
                #insert random site in first 10 sequences 
                if (e<11): 
                    set1[e] = random.choice(pmot1) + set1[e]
                    set2[e] = random.choice(pmot2) + set2[e]
                #otherwise insert random sequence from own sequence start
                else :
                    set1[e] = set1[e][:17] + set1[e]
                    set2[e] = set2[e][:17] + set2[e]
                e = e+1
    
            #compute softmax scores for sequences in dataset
            scrs1=MGlib.sfmax_score_seqs(set1,pssm,rpssm)
            scrs2=MGlib.sfmax_score_seqs(set2,pssm,rpssm)
            if verbose: print "varied"
            if verbose: 
                for s in scrs1: print s[0]
            if verbose: print "loners"
            if verbose: 
                for s in scrs2: print s[0]
            #get log-likelihoods for sequences in dataset        
            llrs1=MGlib.ll_ratios(scrs1,n_g,n_m,alpha)
            llrs2=MGlib.ll_ratios(scrs2,n_g,n_m,alpha)
            
            #get overall posterior for the sequences in dataset
            fposts1=MGlib.PostP(llrs1,PPR,0)
            fposts2=MGlib.PostP(llrs2,PPR,0)
    
            #write results to file
            out_file.write(str(fposts1))
            out_file.write(',')
            out_file.write(str(fposts2))
            out_file.write('\n')
            
        out_file.close()
        return 0
Пример #10
0
        else :
            set1[e] = set1[e][:17] + set1[e]
            set2[e] = set2[e][:17] + set2[e]
        e = e+1

    #compute softmax scores for sequences in dataset
    scrs1=MGlib.esfmax_score_seqs(set1,pssm,rpssm)
    scrs2=MGlib.esfmax_score_seqs(set2,pssm,rpssm)
    if verbose: print "varied"
    if verbose: 
        for s in scrs1: print s[0]
    if verbose: print "loners"
    if verbose: 
        for s in scrs2: print s[0]
    #get log-likelihoods for sequences in dataset        
    llrs1=MGlib.ll_ratios(scrs1,n_g,n_m,alpha)
    llrs2=MGlib.ll_ratios(scrs2,n_g,n_m,alpha)
    
    #get overall posterior for the sequences in dataset
    fposts1=MGlib.PostP(llrs1,PPR,0)
    fposts2=MGlib.PostP(llrs2,PPR,0)

    #write results to file
    out_file.write(str(fposts1))
    out_file.write(',')
    out_file.write(str(fposts2))
    out_file.write('\n')
    
out_file.close()