# Example 1
0
import mpathic.numerics as numerics
from numpy.random import randn
import mpathic.Models as Models
import time

# Wild-type sequence from which every simulated library is derived
wtseq = 'AAAAAAAGTGAGATGGCAATCTAATTCGGCACCCCAGGTTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG'
L = len(wtseq)

# Each combination of sequence type, model type and mutation rate is
# exercised below; the library size is chosen per sequence type.
seqtypes = ['dna', 'protein']
modeltypes = ['MAT', 'NBR']
numseqs_dict = {'dna': 10000, 'protein': 1000}
for seqtype in seqtypes:
    # Library size depends only on the sequence type
    numseqs = numseqs_dict[seqtype]
    for modeltype in modeltypes:
        for mutrate in (0.01, 0.1, 1):
            # Simulate a library, then build both array encodings of it
            dataset_df = simulate_library(
                wtseq, numseq=numseqs, mutrate=mutrate, tags=True,
                dicttype=seqtype)
            seqarray = numerics.dataset2seqarray(
                dataset_df, modeltype=modeltype)
            mutarray, wtrow = numerics.dataset2mutarray(
                dataset_df, modeltype=modeltype)

            # Sizes of the two encodings, as reported by numerics.nbytes
            seqarray_size = numerics.nbytes(seqarray)
            mutarray_size = numerics.nbytes(mutarray)

            # Random model matrix: 'NBR' models get L-1 rows and C**2
            # columns, every other model type gets L rows and C columns,
            # where C is the alphabet size for this sequence type.
            alphabet = qc.seqtype_to_alphabet_dict[seqtype]
            C = len(alphabet)
            if modeltype == 'NBR':
                num_rows, num_cols = L - 1, C**2
            else:
                num_rows, num_cols = L, C
            modelmatrix = randn(num_rows, num_cols)
# Example 2
0
        tmp = seqarray.copy()
        tmp[:,wtrow] = 0

        # Store results from this chunk
        mutarray_lil[startrow:(endrow+1),:] = tmp

        # Increment rows
        startrow = endrow+1
        endrow = startrow + chunksize - 1

    # Convert to csr matrix
    mutarray_csr = mutarray_lil.tocsr()

    # Return vararray as well as binary representation of wt seq
    return mutarray_csr, wtrow


# Simulate a library from a wild-type sequence and compare the size of
# its two array encodings ('MAT' model type).
wtseq = 'AAAAAAAGTGAGATGGCAATCTAATTCGGCACCCCAGGTTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG'
dataset_df = simulate_library(wtseq, numseq=10000, mutrate=.1, tags=True)
seqarray = dataset2seqarray(dataset_df, modeltype='MAT')
# dataset2mutarray returns a pair; the second element (wtrow) is not used
# by the report below.
mutarray, wtrow = dataset2mutarray(dataset_df, modeltype='MAT')

# Sizes of the two encodings, as reported by nbytes
# (presumably in bytes -- confirm against the nbytes definition)
seqarray_size = nbytes(seqarray)
mutarray_size = nbytes(mutarray)

# Single-argument print(...) calls behave identically on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print('size of seqarray = %d' % seqarray_size)
print('size of mutarray = %d' % mutarray_size)
# The 1.* factor forces true division under Python 2 integer semantics
print('compression ratio = %.1f' % (1. * seqarray_size / mutarray_size))
# Example 3
0
import mpathic.numerics as numerics
from numpy.random import randn
import mpathic.Models as Models
import time

# Wild-type sequence used as the template for all simulated libraries
wtseq = 'AAAAAAAGTGAGATGGCAATCTAATTCGGCACCCCAGGTTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG'
L = len(wtseq)

# Test grid: model types x sequence types x mutation rates.
# numseqs_dict gives the number of sequences to simulate per sequence type.
modeltypes = ['MAT', 'NBR']
seqtypes = ['dna', 'protein']
numseqs_dict = {'dna': 10000, 'protein': 1000}
for seqtype in seqtypes:
    # The library size is fixed for a given sequence type
    numseqs = numseqs_dict[seqtype]
    for modeltype in modeltypes:
        for mutrate in (0.01, 0.1, 1):
            # Generate the library and convert it to both array forms
            dataset_df = simulate_library(
                wtseq,
                numseq=numseqs,
                mutrate=mutrate,
                tags=True,
                dicttype=seqtype)
            seqarray = numerics.dataset2seqarray(
                dataset_df, modeltype=modeltype)
            mutarray, wtrow = numerics.dataset2mutarray(
                dataset_df, modeltype=modeltype)

            # Record the size of each array form via numerics.nbytes
            seqarray_size = numerics.nbytes(seqarray)
            mutarray_size = numerics.nbytes(mutarray)

            # Draw a random model matrix whose shape depends on the model
            # type: (L-1) x C**2 for 'NBR', L x C otherwise, with C the
            # number of characters in this sequence type's alphabet.
            alphabet = qc.seqtype_to_alphabet_dict[seqtype]
            C = len(alphabet)
            num_rows, num_cols = (
                (L - 1, C**2) if modeltype == 'NBR' else (L, C))
            modelmatrix = randn(num_rows, num_cols)