# For every (sequence type, model type, mutation rate) combination this
# script simulates a library from the wild-type sequence, converts it to the
# seqarray and mutarray representations via the numerics helpers, and draws
# a random model matrix whose shape depends on the model type.
# NOTE(review): `simulate_library` and `qc` are used below but are not
# imported in the visible part of this script -- confirm they are in scope.
import mpathic.numerics as numerics
from numpy.random import randn
import mpathic.Models as Models
import time

# Create sequences to test this on
wtseq = 'AAAAAAAGTGAGATGGCAATCTAATTCGGCACCCCAGGTTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG'
L = len(wtseq)

modeltypes = ['MAT', 'NBR']
seqtypes = ['dna', 'protein']

# Number of simulated sequences to request for each sequence type
numseqs_dict = {'dna': 10000, 'protein': 1000}

for seqtype in seqtypes:
    for modeltype in modeltypes:
        for mutrate in [0.01, 0.1, 1]:

            # Simulate a library of the size configured for this seqtype
            numseqs = numseqs_dict[seqtype]
            dataset_df = simulate_library(
                wtseq,
                numseq=numseqs,
                mutrate=mutrate,
                tags=True,
                dicttype=seqtype
            )

            # Convert the same dataset into both array representations
            seqarray = numerics.dataset2seqarray(
                dataset_df, modeltype=modeltype)
            mutarray, wtrow = numerics.dataset2mutarray(
                dataset_df, modeltype=modeltype)

            # Measure both arrays for the compression comparison
            seqarray_size = numerics.nbytes(seqarray)
            mutarray_size = numerics.nbytes(mutarray)

            # Create matrix for random model: 'NBR' uses L-1 rows and C**2
            # columns, any other model type uses L rows and C columns
            alphabet = qc.seqtype_to_alphabet_dict[seqtype]
            C = len(alphabet)
            if modeltype == 'NBR':
                num_rows = L - 1
                num_cols = C**2
            else:
                num_rows = L
                num_cols = C
            modelmatrix = randn(num_rows, num_cols)
tmp = seqarray.copy() tmp[:,wtrow] = 0 # Store results from this chunk mutarray_lil[startrow:(endrow+1),:] = tmp # Increment rows startrow = endrow+1 endrow = startrow + chunksize - 1 # Convert to csr matrix mutarray_csr = mutarray_lil.tocsr() # Return vararray as well as binary representation of wt seq return mutarray_csr, wtrow # Create sequences to test this on wtseq = 'AAAAAAAGTGAGATGGCAATCTAATTCGGCACCCCAGGTTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG' dataset_df = simulate_library(wtseq,numseq=10000,mutrate=.1,tags=True) seqarray = dataset2seqarray(dataset_df, modeltype='MAT') mutarray, wtrow = dataset2mutarray(dataset_df, modeltype='MAT') # Print compression results seqarray_size = nbytes(seqarray) mutarray_size = nbytes(mutarray) print 'size of seqarray = %d'%seqarray_size print 'size of mutarray = %d'%mutarray_size print 'compression ratio = %.1f'%(1.*seqarray_size/mutarray_size)
# Sweeps over sequence types, model types and mutation rates. Each pass
# simulates a library from the wild-type sequence, builds its seqarray and
# mutarray through the numerics module, and generates a random model matrix
# shaped for the current model type.
# NOTE(review): neither `simulate_library` nor `qc` is imported in the
# visible part of this script -- verify that both are in scope.
import mpathic.numerics as numerics
from numpy.random import randn
import mpathic.Models as Models
import time

# Create sequences to test this on
wtseq = 'AAAAAAAGTGAGATGGCAATCTAATTCGGCACCCCAGGTTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG'
L = len(wtseq)

# Parameter grid for the sweep
modeltypes = ['MAT', 'NBR']
seqtypes = ['dna', 'protein']
numseqs_dict = {'dna': 10000, 'protein': 1000}

for seqtype in seqtypes:
    for modeltype in modeltypes:
        for mutrate in [0.01, 0.1, 1]:

            # Library size is looked up per sequence type
            numseqs = numseqs_dict[seqtype]
            dataset_df = simulate_library(wtseq, numseq=numseqs,
                                          mutrate=mutrate, tags=True,
                                          dicttype=seqtype)

            # Both array representations are built from the same dataset
            seqarray = numerics.dataset2seqarray(dataset_df,
                                                 modeltype=modeltype)
            mutarray, wtrow = numerics.dataset2mutarray(dataset_df,
                                                        modeltype=modeltype)

            # Print compression results
            seqarray_size = numerics.nbytes(seqarray)
            mutarray_size = numerics.nbytes(mutarray)

            # Create matrix for random model; the shape is (L-1, C**2) for
            # 'NBR' and (L, C) for every other model type
            alphabet = qc.seqtype_to_alphabet_dict[seqtype]
            C = len(alphabet)
            num_rows, num_cols = (
                (L - 1, C**2) if modeltype == 'NBR' else (L, C)
            )
            modelmatrix = randn(num_rows, num_cols)