示例#1
0
def main(
        data_df,model_df,
        start=0,end=None,err=False,coarse_graining_level=0):
    """
    Estimate the mutual information between model predictions and bin counts.

    Arguments:
        data_df (pd.DataFrame): dataset with exactly one sequence column and
            count columns. It is copied, so the caller's frame is untouched.
        model_df (pd.DataFrame): model with a 'pos' column and 'val_*' columns.
        start (int): first sequence position to keep.
        end (int): position at which to stop (exclusive); a falsy value keeps
            the rest of each sequence.
        err (bool): when true, estimate the uncertainty from 15 random
            half-size subsamples of the filtered dataset.
        coarse_graining_level: forwarded to EstimateMutualInfoforMImax.alt4.

    Returns:
        (MI, Std): the value returned by alt4 and its estimated standard
        deviation (NaN when err is false).

    Raises:
        SortSeqError: if data_df does not have exactly one sequence column,
            or the cropped sequence length disagrees with the model length.
    """
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)
    #set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    #work on a private copy: cropping the sequences and adding a 'ct' column
    #must not leak into the caller's dataframe
    data_df = data_df.copy()
    #Cut the sequences based on start and end, and then check if it makes sense
    if (start != 0 or end):
        data_df.loc[:,seq_col_name] = data_df.loc[:,seq_col_name].str.slice(start,end)
        #positional lookup: the first row need not carry the label 0
        cropped_length = len(data_df[seq_col_name].iloc[0])
        if modeltype=='MAT':
            if cropped_length != len(model_df.loc[:,'pos']):
                raise SortSeqError('model length does not match dataset length')
        elif modeltype=='NBR':
            if cropped_length != len(model_df.loc[:,'pos'])+1:
                raise SortSeqError('model length does not match dataset length')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)
    data_df = data_df[data_df.ct != 0]
    #The sequences have already been cropped at this point, so when no end is
    #given the first remaining sequence defines the expected length. (start
    #must not be subtracted a second time, or every row would be discarded.)
    if not end:
        seqL = len(data_df[seq_col_name].iloc[0])
    else:
        seqL = end-start
    data_df = data_df[data_df[seq_col_name].apply(len) == (seqL)]
    #make a numpy array out of the model data frame
    model_df_headers = ['val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    model_values = np.array(model_df[model_df_headers])
    #now we evaluate the expression of each sequence according to the model.
    seq_mat,wtrow = numerics.dataset2mutarray(data_df.copy(),modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(model_values,seq_mat,wtrow)
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True,drop=True)
    MI = EstimateMutualInfoforMImax.alt4(temp_sorted,coarse_graining_level=coarse_graining_level)
    if not err:
        #np.nan is the spelling available in every NumPy release
        Std = np.nan
    else:
        #re-run the estimate on 15 random halves of the data; the data are
        #already cropped, so the recursive call uses the default start/end
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(len(data_df_for_sub.index)/2))
            sub_df.reset_index(inplace=True,drop=True)
            sub_MI[i],sub_std = main(
                sub_df,model_df,err=False)
        Std = np.std(sub_MI)/np.sqrt(2)
    return MI,Std
示例#2
0
    def __init__(self,model_df):
        """
        Build the model object from a model dataframe.

        A copy of model_df is validated with qc.validate_model(fix=True);
        a SortSeqError is raised unless its modeltype is 'NBR'.
        """
        validated_df = qc.validate_model(model_df.copy(),fix=True)
        seqtype, modeltype = qc.get_model_type(validated_df)
        if modeltype != 'NBR':
            raise SortSeqError('Invalid modeltype: %s'%modeltype)

        char_to_index,index_to_char = utils.choose_dict(
            seqtype,modeltype=modeltype)
        self.seqtype = seqtype
        self.seq_dict = char_to_index
        self.inv_dict = index_to_char
        self.df = validated_df
        # one more than the number of rows in the model dataframe
        self.length = validated_df.shape[0]+1

        # Keep the value columns as a transposed array
        value_cols = qc.get_cols_from_df(validated_df,'vals')
        self.matrix = np.array(validated_df[value_cols]).T
示例#3
0
def Berg_von_Hippel(df,dicttype,foreground=1,background=0,pseudocounts=1):
    '''Learn a model with the Berg-von Hippel method.

    Per-position character counts are computed for the foreground and
    background bins (bin_1 and bin_0 unless the flags say otherwise),
    pseudocounts are added, counts are normalized to frequencies and the
    negative log of the foreground/background ratio is returned with the
    ct_ columns renamed to val_.'''
    seq_dict,inv_dict = utils.choose_dict(dicttype)
    #both requested count columns have to be present in the dataset
    fg_col = 'ct_' + str(foreground)
    bg_col = 'ct_' + str(background)
    if not {fg_col,bg_col}.issubset(set(df.columns)):
        raise SortSeqError('Foreground or Background column does not exist!')

    #per-position character counts for each of the two bins
    fg_counts = utils.profile_counts(df,dicttype,bin_k=foreground)
    bg_counts = utils.profile_counts(df,dicttype,bin_k=background)
    count_cols = utils.get_column_headers(fg_counts)
    #pseudocounts go on every count column
    fg_counts[count_cols] = fg_counts[count_cols] + pseudocounts
    bg_counts[count_cols] = bg_counts[count_cols] + pseudocounts
    #a zero count left at this point would break the log ratio below
    checked_cols = utils.get_column_headers(fg_counts)
    fg_has_zero = fg_counts[checked_cols].isin([0]).values.any()
    if fg_has_zero:
        raise SortSeqError('''There are some bases without any representation in\
            the foreground data, you should use pseudocounts to avoid failure \
            of the learning method''')
    bg_has_zero = bg_counts[checked_cols].isin([0]).values.any()
    if bg_has_zero:
        raise SortSeqError('''There are some bases without any representation in\
            the background data, you should use pseudocounts to avoid failure \
            of the learning method''')
    #turn counts into per-position frequencies (each row sums to one)
    fg_freqs = fg_counts.copy()
    bg_freqs = bg_counts.copy()
    fg_totals = fg_freqs[count_cols].sum(axis=1)
    fg_freqs[count_cols] = fg_freqs[count_cols].div(fg_totals,axis=0)
    bg_totals = bg_freqs[count_cols].sum(axis=1)
    bg_freqs[count_cols] = bg_freqs[count_cols].div(bg_totals,axis=0)

    energies = -np.log(fg_freqs/bg_freqs)
    #output columns are named val_<char> rather than ct_<char>
    ct_to_val = {}
    for i in range(len(seq_dict)):
        letter = str(inv_dict[i])
        ct_to_val['ct_' + letter] = 'val_' + letter
    return energies.rename(columns=ct_to_val)
示例#4
0
# Benchmark script fragment (Python 2 print statements). The names `t`, `seq`,
# `site_length`, `seqtype` and `p_time` are set earlier in the script, outside
# this excerpt.

# Time fast.seq2sitelist with rc=True; `t` was started before this excerpt.
x = fast.seq2sitelist(seq, site_length, rc=True, safe=False)
c_time = time.time() - t
print 'cython, seq2sitelist w/ rc: %f sec to chop seq into %d rc sites'%\
    (c_time,len(x))

# Ratio of the earlier pure-Python timing to the timing just measured.
print '%.1f-fold speedup.' % (p_time / c_time)

print '-----------------------------'
# Test seqs2array_for_matmodel

# Chop the sequence into sites of length 20 to use as benchmark input.
sites = fast.seq2sitelist(seq, 20)
site_length = len(sites[0])
num_sites = len(sites)

# Pure-Python reference: fill one row per site with the raveled output of
# utils.seq2mat, and time the whole loop.
t = time.time()
seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype='MAT')
sitearray_p = np.zeros([num_sites, (site_length * len(seq_dict))], dtype=int)
for i, site in enumerate(sites):
    sitearray_p[i, :] = utils.seq2mat(site, seq_dict).ravel()
p_time = time.time() - t
print 'python, seqs2array_for_matmodel: %f sec to convert %d %s seqs of length %d'%\
    (p_time,num_sites,seqtype,site_length)

# Same conversion through fast.seqs2array_for_matmodel, timed the same way.
# NOTE(review): sitearray and sitearray_p are not compared in this excerpt.
t = time.time()
sitearray = fast.seqs2array_for_matmodel(sites, seqtype, safe=False)
c_time = time.time() - t
print 'cython, seqs2array_for_matmodel: %f sec to convert %d %s seqs of length %d'%\
    (c_time,num_sites,seqtype,site_length)

print '%.1f-fold speedup.' % (p_time / c_time)
示例#5
0
def main(df,lm='IM',modeltype='MAT',LS_means_std=None,\
    db=None,iteration=30000,burnin=1000,thin=10,\
    runnum=0,initialize='LS',start=0,end=None,foreground=1,\
    background=0,alpha=0,pseudocounts=1,test=False,drop_library=False,\
    verbose=False):
    """
    Learn a model from a dataset and return it as a validated model dataframe.

    Arguments:
        df (pd.DataFrame): dataset with exactly one sequence column, a 'ct'
            column and per-bin count columns.
        lm (str): learning method. 'ER' uses Berg_von_Hippel, 'LS' uses
            Compute_Least_Squares, 'IM' uses MaximizeMI_memsaver.
        modeltype (str): 'MAT' or 'NBR'.
        LS_means_std: LS only; when given it is passed to io.load_meanstd to
            obtain per-bin means and standard deviations.
        db, iteration, burnin, thin, runnum: forwarded to MaximizeMI_memsaver.
        initialize (str): starting matrix for 'IM'; 'rand' or 'LS'.
        start, end (int): sequence region to fit (end is exclusive; a falsy
            end keeps the rest of each sequence).
        foreground, background, pseudocounts: forwarded to Berg_von_Hippel.
        alpha: forwarded to Compute_Least_Squares.
        test: not used by this function.
        drop_library (bool): LS only; drop the 'ct_0' column before fitting.
        verbose (bool): forwarded to MaximizeMI_memsaver and to the LS
            initialisation run.

    Returns:
        pd.DataFrame: a 'pos' column plus one 'val_*' column per character,
        passed through qc.validate_model(fix=True).

    Raises:
        SortSeqError: if df does not have exactly one sequence column, or if
            drop_library is set and there is no 'ct_0' column or fewer than
            two count columns remain after dropping it.
    """
    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)

    #set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    #dictionary used to parameterize sequences: every character except the
    #one mapped to the last index
    par_seq_dict = {
        char:idx for char,idx in seq_dict.items() if idx != (len(seq_dict)-1)}
    #drop any rows with ct = 0
    df = df[df.loc[:,'ct'] != 0]
    df.reset_index(drop=True,inplace=True)

    #If there are sequences of different lengths, then print error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
        sys.stderr.write('Lengths of all sequences are not the same!')
    #select target sequence region
    df.loc[:,seq_col_name] = df.loc[:,seq_col_name].str.slice(start,end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)
    #make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    #create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True,drop=True)
    #Drop any sequences with incorrect length. The sequences were already
    #cropped above, so when no end is supplied the first sequence is assumed
    #to have the correct length as it stands (start must not be subtracted a
    #second time, or every row would be discarded).
    if not end:
        seqL = len(df[seq_col_name].iloc[0])
    else:
        seqL = end-start
    df = df[df[seq_col_name].apply(len) == (seqL)]
    df.reset_index(inplace=True,drop=True)
    #Do something different for each type of learning method (lm)
    if lm == 'ER':
        emat = Berg_von_Hippel(
            df,dicttype,foreground=foreground,background=background,
            pseudocounts=pseudocounts)
    if lm == 'LS':
        if LS_means_std: #If user supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            #change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(add_label))
            std = means_std_df['std']
            std.index = labels
            #Change Weighting of each sequence by dividing counts by bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None
        #drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]
        df.reset_index(inplace=True,drop=True)
        #For sort-seq experiments, bin_0 is library only and isn't the lowest
        #expression even though it would be treated as such if we proceeded.
        #Therefore if drop_library is passed, drop this column from analysis.
        if drop_library:
            if 'ct_0' not in df.columns:
                raise SortSeqError('''drop_library option was passed, but no ct_0
                    column exists''')
            #axis=1 drops the column; the default axis would look for a row
            #labelled 'ct_0' and fail
            df = df.drop('ct_0',axis=1)
            col_headers = utils.get_column_headers(df)
            if len(col_headers) < 2:
                raise SortSeqError(
                    '''After dropping library there are no longer enough 
                        columns to run the analysis''')
        #parameterize sequences into 3xL vectors
        raveledmat,batch,sw = utils.genweightandmat(
                                  df,par_seq_dict,dicttype,means=means,modeltype=modeltype)
        #Use ridge regression to find matrix.
        emat = Compute_Least_Squares(raveledmat,batch,sw,alpha=alpha)

    if lm == 'IM':
        seq_mat,wtrow = numerics.dataset2mutarray(df.copy(),modeltype)
        #this is also an MCMC routine, do the same as above.
        if initialize == 'rand':
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(len(df[seq_col_name][0]),len(seq_dict))
            elif modeltype == 'NBR':
                #use the detected sequence column so rna/protein data work too
                emat_0 = utils.RandEmat(len(df[seq_col_name][0])-1,len(seq_dict))
        elif initialize == 'LS':
            #the data are already cropped, so the LS run uses start=0,end=None
            emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
            emat_0_df = main(df.copy(),lm='LS',modeltype=modeltype,alpha=alpha,start=0,end=None,verbose=verbose)
            emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))
            #pymc doesn't take sparse mat
        emat = MaximizeMI_memsaver(
                seq_mat,df.copy(),emat_0,wtrow,db=db,iteration=iteration,burnin=burnin,
                thin=thin,runnum=runnum,verbose=verbose)
    #now format the energy matrices to get them ready to output
    if (lm == 'IM' or lm == 'memsaver'):
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat))

    elif lm == 'ER':
        #the emat for this format is currently transposed compared to other
        #formats; it is also already a data frame with columns [pos,val_...]
        emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        emat_typical = emat[emat_cols]
        emat_typical = (gauge.fix_matrix((np.array(emat_typical))))

    else: #must be Least squares
        emat_typical = utils.emat_typical_parameterization(emat,len(seq_dict))
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat_typical))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat_typical))

    em = pd.DataFrame(emat_typical)
    em.columns = val_cols
    #add position column (NBR models get one row fewer than the sequence length)
    if modeltype == 'NBR':
        pos = pd.Series(range(start,start - 1 + len(df[seq_col_name][0])),name='pos')
    else:
        pos = pd.Series(range(start,start + len(df[seq_col_name][0])),name='pos')
    output_df = pd.concat([pos,em],axis=1)

    # Validate model and return
    output_df = qc.validate_model(output_df,fix=True)
    return output_df
示例#6
0
def main(model_df, contig_list, numsites=10, verbose=False):

    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype=modeltype)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    search_string = r"[^%s]" % alphabet
    for contig_str, contig_name, pos_offset in contig_list:
        if re.search(search_string, contig_str):
            raise SortSeqError("Invalid character for seqtype %s found in %s." % (seqtype, contig_name))

    # Create model object to evaluate on seqs
    if modeltype == "MAT":
        model_obj = Models.LinearModel(model_df)
    elif modeltype == "NBR":
        model_obj = Models.NeighborModel(model_df)

    # Create list of dataframes, one for each contig
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    L = model_obj.length
    sitelist_df = pd.DataFrame(columns=["val", seq_col, "left", "right", "ori", "contig"])
    for contig_str, contig_name, pos_offset in contig_list:
        if len(contig_str) < L:
            continue
        this_df = pd.DataFrame(columns=["val", seq_col, "left", "right", "ori", "contig"])
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)
        this_df["left"] = poss + pos_offset
        this_df["right"] = poss + pos_offset + L - 1
        # this_df[seq_col] = [contig_str[i:(i+L)] for i in poss]
        this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
        this_df["ori"] = "+"
        this_df["contig"] = contig_name
        this_df["val"] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well
        if seqtype == "dna":
            # this_df[seq_col] = [qc.rc(s) for s in this_df[seq_col]]
            this_df[seq_col] = fast.seq2sitelist(contig_str, L, rc=True)  # Cython
            this_df["ori"] = "-"
            this_df["val"] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by="val", ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True, inplace=True)

        # Crop list at numsites
        if sitelist_df.shape[0] > numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            print ".",
            sys.stdout.flush()

    if verbose:
        print ""
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0] == 0:
        raise SortSeqError("No full-length sites found within provided contigs.")

    sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
    return sitelist_df
示例#7
0
# Benchmark script fragment (Python 2 print statements). The names `t`, `seq`,
# `site_length`, `seqtype` and `p_time` are set earlier in the script, outside
# this excerpt.

# Time fast.seq2sitelist with rc=True; `t` was started before this excerpt.
x = fast.seq2sitelist(seq,site_length, rc=True, safe=False)
c_time = time.time()-t
print 'cython, seq2sitelist w/ rc: %f sec to chop seq into %d rc sites'%\
    (c_time,len(x))

# Ratio of the earlier pure-Python timing to the timing just measured.
print '%.1f-fold speedup.'%(p_time/c_time)

print '-----------------------------'
# Test seqs2array_for_matmodel

# Chop the sequence into sites of length 20 to use as benchmark input.
sites = fast.seq2sitelist(seq,20)
site_length = len(sites[0])
num_sites = len(sites)

# Pure-Python reference: fill one row per site with the raveled output of
# utils.seq2mat, and time the whole loop.
t = time.time()
seq_dict,inv_dict = utils.choose_dict(seqtype,modeltype='MAT')
sitearray_p = np.zeros([num_sites,(site_length*len(seq_dict))],dtype=int)
for i, site in enumerate(sites):
    sitearray_p[i,:] = utils.seq2mat(site,seq_dict).ravel()
p_time = time.time()-t
print 'python, seqs2array_for_matmodel: %f sec to convert %d %s seqs of length %d'%\
    (p_time,num_sites,seqtype,site_length)

# Same conversion through fast.seqs2array_for_matmodel, timed the same way.
# NOTE(review): sitearray and sitearray_p are not compared in this excerpt.
t = time.time()
sitearray = fast.seqs2array_for_matmodel(sites,seqtype,safe=False)
c_time = time.time()-t
print 'cython, seqs2array_for_matmodel: %f sec to convert %d %s seqs of length %d'%\
    (c_time,num_sites,seqtype,site_length)

print '%.1f-fold speedup.'%(p_time/c_time)
示例#8
0
def main(wtseq=None, mutrate=0.10, numseq=10000,dicttype='dna',probarr=None,
        tags=False,tag_length=10):
    """
    Simulate a library of sequences and return it as a validated dataset.

    Arguments:
        wtseq (str): wild-type sequence to mutate; required unless probarr
            is a numpy array.
        mutrate (float): per-position probability of leaving the wild-type
            character; must lie in [0, 1].
        numseq (int): number of sequences to generate; must be positive.
        dicttype (str): sequence type passed to utils.choose_dict.
        probarr (np.ndarray): optional matrix whose column z gives the
            character probabilities at position z; when given, wtseq and
            mutrate are not used to generate sequences.
        tags (bool): when true, every generated sequence gets a unique DNA
            tag and a count of 1; otherwise identical sequences are merged
            and counted.
        tag_length (int): length of each tag; must be positive.

    Returns:
        The result of qc.validate_dataset(df, fix=True).

    Raises:
        SortSeqError: on an invalid mutrate, numseq or tag_length, a missing
            or invalid wtseq, or a tag_length too short for numseq tags.
    """
    #generate sequence dictionary
    seq_dict,inv_dict = utils.choose_dict(dicttype)

    mutrate = float(mutrate)
    if (mutrate < 0.0) or (mutrate > 1.0):
        raise SortSeqError('Invalid mutrate==%f'%mutrate)

    numseq = int(numseq)
    if (numseq <= 0):
        raise SortSeqError('numseq must be positive. Is %d'%numseq)

    tag_length = int(tag_length)
    if (tag_length <= 0):
        raise SortSeqError('tag_length must be positive. Is %d'%tag_length)

    if isinstance(probarr,np.ndarray):
        L = probarr.shape[1]
        #Generate bases according to provided probability matrix
        letarr = np.zeros([numseq,L])
        for z in range(L):
            letarr[:,z] = np.random.choice(
                range(len(seq_dict)),numseq,p=probarr[:,z])
    else:
        #without a probability matrix the wild-type sequence is mandatory
        if wtseq is None:
            raise SortSeqError(
                'wtseq must be provided when probarr is not a numpy array')
        wtseq = wtseq.upper()
        #Check to make sure the wtseq uses the correct bases.
        lin_seq_dict,lin_inv_dict = utils.choose_dict(dicttype,modeltype='MAT')
        if not set(wtseq).issubset(lin_seq_dict):
            raise SortSeqError(
                'wtseq can only contain bases in ' + str(lin_seq_dict.keys()))
        #find wtseq array
        wtarr = seq2arr(wtseq,seq_dict)
        mrate = mutrate/(len(seq_dict)-1) #prob of non wildtype
        #Generate sequences by mutating away from wildtype.
        #Entry 0 is the probability of staying wild type; every other entry
        #is a shift away from the wild-type index (e.g. a 3 means a C becomes
        #an A, a 1 means C -> G).
        parr = np.array(
            [1-(len(seq_dict)-1)*mrate]
            + [mrate for i in range(len(seq_dict)-1)])
        #Generate random movements from wtseq
        letarr = np.random.choice(
            range(len(seq_dict)),[numseq,len(wtseq)],p=parr)
        #Find sequences
        letarr = np.mod(letarr + wtarr,len(seq_dict))
    #Convert Back to letters
    seqs = [arr2seq(letarr[i,:],inv_dict) for i in range(numseq)]

    seq_col = qc.seqtype_to_seqcolname_dict[dicttype]
    seqs_df = pd.DataFrame(seqs, columns=[seq_col])

    # If simulating tags, each generated seq gets a unique tag
    if tags:
        tag_seq_dict,tag_inv_dict = utils.choose_dict('dna')
        # a real list, so it can be sampled from whatever keys() returns
        tag_alphabet_list = list(tag_seq_dict.keys())

        # Make sure tag_length is long enough for the number of tags needed
        if len(tag_alphabet_list)**tag_length < 2*numseq:
            raise SortSeqError(\
                'tag_length=%d is too short for num_tags_needed=%d'%\
                (tag_length,numseq))

        # Generate a unique tag for each unique sequence; duplicates vanish
        # in the set, so keep drawing until numseq distinct tags exist
        tag_set = set()
        while len(tag_set) < numseq:
            num_tags_left = numseq - len(tag_set)
            new_tags = [''.join(choice(tag_alphabet_list,size=tag_length)) \
                for i in range(num_tags_left)]
            tag_set = tag_set.union(new_tags)

        df = seqs_df.copy()
        df.loc[:,'ct'] = 1
        df.loc[:,'tag'] = list(tag_set)

    # If not simulating tags, list only unique seqs w/ corresponding counts
    else:
        seqs_counts = seqs_df[seq_col].value_counts()
        df = seqs_counts.reset_index()
        df.columns = [seq_col,'ct']

    # Convert into valid dataset dataframe and return
    return qc.validate_dataset(df,fix=True)
def main(data_df,
         model_df,
         start=0,
         end=None,
         err=False,
         coarse_graining_level=0,
         rsquared=False,
         return_freg=False):
    """
    Estimate the mutual information between model predictions and bin counts.

    Arguments:
        data_df (pd.DataFrame): dataset with exactly one sequence column and
            count columns. It is copied, so the caller's frame is untouched.
        model_df (pd.DataFrame): model ('MAT', 'NBR' or 'PAIR') with a 'pos'
            column and 'val_*' columns.
        start (int): first sequence position to keep.
        end (int): position at which to stop (exclusive); a falsy value keeps
            the rest of each sequence.
        err (bool): when true, estimate the uncertainty from 15 random
            half-size subsamples of the filtered dataset.
        coarse_graining_level: forwarded to EstimateMutualInfoforMImax.alt4.
        rsquared (bool): return 1 - 2**(-2*x) of MI and Std instead of the
            raw values.
        return_freg: when truthy, alt4 is asked for its freg array as well,
            which is drawn with imshow and saved to the path given here.

    Returns:
        (MI, Std), or their transformed values when rsquared is true. Std is
        NaN when err is false.

    Raises:
        SortSeqError: if data_df does not have exactly one sequence column,
            or the cropped sequence length disagrees with the model length.
    """
    #determine whether you are working with RNA, DNA, or protein.
    #this also should determine modeltype (MAT, NBR, PAIR).
    dicttype, modeltype = qc.get_model_type(model_df)

    #get column header for the sequence column.
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))

    #create dictionary that goes from, for example, nucleotide to number and
    #vice versa.
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    #set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    #work on a private copy: cropping the sequences and adding a 'ct' column
    #must not leak into the caller's dataframe
    data_df = data_df.copy()

    #expected length after cropping; measured before the crop, on the first
    #row by position (the first row need not carry the label 0)
    if not end:
        seqL = len(data_df[seq_col_name].iloc[0]) - start
    else:
        seqL = end - start
    #throw out wrong length sequences.
    #Cut the sequences based on start and end, and then check if it makes sense
    if (start != 0 or end):
        data_df.loc[:,seq_col_name] = \
            data_df.loc[:,seq_col_name].str.slice(start,end)
        right_length = data_df.loc[:, seq_col_name].apply(len) == (seqL)
        if not right_length.all():
            sys.stderr.write('''Not all sequences are the same length! 
                       Throwing out incorrect sequences!''')
            data_df = data_df.loc[right_length, :]
        data_df = data_df.reset_index(drop=True)

        if modeltype == 'MAT':
            if seqL != len(model_df.loc[:, 'pos']):
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'NBR':
            if seqL != len(model_df.loc[:, 'pos']) + 1:
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'PAIR':
            #number of position pairs, C(seqL, 2), computed directly so no
            #SciPy helper is needed
            num_pairs = seqL * (seqL - 1) // 2
            if num_pairs != len(model_df.loc[:, 'pos']):
                raise SortSeqError(
                    'model length does not match dataset length')

    #get column names of the counts columns (excluding total counts 'ct')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)

    #remove empty rows.
    data_df = data_df[data_df.ct != 0]

    #make a numpy array out of the model data frame
    model_df_headers = [
        'val_' + str(inv_dict[i]) for i in range(len(seq_dict))
    ]
    value = np.array(model_df[model_df_headers])

    #now we evaluate the expression of each sequence according to the model.
    #first convert to matrix representation of sequences
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()

    #evaluate energy of each sequence
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        value, seq_mat, wtrow)

    #sort based on value
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    #freg is a regularized plot which shows how sequences are distributed
    #in energy space.
    if return_freg:
        fig, ax = plt.subplots()
        MI, freg = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)
        plt.imshow(freg, interpolation='nearest', aspect='auto')

        #return_freg doubles as the output path of the figure
        plt.savefig(return_freg)
    else:
        MI = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)

    #if we want to calculate error then use bootstrapping.
    if not err:
        #np.nan is the spelling available in every NumPy release
        Std = np.nan
    else:
        #the data are already cropped, so the recursive call uses the default
        #start/end
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(
                len(data_df_for_sub.index) / 2))
            sub_df.reset_index(inplace=True, drop=True)
            sub_MI[i], sub_std = main(sub_df, model_df, err=False)
        Std = np.std(sub_MI) / np.sqrt(2)

    #we can return linfoot correlation (rsquared) or return MI.
    if rsquared:
        return (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
    else:
        return MI, Std
示例#10
0
def main(dataset_df,
         bin=None,
         start=0,
         end=None,
         bins_df=None,
         pseudocounts=1,
         return_profile=False):
    """
    Computes the activity of each character at each position, relative to
    the wild-type character.

    For every bin the character counts from profile_ct.main (plus
    pseudocounts) are accumulated twice: once as they are and once weighted
    by the bin's activity. The ratio of the two is the activity profile, from
    which the wild-type activity at each position is subtracted.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): Bin number passed to profile_mut.main and
            profile_freq.main.
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        bins_df (pd.DataFrame): Optional table with a 'bins' column of count
            column names and an 'activity' column. When None, every count
            column is used and its activity is its bin number.
        pseudocounts: Value added to every count before accumulating.
        return_profile (bool): Return one mutation-weighted value per
            position instead of one activity per character.

    Returns:
        output_df (pd.DataFrame): A 'pos' column plus either 'activity_*'
        columns (default) or a single 'mut_activity' column (return_profile).

    Raises:
        SortSeqError: if dataset_df does not have exactly one sequence column.
    """
    seq_cols = qc.get_cols_from_df(dataset_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = utils.choose_dict(dicttype)
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    #for each bin we need to find character frequency profile, then sum over all
    #bins to get activity.

    #first make sure we have activities of each bin. The test is against None
    #because the truth value of a DataFrame is ambiguous and raises.
    if bins_df is None:
        bins = utils.get_column_headers(dataset_df)
        #in this case no activity was specified so just assume the activity
        #equals bin number
        activity = [float(b.split('_')[-1]) for b in bins]
    else:
        bins = list(bins_df['bins'])
        activity = list(bins_df['activity'])

    #initialize dataframe for total counts in all bins
    output_ct_df = pd.DataFrame()
    #initialize dataframe for running activity calculation
    output_activity_df = pd.DataFrame()

    for i, b in enumerate(bins):
        bin_num = int(b.split('_')[-1])
        # Compute counts
        counts_df = profile_ct.main(dataset_df,
                                    bin=bin_num,
                                    start=start,
                                    end=end)

        # Count columns of this bin's profile
        ct_cols = utils.get_column_headers(counts_df)
        #add_pseudocounts
        counts_df[ct_cols] = counts_df[ct_cols] + pseudocounts

        #add to all previous bin counts
        if i == 0:
            output_ct_df = counts_df[ct_cols]
            output_activity_df = counts_df[ct_cols] * activity[i]
        else:
            output_ct_df = output_ct_df + counts_df[ct_cols]
            output_activity_df = output_activity_df + counts_df[
                ct_cols] * activity[i]

    #now normalize by each character at each position, this is the activity
    #profile

    output_activity_df = output_activity_df[ct_cols].div(output_ct_df[ct_cols])

    mut_rate = profile_mut.main(dataset_df, bin=bin)
    freq = profile_freq.main(dataset_df, bin=bin)
    freq_cols = [x for x in freq.columns if 'freq_' in x]
    #now normalize by the wt activity
    wtseq = ''.join(mut_rate['wt'])
    wtarr = utils.seq2mat(wtseq, seq_dict)

    #keep only the wild-type character's activity at each position
    wt_activity = np.transpose(wtarr) * (np.array(output_activity_df[ct_cols]))

    #sum this to get total
    wt_activity2 = wt_activity.sum(axis=1)
    delta_activity = output_activity_df.subtract(pd.Series(wt_activity2),
                                                 axis=0)
    if return_profile:
        #first find mutation rate according to formula in SI text
        profile_delta_activity = mut_rate['mut']*np.sum(
            (1-np.transpose(wtarr))*np.array(\
            freq[freq_cols])*np.array(delta_activity),axis=1)
        #format into dataframe
        output_df = pd.DataFrame()
        output_df['pos'] = range(start,
                                 start + len(profile_delta_activity.index))
        output_df['mut_activity'] = profile_delta_activity
        return output_df
    else:
        #just add pos column and rename counts columns to activity columns
        output_df = pd.DataFrame(delta_activity)
        output_df.insert(0, 'pos',
                         range(start, start + len(delta_activity.index)))

        activity_col_dict = {x:'activity_' + x.split('_')[-1] \
            for x in delta_activity.columns if 'ct_' in x}
        output_df = output_df.rename(columns=activity_col_dict)
        return output_df