예제 #1
0
파일: Models.py 프로젝트: atareen/mpathic
    def evaluate(self,seqs):
        """
        Evaluate the matrix model on a collection of sequences.

        Parameters
        ----------
        seqs : list, pd.Series, or pd.DataFrame
            Sequences to score. For a DataFrame, the sequences are read from
            the first column reported by qc.get_cols_from_df(seqs, 'seqs').

        Returns
        -------
        Whatever self.evaluate_on_seqarray returns for the encoded sequences.

        Raises
        ------
        SortSeqError
            If seqs is not one of the supported containers, if it contains no
            sequences, or if the length of the first sequence differs from
            self.length.
        """
        # Check seqs container
        if isinstance(seqs,pd.DataFrame):
            seq_col = qc.get_cols_from_df(seqs,'seqs')[0]
            seqs_to_use = list(seqs[seq_col])
        elif isinstance(seqs,(list,pd.Series)):
            seqs_to_use = list(seqs)
        else:
            raise SortSeqError('Sequences must be input as a list, pd.Series, or pd.DataFrame')

        # An empty container used to surface as a bare IndexError from the
        # length check below; report it in the module's own error type.
        if not seqs_to_use:
            raise SortSeqError('No sequences were provided')

        # Check length (only the first sequence is inspected)
        if len(seqs_to_use[0]) != self.length:
            raise SortSeqError(\
                'Energy Matrix Length does not equal Sequence Length')

        # fast.seqs2array_for_matmodel expects seqtype to be bytes.
        # NOTE: the converted value is stored back on the instance, as before.
        if not isinstance(self.seqtype,bytes):
            self.seqtype = str(self.seqtype).encode('utf-8')

        # Encode every sequence that is not already bytes. Deciding per
        # element keeps str() from being applied to a bytes object (which
        # would yield "b'...'") when the container mixes str and bytes.
        seqs_to_use = [
            s if isinstance(s,bytes) else str(s).encode('utf-8')
            for s in seqs_to_use
        ]

        # Compute seqmats
        seqarray = fast.seqs2array_for_matmodel(seqs_to_use, self.seqtype)

        # Compute and return values
        return self.evaluate_on_seqarray(seqarray)
예제 #2
0
파일: numerics.py 프로젝트: atareen/mpathic
def dataset2mutarray_withwtseq(dataset_df, modeltype, wtseq, chunksize=1000):
    """
    Encode a dataset as a sparse array of mutations relative to wtseq.

    Parameters
    ----------
    dataset_df : dataframe
        Dataset whose sequence column is located with qc.get_cols_from_df.
    modeltype : str
        'MAT' or 'NBR'; selects the fast.seqs2array_* encoder.
    wtseq : str
        Wild-type sequence; the features it sets are zeroed in every row.
    chunksize : int
        Number of dataset rows encoded per pass. Must be >= 1.

    Returns
    -------
    (mutarray_csr, wtrow)
        mutarray_csr is a CSR matrix with one row per dataset row and one
        column per encoded feature; wtrow is the flattened boolean encoding
        of wtseq.

    Raises
    ------
    SortSeqError
        If modeltype is unknown or chunksize is not positive.
    """
    # Determine the type of model and set seq2array function appropriately
    if modeltype == 'MAT':
        seqs2array = fast.seqs2array_for_matmodel
    elif modeltype == 'NBR':
        seqs2array = fast.seqs2array_for_nbrmodel
    else:
        raise SortSeqError('Unknown model type: %s' % modeltype)

    # A non-positive chunksize can never advance through the dataset; the
    # previous while-loop would spin forever in that case.
    if chunksize < 1:
        raise SortSeqError('chunksize must be at least 1; got %s' % chunksize)

    # Determine seqtype, etc.
    seqcol = qc.get_cols_from_df(dataset_df, 'seqs')[0]
    seqtype = qc.colname_to_seqtype_dict[seqcol]

    # Compute the wt sequence as a flat boolean feature mask
    wtrow = seqs2array([wtseq], seq_type=seqtype).ravel().astype(bool)
    numfeatures = len(wtrow)
    numrows = dataset_df.shape[0]

    # Fill in mutarray (a lil matrix) chunk by chunk
    mutarray_lil = lil_matrix((numrows, numfeatures), dtype=int)
    for startrow in range(0, numrows, chunksize):
        stoprow = min(startrow + chunksize, numrows)

        # Compute seqarray for this chunk
        seqlist = list(dataset_df[seqcol][startrow:stoprow])
        seqarray = seqs2array(seqlist, seq_type=seqtype)

        # Remove wt entries (work on a copy so the encoder's output is
        # never modified in place)
        tmp = seqarray.copy()
        tmp[:, wtrow] = 0

        # Store results from this chunk
        mutarray_lil[startrow:stoprow, :] = tmp

    # Convert to csr matrix
    mutarray_csr = mutarray_lil.tocsr()

    # Return mutarray as well as binary representation of wt seq
    return mutarray_csr, wtrow
예제 #3
0
def main(filelist_df, tags_df=None, indir='./', seq_type=None):
    """ Merges datasets listed in the filelist_df dataframe

    Parameters
    ----------
    filelist_df : dataframe
        One row per input file, with a 'file' column (file name) and a 'bin'
        column (key under which the loaded dataset is merged).
    tags_df : dataframe or None
        Optional tag key with a 'tag' column and exactly one sequence
        column. When given, each row of the merged dataset gets the sequence
        looked up from its tag.
    indir : str
        Directory that the 'file' entries are relative to.
    seq_type : str or None
        Forwarded to io.load_dataset.

    Returns
    -------
    The merged (and validated) dataset dataframe.

    Raises
    ------
    SortSeqError
        If tags_df does not have exactly one sequence column, or if any
        looked-up sequence is not a string.
    """
    # Local import so this function does not depend on the module header.
    import os

    # Validate filelist
    qc.validate_filelist(filelist_df)

    # Read datasets into dictionary indexed by bin number
    dataset_df_dict = {}
    for _, row in filelist_df.iterrows():
        # os.path.join inserts the separator when indir lacks a trailing
        # one; plain string concatenation silently produced a wrong path.
        fn = os.path.join(indir, row['file'])
        b = row['bin']

        # Autodetect fasta, fastq, or text file based on file extension
        if re.search(fasta_filename_patterns, fn):
            file_type = 'fasta'
        elif re.search(fastq_filename_patterns, fn):
            file_type = 'fastq'
        else:
            file_type = 'text'
        dataset_df_dict[b] = io.load_dataset(fn, file_type=file_type,
                                             seq_type=seq_type)

    # Merge datasets into one
    out_df = merge_datasets(dataset_df_dict)

    # Add seqs if given tags_df
    if tags_df is not None:
        qc.validate_tagkey(tags_df)
        tag_col = 'tag'

        # Test to make sure all tags in dataset are a subset of tags
        data_tags = set(out_df[tag_col])
        all_tags = set(tags_df[tag_col])
        if not (data_tags <= all_tags):
            sys.stderr.write('Some tags probably could not be identified.')

        # Get name of seq column
        seq_cols = qc.get_cols_from_df(tags_df, 'seqs')
        if not len(seq_cols) == 1:
            raise SortSeqError('Multiple seq columns; exaclty 1 required.')
        seq_col = seq_cols[0]

        # Set tag to be index column of dataframe
        tags_df = tags_df.set_index(tag_col)

        # Add seqs corresponding to each tag (looked up once and reused)
        tags = out_df[tag_col]
        seqs = tags_df[seq_col][tags].values
        if not all(isinstance(x, str) for x in seqs):
            raise SortSeqError('Some looked-up seqs are not strings.')
        out_df[seq_col] = seqs

    qc.validate_dataset(out_df)
    return out_df
예제 #4
0
파일: numerics.py 프로젝트: atareen/mpathic
def dataset2seqarray(dataset_df, modeltype):
    """
    Encode every sequence of a dataset with the encoder for modeltype.

    modeltype must be 'MAT' or 'NBR'; any other value raises SortSeqError.
    The sequence column is the first one reported by qc.get_cols_from_df,
    and its sequence type is looked up in qc.colname_to_seqtype_dict.
    Returns the array produced by the selected fast.seqs2array_* function.
    """
    # Reject unsupported model types up front
    if modeltype not in ('MAT', 'NBR'):
        raise SortSeqError('Unknown model type: %s' % modeltype)

    # Pick the encoder that matches the model type
    if modeltype == 'MAT':
        encode = fast.seqs2array_for_matmodel
    else:
        encode = fast.seqs2array_for_nbrmodel

    seq_column = qc.get_cols_from_df(dataset_df, 'seqs')[0]
    seq_kind = qc.colname_to_seqtype_dict[seq_column]
    sequences = list(dataset_df[seq_column])
    return encode(sequences, seq_type=seq_kind)
예제 #5
0
파일: Models.py 프로젝트: atareen/mpathic
    def __init__(self,model_df):
        """
        Build the model from a model dataframe.

        A copy of model_df is validated (with fix=True) and stored as
        self.df; the caller's dataframe is not passed to the validator.
        Raises SortSeqError when the dataframe does not describe a 'MAT'
        model. self.matrix holds the value columns, transposed.
        """
        validated_df = qc.validate_model(model_df.copy(),fix=True)
        seqtype, modeltype = qc.get_model_type(validated_df)
        if modeltype != 'MAT':
            raise SortSeqError('Invalid modeltype: %s'%modeltype)

        # Character <-> index lookup tables for this sequence type
        dicts = utils.choose_dict(seqtype,modeltype=modeltype)
        forward_dict, inverse_dict = dicts

        self.seqtype = seqtype
        self.seq_dict = forward_dict
        self.inv_dict = inverse_dict
        self.df = validated_df
        self.length = validated_df.shape[0]

        # Extract matrix part of model dataframe
        value_columns = qc.get_cols_from_df(validated_df,'vals')
        self.matrix = np.array(validated_df[value_columns]).T
예제 #6
0
파일: Models.py 프로젝트: atareen/mpathic
    def evaluate(self,seqs):
        """
        Evaluate the neighbor model on a collection of sequences.

        Parameters
        ----------
        seqs : list, pd.Series, or pd.DataFrame
            Sequences to score. For a DataFrame, the sequences are read from
            the first column reported by qc.get_cols_from_df(seqs, 'seqs').

        Returns
        -------
        Whatever self.evaluate_on_seqarray returns for the encoded sequences.

        Raises
        ------
        SortSeqError
            If seqs is not one of the supported containers, if it contains no
            sequences, or if the length of the first sequence differs from
            self.length.
        """
        # Check seqs container
        if isinstance(seqs,pd.DataFrame):
            seq_col = qc.get_cols_from_df(seqs,'seqs')[0]
            seqs_to_use = list(seqs[seq_col])
        elif isinstance(seqs,(list,pd.Series)):
            seqs_to_use = list(seqs)
        else:
            raise SortSeqError('Sequences must be input as a list, pd.Series, or pd.DataFrame')

        # An empty container used to surface as a bare IndexError from the
        # length check below; report it in the module's own error type.
        if not seqs_to_use:
            raise SortSeqError('No sequences were provided')

        # Check length (only the first sequence is inspected)
        if len(seqs_to_use[0]) != self.length:
            raise SortSeqError(\
                'Energy Matrix Length does not equal Sequence Length')

        # The compiled encoder is called with bytes for the sequence type.
        # NOTE: the converted value is stored back on the instance, as before.
        if not isinstance(self.seqtype,bytes):
            self.seqtype = str(self.seqtype).encode('utf-8')

        # Encode every sequence that is not already bytes. Deciding per
        # element keeps str() from being applied to a bytes object (which
        # would yield "b'...'") when the container mixes str and bytes.
        seqs_to_use = [
            s if isinstance(s,bytes) else str(s).encode('utf-8')
            for s in seqs_to_use
        ]

        # Compute seqmats
        seqarray = fast.seqs2array_for_nbrmodel(seqs_to_use,self.seqtype)

        # Compute and return values
        return self.evaluate_on_seqarray(seqarray)
예제 #7
0
    def __init__(self,
                 data_df,
                 model_df,
                 start=0,
                 end=None,
                 err=False,
                 coarse_graining_level=0,
                 rsquared=False,
                 return_freg=False):
        """
        Estimate the mutual information between a model's predictions and a
        dataset, storing the results in self.out_MI and self.out_std.

        Parameters
        ----------
        data_df : dataframe
            Dataset with exactly one sequence column. When start/end are
            given, its sequence column is sliced in place (self.data_df is
            the same object as the argument).
        model_df : dataframe
            Model dataframe; its type is read with qc.get_model_type.
        start, end : int, int or None
            Bounds passed to str.slice on the sequence column.
        err : bool
            When true, the spread of the estimate is computed from 15
            half-size random subsamples; otherwise it is NaN.
        coarse_graining_level :
            Forwarded to EstimateMutualInfoforMImax.alt4.
        rsquared : bool
            When true, both outputs are transformed by 1 - 2**(-2 * x).
        return_freg :
            Forwarded to EstimateMutualInfoforMImax.alt4. When true, alt4's
            second return value is received but not stored.

        Raises
        ------
        SortSeqError
            If data_df does not have exactly one sequence column, or if the
            sliced sequence length does not match the model length.
        """
        self.data_df = data_df
        self.model_df = model_df
        self.start = start
        self.end = end
        self.err = err
        self.coarse_graining_level = coarse_graining_level

        self.out_MI = None
        self.out_std = None

        self._input_checks()

        dicttype, modeltype = qc.get_model_type(self.model_df)
        seq_cols = qc.get_cols_from_df(self.data_df, 'seqs')
        if not len(seq_cols) == 1:
            raise SortSeqError('Dataframe has multiple seq cols: %s' %
                               str(seq_cols))
        seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
        # set name of sequences column based on type of sequence
        type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
        seq_col_name = type_name_dict[dicttype]
        # Cut the sequences based on start and end, and then check if it makes sense
        if (self.start != 0 or self.end):
            self.data_df.loc[:, seq_col_name] = \
                self.data_df.loc[:, seq_col_name].str.slice(self.start,
                                                            self.end)
            if modeltype == 'MAT':
                if len(self.data_df.loc[0, seq_col_name]) != len(
                        self.model_df.loc[:, 'pos']):
                    print('predictive info class: BP lengths: ',
                          len(self.data_df.loc[0, seq_col_name]), " ",
                          len(self.model_df.loc[:, 'pos']))
                    raise SortSeqError(
                        'model length does not match dataset length')
            elif modeltype == 'NBR':
                # a neighbor model has one fewer position than the sequence
                if len(self.data_df.loc[0, seq_col_name]) != len(
                        self.model_df.loc[:, 'pos']) + 1:
                    raise SortSeqError(
                        'model length does not match dataset length')
        col_headers = utils.get_column_headers(self.data_df)
        if 'ct' not in self.data_df.columns:
            self.data_df['ct'] = data_df[col_headers].sum(axis=1)
            self.data_df = self.data_df[self.data_df.ct != 0]
        # Only when an explicit end is given is there a known target length
        # to filter on. The former no-end branch computed an unused length
        # through row label 0, which can be missing after the zero-count
        # filter above and would raise for no reason.
        if self.end:
            seqL = self.end - self.start
            self.data_df = self.data_df[self.data_df[seq_col_name].apply(len)
                                        == (seqL)]
        # make a numpy array out of the model data frame
        model_df_headers = [
            'val_' + str(inv_dict[i]) for i in range(len(seq_dict))
        ]
        model_matrix = np.array(self.model_df[model_df_headers])
        # now we evaluate the expression of each sequence according to the model.
        seq_mat, wtrow = numerics.dataset2mutarray(self.data_df.copy(),
                                                   modeltype)
        temp_df = self.data_df.copy()
        # attach the model's value for every sequence, then order rows by it
        temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
            model_matrix, seq_mat, wtrow)
        temp_sorted = temp_df.sort_values(by='val')
        temp_sorted.reset_index(inplace=True, drop=True)
        if return_freg:
            # freg is returned by alt4 but not used any further here
            MI, freg = EstimateMutualInfoforMImax.alt4(
                temp_sorted,
                coarse_graining_level=coarse_graining_level,
                return_freg=return_freg)
        else:
            MI = EstimateMutualInfoforMImax.alt4(
                temp_sorted,
                coarse_graining_level=coarse_graining_level,
                return_freg=return_freg)
        if not self.err:
            # np.NaN was removed in NumPy 2.0; np.nan works on all versions
            Std = np.nan
        else:
            data_df_for_sub = self.data_df.copy()
            sub_MI = np.zeros(15)
            for i in range(15):
                sub_df = data_df_for_sub.sample(
                    int(len(data_df_for_sub.index) / 2))
                sub_df.reset_index(inplace=True, drop=True)
                # The constructor yields an instance, not an (MI, std)
                # tuple, so read the estimate from the instance attribute.
                sub_MI[i] = PredictiveInfo(sub_df, model_df,
                                           err=False).out_MI
            Std = np.std(sub_MI) / np.sqrt(2)
        if rsquared:
            self.out_MI, self.out_std = (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
        else:
            self.out_MI, self.out_std = MI, Std
예제 #8
0
    def __init__(self,
                 df,
                 lm='ER',
                 modeltype='MAT',
                 LS_means_std=None,
                 db=None,
                 iteration=30000,
                 burnin=1000,
                 thin=10,
                 runnum=0,
                 initialize='LS',
                 start=0,
                 end=None,
                 foreground=1,
                 background=0,
                 alpha=0.0,
                 pseudocounts=1,
                 drop_library=False,
                 verbose=False,
                 tm=None):
        """
        Fit a model to a dataset and store it in self.output_df.

        Parameters
        ----------
        df : dataframe
            Dataset with exactly one sequence column and a 'ct' column.
        lm : str
            Learning method. 'ER', 'PR', 'LS' and 'IM' have a fitting branch
            below.
        modeltype : str
            'MAT' or 'NBR'.
        LS_means_std :
            When truthy, passed to io.load_meanstd to obtain per-bin means
            and standard deviations for the 'LS' method.
        db, iteration, burnin, thin, runnum :
            Forwarded to self.MaximizeMI_memsaver for the 'IM' method.
        initialize : str
            Starting matrix for 'IM': 'rand', 'LS' or 'PR'.
        start, end : int, int or None
            Bounds passed to str.slice on the sequence column.
        foreground, background, pseudocounts :
            Forwarded to self.Markov / self.Berg_von_Hippel for 'ER'.
        alpha :
            Forwarded to self.Compute_Least_Squares for 'LS'.
        drop_library : bool
            For 'LS', drop the 'ct_0' column before fitting.
        verbose :
            Forwarded to the 'IM' routines.
        tm :
            Forwarded to self.convex_opt for 'PR'.

        Raises
        ------
        SortSeqError
            If df does not have exactly one sequence column, or if
            drop_library is set and 'ct_0' is missing or too few count
            columns remain.
        """
        # set attributes
        self.df = df
        self.lm = lm
        self.modeltype = modeltype
        self.LS_means_std = LS_means_std
        self.db = db
        self.iteration = iteration
        self.burnin = burnin
        self.thin = thin
        self.runnum = runnum
        self.initialize = initialize
        self.start = start
        self.end = end
        self.foreground = foreground
        self.background = background
        self.alpha = alpha
        self.pseudocounts = pseudocounts
        self.drop_library = drop_library
        self.verbose = verbose
        self.tm = tm

        # output df
        self.output_df = None

        # validate parameters
        self._input_checks()

        # Determine dictionary
        seq_cols = qc.get_cols_from_df(df, 'seqs')
        if not len(seq_cols) == 1:
            raise SortSeqError('Dataframe has multiple seq cols: %s' %
                               str(seq_cols))
        dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

        seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
        # set name of sequences column based on type of sequence
        type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
        seq_col_name = type_name_dict[dicttype]
        lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype,
                                                       modeltype='MAT')
        # every entry of seq_dict except the one mapped to the last index
        par_seq_dict = {
            v: k
            for v, k in seq_dict.items() if k != (len(seq_dict) - 1)
        }
        # drop any rows with ct = 0
        df = df[df.loc[:, 'ct'] != 0]
        df.reset_index(drop=True, inplace=True)

        # If there are sequences of different lengths, then print error but continue
        if len(set(df[seq_col_name].apply(len))) > 1:
            sys.stderr.write('Lengths of all sequences are not the same!')
        # select target sequence region
        df.loc[:, seq_col_name] = df.loc[:, seq_col_name].str.slice(start, end)
        df = utils.collapse_further(df)
        col_headers = utils.get_column_headers(df)
        # make sure all counts are ints
        df[col_headers] = df[col_headers].astype(int)
        # create vector of column names
        val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        df.reset_index(inplace=True, drop=True)
        # Drop any sequences with incorrect length
        if not end:
            # No end was supplied: assume the first sequence has the correct
            # length. The sequences were already sliced from `start` above,
            # so `start` must not be subtracted a second time (doing so
            # discarded every row whenever start != 0).
            seqL = len(df[seq_col_name][0])
        else:
            seqL = end - start
        df = df[df[seq_col_name].apply(len) == (seqL)]
        df.reset_index(inplace=True, drop=True)
        # Do something different for each type of learning method (lm)
        if lm == 'ER':
            if modeltype == 'NBR':
                emat = self.Markov(df,
                                   dicttype,
                                   foreground=foreground,
                                   background=background,
                                   pseudocounts=pseudocounts)
            else:
                emat = self.Berg_von_Hippel(df,
                                            dicttype,
                                            foreground=foreground,
                                            background=background,
                                            pseudocounts=pseudocounts)

        if lm == 'PR':
            emat = self.convex_opt(df, seq_dict, inv_dict, col_headers, tm=tm, \
                                   dicttype=dicttype, modeltype=modeltype)
        if lm == 'LS':
            if LS_means_std:  # If user supplied preset means and std for each bin
                means_std_df = io.load_meanstd(LS_means_std)

                # change bin number to 'ct_number' and then use as index
                labels = list(means_std_df['bin'].apply(self.add_label))
                std = means_std_df['std']
                std.index = labels
                # Change Weighting of each sequence by dividing counts by bin std
                df[labels] = df[labels].div(std)
                means = means_std_df['mean']
                means.index = labels
            else:
                means = None
            # drop all rows without counts
            df['ct'] = df[col_headers].sum(axis=1)
            df = df[df.ct != 0]
            df.reset_index(inplace=True, drop=True)
            # For sort-seq experiments, bin_0 is library only and isn't the
            # lowest expression even though it would be treated as such if
            # we proceed. Therefore, if drop_library is passed, drop this
            # column from the analysis.
            if drop_library:
                if 'ct_0' not in df.columns:
                    raise SortSeqError(
                        '''drop_library option was passed, but no ct_0
                        column exists''')
                # axis=1 drops the column; the former call without an axis
                # looked for a row labelled 'ct_0' and therefore always
                # failed, and its bare except hid the real cause.
                df = df.drop('ct_0', axis=1)
                col_headers = utils.get_column_headers(df)
                if len(col_headers) < 2:
                    raise SortSeqError(
                        '''After dropping library there are no longer enough 
                            columns to run the analysis''')
            # parameterize sequences into vectors
            print('init learn model: \n')
            print(par_seq_dict)
            print('dict: ', dicttype)
            raveledmat, batch, sw = utils.genweightandmat(df,
                                                          par_seq_dict,
                                                          dicttype,
                                                          means=means,
                                                          modeltype=modeltype)
            # Use ridge regression to find matrix.
            emat = self.Compute_Least_Squares(raveledmat,
                                              batch,
                                              sw,
                                              alpha=alpha)

        if lm == 'IM':
            seq_mat, wtrow = numerics.dataset2mutarray(df.copy(), modeltype)
            # this is also an MCMC routine, do the same as above.
            if initialize == 'rand':
                if modeltype == 'MAT':
                    emat_0 = utils.RandEmat(len(df[seq_col_name][0]),
                                            len(seq_dict))
                elif modeltype == 'NBR':
                    emat_0 = utils.RandEmat(
                        len(df[seq_col_name][0]) - 1, len(seq_dict))
            elif initialize == 'LS':
                # seed with a least-squares fit of the already-sliced data
                emat_0_df = LearnModel(df.copy(),
                                       lm='LS',
                                       modeltype=modeltype,
                                       alpha=alpha,
                                       start=0,
                                       end=None,
                                       verbose=verbose).output_df
                emat_0 = np.transpose(np.array(emat_0_df[val_cols]))
                # pymc doesn't take sparse mat
            elif initialize == 'PR':
                emat_0_df = LearnModel(df.copy(),
                                       lm='PR',
                                       modeltype=modeltype,
                                       start=0,
                                       end=None).output_df
                emat_0 = np.transpose(np.array(emat_0_df[val_cols]))
            emat = self.MaximizeMI_memsaver(seq_mat,
                                            df.copy(),
                                            emat_0,
                                            wtrow,
                                            db=db,
                                            iteration=iteration,
                                            burnin=burnin,
                                            thin=thin,
                                            runnum=runnum,
                                            verbose=verbose)

        # We have inferred our matrix.
        # now format the energy matrices to get them ready to output.
        # Gauge fixing is best-effort: on failure a message is written to
        # stderr and the un-fixed matrix is used instead.
        if (lm == 'IM' or lm == 'memsaver'):
            if modeltype == 'NBR':
                try:
                    emat_typical = gauge.fix_neighbor(np.transpose(emat))
                except Exception:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat)
            elif modeltype == 'MAT':
                try:
                    emat_typical = gauge.fix_matrix(np.transpose(emat))
                except Exception:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat)

        elif lm == 'ER':
            # the emat for this format is currently transposed compared to
            # other formats; it is also already a data frame with columns
            # [pos,val_...]
            emat_typical = emat[val_cols]
            if modeltype != 'NBR':
                try:
                    emat_typical = (gauge.fix_matrix((np.array(emat_typical))))
                except Exception:
                    sys.stderr.write('Gauge Fixing Failed')

        elif (lm == 'MK'):
            # The model is a first order markov model and its gauge does not
            # need to be changed.
            # NOTE(review): nothing assigns emat_typical on this path, so
            # the DataFrame construction below would fail - confirm whether
            # 'MK' can still reach this constructor.
            pass

        elif lm == 'PR':
            emat_typical = np.transpose(emat)
        else:  # must be Least squares
            emat_typical = utils.emat_typical_parameterization(
                emat, len(seq_dict))
            if modeltype == 'NBR':
                try:
                    emat_typical = gauge.fix_neighbor(
                        np.transpose(emat_typical))
                except Exception:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat_typical)
            elif modeltype == 'MAT':
                try:
                    emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
                except Exception:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat_typical)
        em = pd.DataFrame(emat_typical)
        em.columns = val_cols
        # add position column (a neighbor model has one fewer position)
        if modeltype == 'NBR':
            pos = pd.Series(range(start, start - 1 + len(df[seq_col_name][0])),
                            name='pos')
        else:
            pos = pd.Series(range(start, start + len(df[seq_col_name][0])),
                            name='pos')
        output_df = pd.concat([pos, em], axis=1)
        # Validate model and store it
        output_df = qc.validate_model(output_df, fix=True)
        self.output_df = output_df