Example #1
    def _input_checks(self):

        # data_df validation
        if self.data_df is None:
            raise ControlledError(
                "The Predictive Info class requires a pandas dataframe as the input dataframe. Entered data_df was 'None'."
            )

        elif self.data_df is not None:
            check(
                isinstance(self.data_df, pd.DataFrame),
                'type(data_df) = %s; must be a pandas dataframe ' %
                type(self.data_df))

        # validate data_df
        check(
            pd.DataFrame.equals(self.data_df,
                                qc.validate_dataset(self.data_df)),
            " Input dataframe fails quality control, please ensure input dataframe has the correct format of an mpathic dataframe "
        )

        # model validation
        if self.model_df is None:
            raise ControlledError(
                "The Predictive Info class requires a pandas dataframe as the input model dataframe. Entered model_df was 'None'."
            )

        elif self.model_df is not None:
            check(
                isinstance(self.model_df, pd.DataFrame),
                'type(model_df) = %s; must be a pandas dataframe ' %
                type(self.model_df))

        # validate model df
        check(
            pd.DataFrame.equals(self.model_df,
                                qc.validate_model(self.model_df)),
            " Model dataframe failed quality control, \
                                please ensure input model dataframe has the correct format of an mpathic dataframe "
        )

        # check that start is an integer
        check(isinstance(self.start, int),
              'type(start) = %s; must be of type int ' % type(self.start))

        check(self.start >= 0,
              "start = %d must be a non-negative integer " % self.start)

        if self.end is not None:
            check(isinstance(self.end, int),
                  'type(end) = %s; must be of type int ' % type(self.end))

        # check that err is a boolean
        check(isinstance(self.err, bool),
              'type(err) = %s; must be of type bool ' % type(self.err))

        # check that coarse_graining_level is an integer
        check(
            isinstance(self.coarse_graining_level, int),
            'type(coarse_graining_level) = %s; must be of type int ' %
            type(self.coarse_graining_level))
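
All of these validations funnel through a small check helper together with ControlledError. A minimal sketch of what such a pair might look like, assuming ControlledError is a plain Exception subclass and that check simply raises it when the condition is false; the actual mpathic utilities may differ:

class ControlledError(Exception):
    """Stand-in for mpathic's ControlledError (assumed to be a plain Exception subclass)."""
    pass


def check(condition, message):
    # Raise a ControlledError carrying the supplied message when the condition
    # fails; do nothing otherwise. This mirrors how the checks above are written.
    if not condition:
        raise ControlledError(message)
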
Example #2
File: Models.py  Project: atareen/mpathic
    def __init__(self,model_df):
        """
        Constructor takes model parameters in the form of a model dataframe
        """
        model_df = qc.validate_model(model_df.copy(), fix=True)
        seqtype, modeltype = qc.get_model_type(model_df)
        if modeltype != 'MAT':
            raise SortSeqError('Invalid modeltype: %s' % modeltype)

        seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype=modeltype)
        self.seqtype = seqtype
        self.seq_dict = seq_dict
        self.inv_dict = inv_dict
        self.df = model_df
        self.length = model_df.shape[0]

        # Extract matrix part of model dataframe
        headers = qc.get_cols_from_df(model_df,'vals')
        self.matrix = np.transpose(np.array(model_df[headers]))
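
For orientation, a hypothetical usage sketch of this constructor, assuming it belongs to Models.LinearModel (as used in Example #4), that .evaluate() accepts a pandas Series of sequences, and that a 'pos' plus 'val_*' column layout (as produced at the end of Example #5) satisfies qc.validate_model; these specifics are assumptions, not confirmed by the snippet:

import pandas as pd

# Illustrative toy 3-position DNA matrix model (column layout assumed).
toy_model_df = pd.DataFrame({
    'pos':   [0, 1, 2],
    'val_A': [0.0, -1.2, 0.3],
    'val_C': [0.5,  0.0, -0.7],
    'val_G': [-0.2, 0.8, 0.0],
    'val_T': [0.1, -0.4, 1.1],
})

model = Models.LinearModel(toy_model_df)             # assumed class name, per Example #4
print(model.length)                                  # 3 positions
scores = model.evaluate(pd.Series(['ACG', 'TTT']))   # sequences must match model.length
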
Example #3
    def _input_check(self):
        """
        private method that validates all parameters
        """

        # check that input df is of type pandas dataframe
        if self.df is None:
            raise ControlledError(
                "Simulate Sort requires a pandas dataframe as the input dataframe. Entered df was 'None'."
            )
        elif self.df is not None:
            check(isinstance(self.df, pd.DataFrame),
                  'type(df) = %s; must be a pandas dataframe ' % type(self.df))

            # validate dataset
            check(
                pd.DataFrame.equals(self.df, qc.validate_dataset(self.df)),
                " Input dataframe failed quality control, \
                  please ensure input dataset has the correct format of an mpathic dataframe "
            )

        # check model dataframe
        if self.mp is None:
            raise ControlledError(
                "Simulate Sort requires a pandas dataframe as the model input. Entered model df was 'None'."
            )
        elif self.mp is not None:
            check(isinstance(self.mp, pd.DataFrame),
                  'type(mp) = %s; must be a pandas dataframe ' % type(self.mp))

            # validate dataset
            check(
                pd.DataFrame.equals(self.mp, qc.validate_model(self.mp)),
                " Model dataframe failed quality control, \
                  please ensure model has the correct format of an mpathic model dataframe "
            )

        # check noisetype is string
        check(isinstance(self.noisetype, str),
              'type(noisetype) = %s; must be a string ' % type(self.noisetype))

        # check noisetype is valid
        valid_noisetype_values = ['LogNormal', 'Normal', 'None', 'Plasmid']
        check(
            self.noisetype in valid_noisetype_values,
            'noisetype = %s; must be in %s' %
            (self.noisetype, valid_noisetype_values))

        # ensure that npar is type list
        check(isinstance(self.npar, list),
              'type(npar) = %s; must be a list ' % type(self.npar))

        # for valid choice of noisetype, pick appropriate noise parameters
        if self.noisetype == 'Normal':
            if len(self.npar) != 1:
                raise SortSeqError(
                    'For a normal noise model, there must be one input parameter (width of normal distribution)'
                )

        if self.noisetype == 'LogNormal':
            if len(self.npar) != 2:
                raise SortSeqError(
                    'For a LogNormal noise model, there must be two input parameters'
                )

        # ensure nbins is valid
        check(isinstance(self.nbins, int),
              'type(nbins) = %s; must be of type int ' % type(self.nbins))
        check(
            self.nbins > 1,
            'number of bins must be greater than 1, entered bins = %d' %
            self.nbins)

        # sequence library should be boolean
        check(
            isinstance(self.sequence_library, bool),
            'type(sequence_library) = %s; must be of type bool ' %
            type(self.sequence_library))

        # make sure start is of type int
        check(isinstance(self.start, int),
              'type(start) = %s; must be of type int ' % type(self.start))

        # make sure end is of type int
        if self.end is not None:
            check(isinstance(self.end, int),
                  'type(end) = %s; must be of type int ' % type(self.end))

        # make sure chunksize is of type int
        if self.chunksize is not None:
            check(isinstance(self.chunksize, int),
                  'type(chunksize) = %s; must be of type int ' %
                  type(self.chunksize))
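
For illustration only, a sketch of how the noisetype/npar pairs validated above might map onto numpy samplers (Normal takes one width parameter, LogNormal two); this is an assumption about intent, not mpathic's actual noise implementation, and the 'Plasmid' option is not sketched:

import numpy as np

def add_noise(values, noisetype, npar, rng=None):
    # Hypothetical helper: perturb model scores according to the validated
    # noise settings. Parameter meanings follow the checks above.
    rng = np.random.default_rng() if rng is None else rng
    values = np.asarray(values, dtype=float)
    if noisetype == 'Normal':
        return values + rng.normal(0.0, npar[0], size=values.shape)
    if noisetype == 'LogNormal':
        return values * rng.lognormal(npar[0], npar[1], size=values.shape)
    return values  # 'None' leaves the scores unchanged
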
Example #4
    def __init__(self, model_df, contig_list, numsites=10, verbose=False):

        self.sitelist_df = None
        # Determine type of string from model
        qc.validate_model(model_df)
        seqtype, modeltype = qc.get_model_type(model_df)
        seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype=modeltype)

        # Check that all characters are from the correct alphabet
        alphabet = qc.seqtype_to_alphabet_dict[seqtype]
        search_string = r"[^%s]" % alphabet
        for contig_str, contig_name, pos_offset in contig_list:
            if re.search(search_string, contig_str):
                raise SortSeqError( \
                    'Invalid character for seqtype %s found in %s.' % \
                    (seqtype, contig_name))

        # Create model object to evaluate on seqs
        if modeltype == 'MAT':
            model_obj = Models.LinearModel(model_df)
        elif modeltype == 'NBR':
            model_obj = Models.NeighborModel(model_df)

        # Create list of dataframes, one for each contig
        seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
        L = model_obj.length
        sitelist_df = pd.DataFrame( \
            columns=['val', seq_col, 'left', 'right', 'ori', 'contig'])
        for contig_str, contig_name, pos_offset in contig_list:
            if len(contig_str) < L:
                continue
            this_df = pd.DataFrame( \
                columns=['val', seq_col, 'left', 'right', 'ori', 'contig'])
            num_sites = len(contig_str) - L + 1
            poss = np.arange(num_sites).astype(int)
            this_df['left'] = poss + pos_offset
            this_df['right'] = poss + pos_offset + L - 1
            # this_df[seq_col] = [contig_str[i:(i+L)] for i in poss]
            this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
            this_df['ori'] = '+'
            this_df['contig'] = contig_name
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

            # If scanning DNA, scan reverse-complement as well
            if seqtype == 'dna':
                # this_df[seq_col] = [qc.rc(s) for s in this_df[seq_col]]
                this_df[seq_col] = fast.seq2sitelist(contig_str, L,
                                                     rc=True)  # Cython
                this_df['ori'] = '-'
                this_df['val'] = model_obj.evaluate(this_df[seq_col])
                sitelist_df = pd.concat([sitelist_df, this_df],
                                        ignore_index=True)

            # Sort by value and reindex
            sitelist_df.sort_values(by='val', ascending=False, inplace=True)
            sitelist_df.reset_index(drop=True, inplace=True)

            # Crop list at numsites
            if sitelist_df.shape[0] > numsites:
                sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

            if verbose:
                print('.', end='')
                sys.stdout.flush()

        if verbose:
            print('')
            sys.stdout.flush()

        # If no sites were found, raise error
        if sitelist_df.shape[0] == 0:
            raise SortSeqError( \
                'No full-length sites found within provided contigs.')

        sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
        #return sitelist_df
        self.sitelist_df = sitelist_df
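
The Cython call fast.seq2sitelist is the hot path in this scan; the commented-out list comprehensions above show its pure-Python equivalent. A small sketch of such a fallback, with the rc keyword inferred from the rc=True call (the real Cython signature and behavior may differ):

_RC = str.maketrans('ACGTacgt', 'TGCAtgca')

def seq2sitelist_py(seq, L, rc=False):
    # Slide a window of length L across seq; optionally reverse-complement
    # each window (DNA only), matching how the '-' orientation rows are built.
    sites = [seq[i:i + L] for i in range(len(seq) - L + 1)]
    if rc:
        sites = [s.translate(_RC)[::-1] for s in sites]
    return sites

# e.g. seq2sitelist_py('ACGTT', 3) -> ['ACG', 'CGT', 'GTT']
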
Example #5
    def __init__(self,
                 df,
                 lm='ER',
                 modeltype='MAT',
                 LS_means_std=None,
                 db=None,
                 iteration=30000,
                 burnin=1000,
                 thin=10,
                 runnum=0,
                 initialize='LS',
                 start=0,
                 end=None,
                 foreground=1,
                 background=0,
                 alpha=0.0,
                 pseudocounts=1,
                 drop_library=False,
                 verbose=False,
                 tm=None):

        # set attributes
        self.df = df
        self.lm = lm
        self.modeltype = modeltype
        self.LS_means_std = LS_means_std
        self.db = db
        self.iteration = iteration
        self.burnin = burnin
        self.thin = thin
        self.runnum = runnum
        self.initialize = initialize
        self.start = start
        self.end = end
        self.foreground = foreground
        self.background = background
        self.alpha = alpha
        self.pseudocounts = pseudocounts
        self.drop_library = drop_library
        self.verbose = verbose
        self.tm = tm

        # output df
        self.output_df = None

        # validate parameters
        self._input_checks()

        # Determine dictionary
        seq_cols = qc.get_cols_from_df(df, 'seqs')
        if not len(seq_cols) == 1:
            raise SortSeqError('Dataframe has multiple seq cols: %s' %
                               str(seq_cols))
        dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

        seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
        '''Check to make sure the chosen dictionary type correctly describes
             the sequences. A weakness of this test is that if you have a DNA
             sequence but choose a protein dictionary, you will still pass it,
             because A, C, G, T are also valid amino acids.'''
        # set name of sequences column based on type of sequence
        type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
        seq_col_name = type_name_dict[dicttype]
        lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype,
                                                       modeltype='MAT')
        # wtseq = utils.profile_counts(df.copy(),dicttype,return_wtseq=True,start=start,end=end)
        # wt_seq_dict_list = [{inv_dict[np.mod(i+1+seq_dict[w],len(seq_dict))]:i for i in range(len(seq_dict)-1)} for w in wtseq]
        par_seq_dict = {
            v: k
            for v, k in seq_dict.items() if k != (len(seq_dict) - 1)
        }
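        # Note (added for clarity): par_seq_dict is simply seq_dict with the
        # entry mapped to the last index (len(seq_dict) - 1) removed, i.e. the
        # gauge-reference character. For example, if seq_dict were
        # {'A': 0, 'C': 1, 'G': 2, 'T': 3} (an illustrative assumption about
        # choose_dict's layout), par_seq_dict would be {'A': 0, 'C': 1, 'G': 2}.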
        # drop any rows with ct = 0
        df = df[df.loc[:, 'ct'] != 0]
        df.reset_index(drop=True, inplace=True)

        # If the sequences are not all the same length, print an error but continue
        if len(set(df[seq_col_name].apply(len))) > 1:
            sys.stderr.write('Lengths of all sequences are not the same!\n')
        # select target sequence region
        df.loc[:, seq_col_name] = df.loc[:, seq_col_name].str.slice(start, end)
        df = utils.collapse_further(df)
        col_headers = utils.get_column_headers(df)
        # make sure all counts are ints
        df[col_headers] = df[col_headers].astype(int)
        # create vector of column names
        val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        df.reset_index(inplace=True, drop=True)
        # Drop any sequences with incorrect length
        if not end:
            '''If no value for the end of the sequence was supplied, assume the
                first seq is the correct length'''
            seqL = len(df[seq_col_name][0]) - start
        else:
            seqL = end - start
        df = df[df[seq_col_name].apply(len) == (seqL)]
        df.reset_index(inplace=True, drop=True)
        # Do something different for each type of learning method (lm)
        if lm == 'ER':
            if modeltype == 'NBR':
                emat = self.Markov(df,
                                   dicttype,
                                   foreground=foreground,
                                   background=background,
                                   pseudocounts=pseudocounts)
            else:
                emat = self.Berg_von_Hippel(df,
                                            dicttype,
                                            foreground=foreground,
                                            background=background,
                                            pseudocounts=pseudocounts)

        if lm == 'PR':
            emat = self.convex_opt(df, seq_dict, inv_dict, col_headers, tm=tm, \
                                   dicttype=dicttype, modeltype=modeltype)
        if lm == 'LS':
            '''First check that, if we don't have a penalty for ridge regression,
                we at least have all possible base values so that the analysis
                will not fail'''
            if LS_means_std:  # If user supplied preset means and std for each bin
                means_std_df = io.load_meanstd(LS_means_std)

                # change bin number to 'ct_number' and then use as index
                labels = list(means_std_df['bin'].apply(self.add_label))
                std = means_std_df['std']
                std.index = labels
                # Change Weighting of each sequence by dividing counts by bin std
                df[labels] = df[labels].div(std)
                means = means_std_df['mean']
                means.index = labels
            else:
                means = None
            # drop all rows without counts
            df['ct'] = df[col_headers].sum(axis=1)
            df = df[df.ct != 0]
            df.reset_index(inplace=True, drop=True)
            ''' For sort-seq experiments, bin_0 is library only and isn't the lowest
                expression, even though it will be calculated as such if we proceed.
                Therefore, if drop_library is passed, drop this column from the analysis.'''
            if drop_library:
                try:
                    # drop the library-only ct_0 column (axis=1 selects columns)
                    df.drop('ct_0', axis=1, inplace=True)
                    col_headers = utils.get_column_headers(df)
                    if len(col_headers) < 2:
                        raise SortSeqError(
                            '''After dropping library there are no longer enough 
                            columns to run the analysis''')
                except:
                    raise SortSeqError(
                        '''drop_library option was passed, but no ct_0
                        column exists''')
            # parameterize sequences into 3xL vectors
            print('init learn model: \n')
            print(par_seq_dict)
            print('dict: ', dicttype)
            raveledmat, batch, sw = utils.genweightandmat(df,
                                                          par_seq_dict,
                                                          dicttype,
                                                          means=means,
                                                          modeltype=modeltype)
            # Use ridge regression to find matrix.
            emat = self.Compute_Least_Squares(raveledmat,
                                              batch,
                                              sw,
                                              alpha=alpha)

        if lm == 'IM':
            seq_mat, wtrow = numerics.dataset2mutarray(df.copy(), modeltype)
            # this is also an MCMC routine, do the same as above.
            if initialize == 'rand':
                if modeltype == 'MAT':
                    emat_0 = utils.RandEmat(len(df[seq_col_name][0]),
                                            len(seq_dict))
                elif modeltype == 'NBR':
                    emat_0 = utils.RandEmat(
                        len(df[seq_col_name][0]) - 1, len(seq_dict))
            elif initialize == 'LS':

                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_0_df = LearnModel(df.copy(),
                                       lm='LS',
                                       modeltype=modeltype,
                                       alpha=alpha,
                                       start=0,
                                       end=None,
                                       verbose=verbose).output_df
                emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))
                # pymc doesn't take sparse mat
            elif initialize == 'PR':
                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_0_df = LearnModel(df.copy(),
                                       lm='PR',
                                       modeltype=modeltype,
                                       start=0,
                                       end=None).output_df
                emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))
            emat = self.MaximizeMI_memsaver(seq_mat,
                                            df.copy(),
                                            emat_0,
                                            wtrow,
                                            db=db,
                                            iteration=iteration,
                                            burnin=burnin,
                                            thin=thin,
                                            runnum=runnum,
                                            verbose=verbose)

        # We have inferred our matrix.
        # Now format the energy matrices to get them ready to output
        if (lm == 'IM' or lm == 'memsaver'):
            if modeltype == 'NBR':
                try:
                    emat_typical = gauge.fix_neighbor(np.transpose(emat))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat)
            elif modeltype == 'MAT':
                try:
                    emat_typical = gauge.fix_matrix(np.transpose(emat))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat)

        elif lm == 'ER':
            '''The emat for this format is currently transposed compared to other
            formats; it is also already a data frame with columns [pos, val_...]'''
            if modeltype == 'NBR':
                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_typical = emat[emat_cols]
            else:
                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_typical = emat[emat_cols]
                try:
                    emat_typical = (gauge.fix_matrix((np.array(emat_typical))))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = emat_typical

        elif (lm == 'MK'):
            '''The model is a first-order Markov model and its gauge does not need
                to be changed.'''

        elif lm == 'PR':
            emat_typical = np.transpose(emat)
        else:  # must be Least squares
            emat_typical = utils.emat_typical_parameterization(
                emat, len(seq_dict))
            if modeltype == 'NBR':
                try:
                    emat_typical = gauge.fix_neighbor(
                        np.transpose(emat_typical))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat_typical)
            elif modeltype == 'MAT':
                try:
                    emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat_typical)
        em = pd.DataFrame(emat_typical)
        em.columns = val_cols
        # add position column
        if modeltype == 'NBR':
            pos = pd.Series(range(start, start - 1 + len(df[seq_col_name][0])),
                            name='pos')
        else:
            pos = pd.Series(range(start, start + len(df[seq_col_name][0])),
                            name='pos')
        output_df = pd.concat([pos, em], axis=1)
        # Validate model and return
        output_df = qc.validate_model(output_df, fix=True)
        self.output_df = output_df
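
Finally, a hypothetical end-to-end usage sketch, mirroring the internal LearnModel(..., lm='LS', ...) call made above when initialize='LS'; the io.load_dataset loader and the file name are assumptions for illustration:

# Assumed usage: learn a matrix model from a sorted-library dataset by least squares.
dataset_df = io.load_dataset('sorted_library.txt')   # hypothetical loader and file
learned = LearnModel(dataset_df, lm='LS', modeltype='MAT', alpha=0.1,
                     start=0, end=None, verbose=True)
model_df = learned.output_df   # validated model dataframe with 'pos' and 'val_*' columns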