def _validate_pos_cols(df, fix=False):
    """
    Validates the 'pos' column in a given dataframe (if it exists).

    Positions must be integer-valued, must increase by exactly 1 from one row
    to the next, and must be nonnegative.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, an integer-valued 'pos' column stored with a
            non-integer dtype is cast to int in place; if False, such a
            column raises SortSeqError.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError if the column cannot be converted to numbers,
        holds non-integer values, is not consecutive, or starts below zero.
    """
    col = 'pos'
    if col not in df.columns:
        return df

    # Convert both ways up front so integer-valuedness can be tested below.
    # Only conversion failures are translated; anything else propagates.
    try:
        int_vals = df[col].values.astype(int)
        float_vals = df[col].values.astype(float)
    except (ValueError, TypeError, OverflowError):
        raise SortSeqError(
            'Cannot convert values in column %s to numbers.' % col)

    if not df[col].values.dtype == int:
        if not all(int_vals == float_vals):
            raise SortSeqError(
                'Positions cannot be interpreted as integers.')
        if not fix:
            raise SortSeqError(
                'Positions are not integers; set fix=True to fix.')
        df[col] = df[col].astype(int)

    # Consecutive means identical to arange(first, last + 1)
    first = df[col].iloc[0]
    last = df[col].iloc[-1]
    if not np.array_equal(df[col].values, np.arange(first, last + 1)):
        raise SortSeqError('Positions are not consecutive integers.')

    if first < 0:
        raise SortSeqError('Positions are not all nonnegative.')

    return df
def _validate_mut_cols(df, fix=False):
    """
    Validates contents of the 'mut' columns in a given dataframe.

    The columns checked are whatever get_cols_from_df(df, 'mut') returns.
    Each must hold floats in the closed interval [0.0, 1.0].

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, columns that can be converted to float are
            converted in place; if False, a non-float column raises.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError on non-numeric values, on non-float columns
        when fix=False, or on values outside [0.0, 1.0].
    """
    for col in get_cols_from_df(df, 'mut'):

        # Verify that freqs are floats
        if not df[col].values.dtype == float:

            # Only conversion failures are translated into SortSeqError
            try:
                float_vals = df[col].astype(float)
            except (ValueError, TypeError):
                raise SortSeqError('Non-numbers found in freqs.')

            # Changing the dtype requires permission
            if not fix:
                raise SortSeqError(
                    'Freqs are not floats; set fix=True to fix.')
            df[col] = float_vals

        # Make sure that all mut values are between 0 and 1
        if (not all(df[col] <= 1.0)) or (not all(df[col] >= 0.0)):
            raise SortSeqError(
                'Freq values outside [0.0, 1.0] encountered.')

    return df
def validate_file_for_reading(file_arg):
    """
    Checks that a specified file exists and is readable.

    Arguments:
        file_arg: either a file name (str) or an already-open, readable
            file-like object (anything exposing a read() method).

    Returns:
        A file handle open for reading. For a file name this is a newly
        opened handle that the caller must close; for a file object it is
        the object itself.

    Function:
        Raises a SortSeqError if the file is missing or unreadable, if the
        file object is closed, or if file_arg is neither kind of argument.
    """
    # If user passed file name
    if isinstance(file_arg, str):
        if not os.path.isfile(file_arg):
            raise SortSeqError('Cannot find file: %s' % file_arg)
        if not os.access(file_arg, os.R_OK):
            raise SortSeqError(
                'Can find but cannot read from file: %s' % file_arg)
        return open(file_arg, 'r')

    # If user passed a file object. Duck-typed on read() because the
    # builtin `file` type no longer exists on Python 3, and so that
    # in-memory buffers are accepted as well.
    if hasattr(file_arg, 'read'):
        if getattr(file_arg, 'closed', False):
            raise SortSeqError('File object is already closed.')
        return file_arg

    # Otherwise, throw error
    raise SortSeqError('file_arg is neigher a name or handle.')
def load(file_arg, file_type, **kwargs):
    """
    Loads a file of any specified type and validates it.

    Arguments:
        file_arg: file name or file handle, passed to load_text()
        file_type (str): one of the keys of validate_func_dict below
        **kwargs: forwarded to the validation function for file_type

    Returns:
        Whatever the validation function returns when called on the loaded
        dataframe with fix=True.

    Function:
        Raises a SortSeqError if file_type contains 'dataset' (not
        supported here) or is not a recognized type.
    """
    validate_func_dict = {
        #'dataset' : qc.validate_dataset, # This won't work right now
        'model': qc.validate_model,
        'filelist': qc.validate_filelist,
        'tagkey': qc.validate_tagkey,
        'profile_ct': qc.validate_profile_ct,
        'profile_freq': qc.validate_profile_freq,
        'profile_mut': qc.validate_profile_mut,
        'profile_info': qc.validate_profile_info,
        'meanstd': qc.validate_meanstd,
        'sitelist': qc.validate_sitelist,
    }

    # Check file_type before touching the file, so a bad request fails
    # without opening or parsing anything.
    if 'dataset' in file_type:
        raise SortSeqError(
            'file_type %s is not supported in load()' % file_type)
    if file_type not in validate_func_dict:
        raise SortSeqError('Unrecognized file_type %s' % file_type)

    df = load_text(file_arg)
    func = validate_func_dict[file_type]
    return func(df, fix=True, **kwargs)
def _validate_freq_cols(df, fix=False, tol=1E-2):
    """
    Validates contents of freq_* columns in a given dataframe.

    The columns checked are whatever get_cols_from_df(df, 'freq_') returns.
    Each must hold floats in [0.0, 1.0], and the freq columns of every row
    must sum to 1.0 within +- tol.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, columns that can be converted to float are
            converted in place; if False, a non-float column raises.
        tol (float): allowed deviation of each row sum from 1.0

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError if any of the checks above fails.
    """
    freq_cols = get_cols_from_df(df, 'freq_')
    for col in freq_cols:

        # Check if columns are floats
        if not df[col].values.dtype == float:

            # Test convertibility into a temporary. Nothing is written to
            # df here; that only happens once fix is confirmed.
            try:
                float_vals = df[col].astype(float)
            except (ValueError, TypeError):
                raise SortSeqError(
                    'Cannot interpret values in %s as floats.' % col)

            # Changing the dtype requires permission
            if not fix:
                raise SortSeqError(
                    'Values in %s not floats; set fix=True to fix.' % col)
            df[col] = float_vals

        # Make sure that all freqs are between 0 and 1
        if (not all(df[col] <= 1.0)) or (not all(df[col] >= 0.0)):
            raise SortSeqError('Freq values outside [0.0, 1.0] encountered.')

    # If there are freq cols, sum along each row has to be 1.0 +- tol
    if freq_cols:
        row_sums = df[freq_cols].sum(axis=1).values
        if not all((row_sums <= 1.0 + tol) & (row_sums >= 1.0 - tol)):
            raise SortSeqError('Not all rows sum to 1.0 +- %f' % tol)

    return df
def _validate_std_cols(df, fix=False):
    """
    Validates contents of the 'std' column in a given dataframe (if it
    exists). Values must be finite, nonnegative floats.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, a column that can be converted to float is
            converted in place; if False, a non-float column raises.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError on non-numeric, nonfinite, or negative values,
        or on a non-float column when fix=False.
    """
    col = 'std'
    if col not in df.columns:
        return df

    # Verify that values are floats
    if not df[col].values.dtype == float:

        # Only conversion failures are translated into SortSeqError
        try:
            float_vals = df[col].astype(float)
        except (ValueError, TypeError):
            raise SortSeqError('Non-numbers found in %s.' % col)

        # Changing the dtype requires permission
        if not fix:
            raise SortSeqError(
                'std values are not floats; set fix=True to fix.')
        df[col] = float_vals

    # Make sure that all std values are finite
    if not all(np.isfinite(df[col])):
        raise SortSeqError('Nonfinite std values encountered.')

    # Make sure that all std values are nonnegative
    if any(df[col] < 0.0):
        raise SortSeqError('Negative std values encountered.')

    return df
def validate_dataset(df, fix=False):
    """
    Validates the form of a dataset dataframe.

    A dataset dataframe must look something like this:

    ct      ct_0    ct_1    ct_2    val     tag     seq
    3       1       2       0       0.012   CTG     ACCAT
    2       2       0       0       -4.52   CTA     ACCAT
    1       0       0       1       0.000   CCA     TCAGG

    A 'ct' column reports the total counts of all sequence/tag pairs.
    Optional 'ct_0', 'ct_1', ... columns contain counts of sequence/tag
    pairs for individual bins. An optional 'val' column reports the value of
    a model run on the sequences. An optional 'tag' column lists DNA
    sequence tags used to identify sequences. A 'seq' column lists the
    sequences of interest.

    What this function itself enforces:
        0. The dataframe must have at least one row.
        1. Every column name must be of type 'seqs', 'cts', 'tag' or 'val'
           (as judged by is_col_type).
        2. Column contents must pass _validate_cols.
        3. Columns must be ordered: count columns, then value columns, then
           tag columns, then sequence columns.

    Arguments:
        df (pd.DataFrame): Dataset in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if
            possible.

    Returns:
        df (pd.DataFrame): the validated dataframe; with fix=True its
            columns are reordered if they were out of order.

    Function:
        Raises a SortSeqError if the dataframe violates the specifications
        (fix=False) or if the violations cannot be fixed (fix=True).
    """
    # A dataset needs at least one row
    if df.shape[0] < 1:
        raise SortSeqError('Dataframe must contain at least one row')

    # Every column name has to belong to one of the permitted families
    allowed_types = ['seqs', 'cts', 'tag', 'val']
    for name in df.columns:
        if is_col_type(name, allowed_types):
            continue
        raise SortSeqError('Invalid column in dataframe: %s' % name)

    # Column contents are checked (and optionally repaired) elsewhere
    df = _validate_cols(df, fix=fix)

    # Canonical order: counts, values, tags, sequences
    by_type = dict(
        (kind, get_cols_from_df(df, kind))
        for kind in ('cts', 'tag', 'seqs', 'val'))
    new_cols = (by_type['cts'] + by_type['val'] +
                by_type['tag'] + by_type['seqs'])

    if all(df.columns == new_cols):
        return df
    if not fix:
        raise SortSeqError(
            'Dataframe columns are in the wrong order; set fix=True to fix.')
    return df[new_cols]
def validate_model(df, fix=False):
    """
    Validates the form of a model dataframe.

    A model dataframe must look something like this:

    pos     val_A   val_C   val_G   val_T
    3       1.1     4.3     -6.19   5.2
    4       0.01    3.40    -10.5   5.3
    5       0       1.4     10.9    231.0

    A 'pos' column reports the position within a sequence to which this
    model applies. The 'val_X' columns hold the model parameters.

    What this function itself enforces:
        0. The dataframe must have at least one row.
        1. Every column name must be of type 'pos' or 'vals' (as judged by
           is_col_type), and a 'pos' column must be present.
        2. The sorted 'vals' column names must equal, element by element,
           one of the entries of model_parameters_dict.
        3. Column contents must pass _validate_cols.

    Arguments:
        df (pd.DataFrame): Dataset in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if
            possible.

    Returns:
        df (pd.DataFrame): the dataframe returned by _validate_cols.

    Function:
        Raises a SortSeqError if the dataframe violates the specifications
        (fix=False) or if the violations cannot be fixed (fix=True).
    """
    # A model needs at least one row
    if df.shape[0] < 1:
        raise SortSeqError(
            'Dataframe must contain at least one row')

    # Only position and value columns are permitted
    for name in df.columns:
        if not is_col_type(name, ['pos', 'vals']):
            raise SortSeqError('Invalid column in dataframe: %s.' % name)

    # The position column is mandatory
    if 'pos' not in df.columns:
        raise SortSeqError('%s column missing' % 'pos')

    # The sorted value columns must match a known model layout exactly
    val_cols = sorted(c for c in df.columns if is_col_type(c, 'vals'))
    recognized = any(
        len(cols) == len(val_cols)
        and all(a == b for a, b in zip(cols, val_cols))
        for cols in model_parameters_dict.values())
    if not recognized:
        raise SortSeqError(
            'Dataframe represents model with invalid columns: %s'
            % str(val_cols))

    # Finally check the contents of every column
    return _validate_cols(df, fix=fix)
def _validate_bin_cols(df, fix=False):
    """
    Validates the 'bin' column in a given dataframe (if it exists).

    Bin numbers must be integer-valued, unique, and nonnegative.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, an integer-valued 'bin' column stored with a
            non-integer dtype is cast to int in place; if False, such a
            column raises SortSeqError.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError if the column cannot be converted to numbers,
        holds non-integer values, has duplicates, or has negative values.
    """
    col = 'bin'
    if col not in df.columns:
        return df

    # Convert both ways up front so integer-valuedness can be tested below.
    # Only conversion failures are translated; anything else propagates.
    try:
        int_vals = df[col].values.astype(int)
        float_vals = df[col].values.astype(float)
    except (ValueError, TypeError, OverflowError):
        raise SortSeqError(
            'Cannot convert values in column %s to numbers.' % col)

    # Messages refer to bin numbers; they previously said "Positions",
    # apparently copied from the 'pos' validator.
    if not df[col].values.dtype == int:
        if not all(int_vals == float_vals):
            raise SortSeqError(
                'Bin numbers cannot be interpreted as integers.')
        if not fix:
            raise SortSeqError(
                'Bin numbers are not integers; set fix=True to fix.')
        df[col] = df[col].astype(int)

    if not len(int_vals) == len(set(int_vals)):
        raise SortSeqError('Bin numbers are not unique.')

    if not all(int_vals >= 0):
        raise SortSeqError('Bin numbers must be nonnegative numbers.')

    return df
def _validate_mean_cols(df, fix=False):
    """
    Validates contents of the 'mean' column in a given dataframe (if it
    exists). Values must be finite floats.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, a column that can be converted to float is
            converted in place; if False, a non-float column raises.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError on non-numeric or nonfinite values, or on a
        non-float column when fix=False.
    """
    col = 'mean'
    if col not in df.columns:
        return df

    # Check if column holds floats
    if not df[col].values.dtype == float:

        # Only conversion failures are translated into SortSeqError
        try:
            float_vals = df[col].astype(float)
        except (ValueError, TypeError):
            raise SortSeqError(
                'Cannot interpret values in %s as floats.' % col)

        # Changing the dtype requires permission
        if not fix:
            raise SortSeqError(
                'Values in %s not floats; set fix=True to fix.' % col)
        df[col] = float_vals

    # Make sure that all parameters are finite
    if not all(np.isfinite(df[col])):
        raise SortSeqError('Nonfinite parameters encountered.')

    return df
def validate_tagkey(df, fix=False):
    """
    Validates the form of a tagkeys dataframe.

    A tagkeys dataframe must look something like this:

    tag     seq
    AACT    ATTAGTCTAGATC
    AGCT    ATTAGTCTAGATC
    TCGA    ATTAGTCTGGGTC

    A 'tag' column reports the short tag associated with the sequences in
    the 'seq' column. This file is used in the preprocess method.

    What this function itself enforces:
        0. The dataframe must have at least one row.
        1. There must be exactly one tag column and exactly one sequence
           column (as reported by get_cols_from_df).
        2. Column contents must pass _validate_cols.
        3. The tag column must come first and the sequence column second.

    Arguments:
        df (pd.DataFrame): Dataset in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if
            possible.

    Returns:
        df (pd.DataFrame): the validated dataframe; with fix=True its
            columns are reordered if they were out of order.

    Function:
        Raises a SortSeqError if the dataframe violates the specifications
        (fix=False) or if the violations cannot be fixed (fix=True).
    """
    # A tagkey table needs at least one row
    if df.shape[0] < 1:
        raise SortSeqError(
            'Dataframe must contain at least one row')

    # Exactly one tag column
    tag_cols = get_cols_from_df(df, 'tag')
    if len(tag_cols) != 1:
        raise SortSeqError('Must be exactly one tag column.')

    # Exactly one sequence column
    seq_cols = get_cols_from_df(df, 'seqs')
    if len(seq_cols) != 1:
        raise SortSeqError('Must be exactly one sequence column.')

    # Column contents are checked (and optionally repaired) elsewhere
    df = _validate_cols(df, fix=fix)

    # Tag first, sequence second
    expected = tag_cols + seq_cols
    if all(df.columns == expected):
        return df
    if fix:
        return df[expected]
    raise SortSeqError(
        'Dataframe columns are in the wrong order; set fix=True to fix.')
def evaluate(self,seqs):
    """
    Evaluates the model on a collection of sequences.

    Arguments:
        seqs: a list, pd.Series, or pd.DataFrame of sequences. For a
            dataframe, the first column reported by
            qc.get_cols_from_df(seqs, 'seqs') is used.

    Returns:
        The result of self.evaluate_on_seqarray() applied to the encoded
        sequences.

    Function:
        Raises a SortSeqError if seqs is of an unsupported type, is empty,
        or if the first sequence's length differs from self.length.
    """
    # Pull the sequences out of whichever container was supplied
    if isinstance(seqs, pd.DataFrame):
        seq_col = qc.get_cols_from_df(seqs, 'seqs')[0]
        seqs_to_use = list(seqs[seq_col])
    elif isinstance(seqs, (list, pd.Series)):
        seqs_to_use = list(seqs)
    else:
        raise SortSeqError('Sequences must be input as a list, pd.Series, or pd.DataFrame')

    # Guard against empty input, which would otherwise surface as an
    # IndexError from the length check below.
    if not seqs_to_use:
        raise SortSeqError('No sequences provided.')

    # Check length (only the first sequence is inspected)
    if len(seqs_to_use[0]) != self.length:
        raise SortSeqError(
            'Energy Matrix Length does not equal Sequence Length')

    # fast.seqs2array_for_matmodel expects seqtype to be bytes.
    # NOTE(review): this rewrites self.seqtype persistently, as the original
    # code did; confirm no caller relies on it staying a str.
    if not isinstance(self.seqtype, bytes):
        self.seqtype = str(self.seqtype).encode('utf-8')

    # Encode the sequences as bytes too, unless they already are
    if not isinstance(seqs_to_use[0], bytes):
        seqs_to_use = [str(seq).encode('utf-8') for seq in seqs_to_use]

    # Compute seqarray, then compute and return values
    seqarray = fast.seqs2array_for_matmodel(seqs_to_use, self.seqtype)
    return self.evaluate_on_seqarray(seqarray)
def __init__(self,npar):
    """
    Stores the noise scale taken from the first element of npar.

    Arguments:
        npar: an indexable whose first element is convertible to a
            positive float.

    Function:
        Sets self.scale. Raises a SortSeqError if npar[0] is missing or is
        not convertible to float, or if the scale is not greater than zero.
    """
    # float() raises TypeError for None and similar objects, and npar[0]
    # raises IndexError when npar is empty; both count as bad input.
    try:
        self.scale = float(npar[0])
    except (ValueError, TypeError, IndexError):
        raise SortSeqError('your input parameter must be a float')

    # Check that scale is in the correct range
    if self.scale <= 0:
        raise SortSeqError(
            'your input scale for normal noise must be greater than zero')
def validate_meanstd(df, fix=False):
    """
    Validates the form of a meanstd dataframe.

    A meanstd dataframe must look something like this:

    bin     mean    std
    0       5.1     .9
    1       -1.0    1.5
    2       -4.2    1
    3       8       3
    4       3       1

    Used only for least squares model fitting. A 'bin' column reports the
    label of a bin. A 'mean' column reports the mean SFR value for
    sequences in that bin. A 'std' column reports the std of SFR values for
    sequences in that bin.

    What this function itself enforces:
        0. The dataframe must have at least one row.
        1. Every column name must be of type 'bin', 'mean' or 'std' (as
           judged by is_col_type).
        2. All three of 'bin', 'mean' and 'std' must be present.
        3. Column contents must pass _validate_cols.
        4. Columns must be ordered 'bin', 'mean', 'std'.

    Arguments:
        df (pd.DataFrame): Dataset in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if
            possible.

    Returns:
        df (pd.DataFrame): the fixed dataframe (if fix==True) or the
            dataframe as validated.

    Function:
        Raises a SortSeqError if the dataframe violates the specifications
        (fix=False) or if the violations cannot be fixed (fix=True).
    """
    required = ['bin', 'mean', 'std']

    # A meanstd table needs at least one row
    if df.shape[0] < 1:
        raise SortSeqError(
            'Dataframe must contain at least one row')

    # No columns other than the three families are allowed
    for name in df.columns:
        if not is_col_type(name, ['bin', 'mean', 'std']):
            raise SortSeqError('Invalid column in dataframe: %s.' % name)

    # All three columns must be present
    for name in required:
        if name not in df.columns:
            raise SortSeqError('%s column missing' % name)

    # Column contents are checked (and optionally repaired) elsewhere
    df = _validate_cols(df, fix=fix)

    # Enforce the canonical column order
    if all(df.columns == required):
        return df
    if fix:
        return df[required]
    raise SortSeqError(
        'Dataframe columns are in the wrong order; set fix=True to fix.')
def Berg_von_Hippel(self, df, dicttype, foreground=1, background=0, pseudocounts=1):
    """
    Learn models using the Berg von Hippel model.

    The foreground sequences are usually in bin 1 and the background
    sequences in bin 0; this can be changed via the arguments.

    Arguments:
        df (pd.DataFrame): dataset; must contain the columns
            'ct_<foreground>' and 'ct_<background>'.
        dicttype: passed to utils.choose_dict and utils.profile_counts.
        foreground: bin number used as foreground (default 1).
        background: bin number used as background (default 0).
        pseudocounts: value added to every count before normalizing.

    Returns:
        A dataframe holding -log(foreground_freq / background_freq), with
        the 'ct_X' columns renamed to 'val_X'.

    Function:
        Raises a SortSeqError if either count column is missing, or if any
        count is still zero after pseudocounts are added.
    """
    seq_dict, inv_dict = utils.choose_dict(dicttype)

    # Check that the foreground and background columns actually exist
    columns_to_check = {'ct_' + str(foreground), 'ct_' + str(background)}
    if not columns_to_check.issubset(set(df.columns)):
        raise SortSeqError(
            'Foreground or Background column does not exist!')

    # Get counts of each base at each position
    foreground_counts = utils.profile_counts(df, dicttype, bin_k=foreground)
    background_counts = utils.profile_counts(df, dicttype, bin_k=background)
    binheaders = utils.get_column_headers(foreground_counts)

    # Add pseudocounts to each position
    foreground_counts[binheaders] = (
        foreground_counts[binheaders] + pseudocounts)
    background_counts[binheaders] = (
        background_counts[binheaders] + pseudocounts)

    # A zero count would put a zero into the log-ratio below, so reject it.
    # The messages are written on one line each so that no indentation
    # whitespace ends up inside the text.
    if foreground_counts[binheaders].isin([0]).values.any():
        raise SortSeqError(
            'There are some bases without any representation in the '
            'foreground data, you should use pseudocounts to avoid failure '
            'of the learning method')
    if background_counts[binheaders].isin([0]).values.any():
        raise SortSeqError(
            'There are some bases without any representation in the '
            'background data, you should use pseudocounts to avoid failure '
            'of the learning method')

    # Normalize each row to compute frequencies
    foreground_freqs = foreground_counts.copy()
    background_freqs = background_counts.copy()
    foreground_freqs[binheaders] = foreground_freqs[binheaders].div(
        foreground_freqs[binheaders].sum(axis=1), axis=0)
    background_freqs[binheaders] = background_freqs[binheaders].div(
        background_freqs[binheaders].sum(axis=1), axis=0)

    output_df = -np.log(foreground_freqs / background_freqs)

    # Change column names accordingly (instead of ct_ we want val_)
    rename_dict = {
        'ct_' + str(inv_dict[i]): 'val_' + str(inv_dict[i])
        for i in range(len(seq_dict))
    }
    return output_df.rename(columns=rename_dict)
def _validate_ori_cols(df, fix=False):
    """
    Validates the 'ori' column in a given dataframe (if it exists).
    The column must contain only the strings '+' and '-'.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): accepted for a uniform validator signature; nothing in
            this column is fixable, so it is not used.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in, unmodified

    Function:
        Raises a SortSeqError if the column holds non-strings or strings
        other than '+' and '-'.
    """
    col = 'ori'
    if col not in df.columns:
        return df

    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy string scalars.
    if not all(isinstance(s, str) for s in df[col]):
        raise SortSeqError('ori column must contain strings')

    if not all((df[col] == '+') | (df[col] == '-')):
        raise SortSeqError(
            'ori column contains more than just "+" and "-" characters.')

    return df
def estimate_mutualinfo(raw_counts, pseudocount=1, err=False, method='naive'):
    """
    Mutual information estimator. raw_counts must be a 2d array.

    Arguments:
        raw_counts: 2d table of counts; cleaned up by fix_counts_2d.
        pseudocount: nonnegative pseudocount, used by the 'naive' and 'tpm'
            methods.
        err (bool): if True, also return the estimator's error value.
        method (str): 'naive', 'tpm' (Treves, Panzeri, Miller: the naive
            estimate minus a correction term), or 'nsb' (Nemenman, Shafee,
            Bialek).

    Returns:
        mi if err is false, otherwise the tuple (mi, mi_err).

    Function:
        Raises a SortSeqError for an unknown method or a pseudocount that
        is not nonnegative.
    """
    # These are the only options supported thus far. Checked with an
    # explicit raise because an assert is stripped under -O and would make
    # this error path unreachable otherwise.
    if method not in ('naive', 'tpm', 'nsb'):
        raise SortSeqError('Unknown method: %s.' % method)

    # Make sure pseudocount is sane (written this way so NaN is rejected)
    if not pseudocount >= 0:
        raise SortSeqError('pseudocount is not nonnegative.')

    # Fix up counts table
    counts = fix_counts_2d(raw_counts)
    want_err = bool(err)

    # 'naive' and 'tpm' share the naive estimate, which includes the
    # pseudocount; 'nsb' uses its own estimator.
    if method == 'nsb':
        result = _estimate_mutualinfo_nsb(counts, err=want_err)
    else:
        result = _estimate_mutualinfo_naive(
            counts, pseudocount=pseudocount, err=want_err)

    mi_err = None
    if want_err:
        mi, mi_err = result
    else:
        mi = result

    # Treves, Panzeri, Miller correction
    if method == 'tpm':
        n_rows = counts.shape[0]
        n_cols = counts.shape[1]
        N = counts.sum()
        mi -= (n_cols - 1.0) * (n_rows - 1.0) * np.log2(
            np.exp(1.0)) / (2.0 * N)

    return (mi, mi_err) if err else mi
def _validate_contig_cols(df, fix=False):
    """
    Validates the 'contig' column in a given dataframe (if it exists).
    The column must contain strings having no whitespace.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, every whitespace character in a contig name is
            replaced by '_' in place; if False, whitespace raises.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError if the column holds non-strings, or holds
        whitespace when fix=False.
    """
    col = 'contig'
    if col not in df.columns:
        return df

    if not all(isinstance(s, str) for s in df[col]):
        raise SortSeqError('contig col contains non-string.')

    # Raw string: '\s' in a plain literal is an invalid escape sequence
    whitespace = re.compile(r'\s')
    if any(whitespace.search(s) for s in df[col]):
        if not fix:
            raise SortSeqError(
                'Whitespace found in contig names; set fix=True to fix.')
        df.loc[:, col] = [whitespace.sub('_', s) for s in df[col]]

    return df
def is_col_type(col_name, col_types='all'):
    """
    Checks whether col_name is a valid column name, as specified by
    col_types.

    Arguments:
        col_name (str): the column name to test
        col_types: either a string (a single column type), or a list or
            tuple of strings (multiple column types). Each must be a key of
            col_patterns. The default 'all' tests every available type.

    Returns:
        True if col_name matches the pattern of at least one of the
        requested column types, False otherwise.

    Function:
        Raises a SortSeqError if col_types is neither a string nor a
        list/tuple. An unknown column type raises KeyError.
    """
    # Make col_types_list
    if isinstance(col_types, (list, tuple)):
        col_types_list = list(col_types)
    elif isinstance(col_types, str):
        if col_types == 'all':
            # Use the keys (type names). The values are the patterns
            # themselves and cannot be looked up in col_patterns.
            col_types_list = list(col_patterns.keys())
        else:
            col_types_list = [col_types]
    else:
        raise SortSeqError('col_types is not a string or a list.')

    # Resolve every pattern first so an unknown type always raises, then
    # report whether any of them matches.
    patterns = [col_patterns[col_type] for col_type in col_types_list]
    return any(re.search(pattern, col_name) for pattern in patterns)
def eval_modelmatrix_on_mutarray(modelmatrix, mutarray, wtrow):
    """
    Evaluates a model matrix on sequences stored as a sparse array of
    mutations relative to a wild-type sequence.

    Arguments:
        modelmatrix (np.ndarray): 2-dimensional array of model parameters.
        mutarray (csr_matrix): sparse matrix with one row per sequence and
            modelmatrix.size columns.
        wtrow (np.ndarray): 1-dimensional array of length modelmatrix.size
            marking the wild-type entries. Presumably it marks exactly one
            entry per modelmatrix row; the broadcast subtraction below
            relies on that (TODO confirm against callers).

    Returns:
        1-dimensional np.ndarray with one model value per row of mutarray.

    Function:
        Raises a SortSeqError if an argument has the wrong type,
        dimensionality, or size.
    """
    # Do error checking. Types are checked before any attribute is touched,
    # so bad input gives a SortSeqError rather than an AttributeError.
    if not isinstance(modelmatrix, np.ndarray):
        raise SortSeqError('modelmatrix is not a np.ndarray')
    if not isinstance(wtrow, np.ndarray):
        raise SortSeqError('wtrow is not an np.ndarray')
    if not isinstance(mutarray, csr.csr_matrix):
        raise SortSeqError('mutarray is not a sparse csr_matrix')
    if len(wtrow.shape) != 1:
        raise SortSeqError('wtrow is not 1-dimensional')
    if len(modelmatrix.shape) != 2:
        raise SortSeqError('modelmatrix is not 2-dimensional')
    if wtrow.size != modelmatrix.size:
        raise SortSeqError('wtrow does not match modelmatrix')

    # Compute constant contribution to model prediction: the model value of
    # the wild-type sequence itself
    const_val = np.dot(wtrow, modelmatrix.ravel())

    # Prepare matrix for scanning mutarray: express every parameter
    # relative to the wild-type parameter of its own row
    tmp_matrix = modelmatrix.copy()
    indices = wtrow.reshape(modelmatrix.shape).astype(bool)
    wt_matrix_vals = tmp_matrix[indices]
    tmp_matrix -= wt_matrix_vals[:, np.newaxis]
    modelmatrix_for_mutarray = csr_matrix(tmp_matrix.reshape(-1, 1))

    # Compute values
    mutarray_vals = mutarray * modelmatrix_for_mutarray
    return const_val + mutarray_vals.toarray().ravel()
def rc(dna_str):
    """
    Returns the reverse complement of a DNA sequence.

    Arguments:
        dna_str (str): sequence made up only of the characters A, C, G, T

    Returns:
        The complemented sequence, reversed.

    Function:
        Raises a SortSeqError if any other character is present.
    """
    # Anything outside the unambiguous DNA alphabet is rejected
    if re.search(r"[^ACGT]", dna_str) is not None:
        raise SortSeqError('Invalid character found in DNA sequence.')

    # Complement each base while walking the sequence backwards
    partner = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(partner[base] for base in reversed(dna_str))
def dataset2mutarray_withwtseq(dataset_df, modeltype, wtseq, chunksize=1000):
    """
    Encodes the sequences of a dataset as a sparse array of differences
    from a given wild-type sequence.

    Arguments:
        dataset_df (pd.DataFrame): dataset with a sequence column (the first
            one reported by qc.get_cols_from_df(dataset_df, 'seqs')).
        modeltype (str): 'MAT' or 'NBR'; selects the encoding function.
        wtseq: the wild-type sequence.
        chunksize (int): number of rows encoded at a time; must be >= 1.

    Returns:
        (mutarray_csr, wtrow): a csr matrix with one row per sequence in
        which the features set by the wild-type sequence are zeroed, and
        the boolean feature vector of the wild-type sequence.

    Function:
        Raises a SortSeqError for an unknown modeltype or a chunksize
        smaller than 1.
    """
    # Determine the type of model and set seq2array function appropriately
    if modeltype == 'MAT':
        seqs2array = fast.seqs2array_for_matmodel
    elif modeltype == 'NBR':
        seqs2array = fast.seqs2array_for_nbrmodel
    else:
        raise SortSeqError('Unknown model type: %s' % modeltype)

    # A chunk size below 1 never advances through the rows
    if chunksize < 1:
        raise SortSeqError('chunksize must be a positive integer.')

    # Determine seqtype, etc.
    seqcol = qc.get_cols_from_df(dataset_df, 'seqs')[0]
    seqtype = qc.colname_to_seqtype_dict[seqcol]

    # Compute the wt sequence
    wtrow = seqs2array([wtseq], seq_type=seqtype).ravel().astype(bool)
    numfeatures = len(wtrow)
    numrows = dataset_df.shape[0]

    # Fill in mutarray (a lil matrix) chunk by chunk
    mutarray_lil = lil_matrix((numrows, numfeatures), dtype=int)
    for startrow in range(0, numrows, chunksize):
        stoprow = min(startrow + chunksize, numrows)

        # Compute seqarray
        seqlist = list(dataset_df[seqcol][startrow:stoprow])
        seqarray = seqs2array(seqlist, seq_type=seqtype)

        # Remove wt entries, leaving only the features that differ
        tmp = seqarray.copy()
        tmp[:, wtrow] = 0

        # Store results from this chunk
        mutarray_lil[startrow:stoprow, :] = tmp

    # Return mutarray as csr, along with binary representation of wt seq
    return mutarray_lil.tocsr(), wtrow
def fix_counts(raw_counts):
    """
    Flattens and converts to floats. Also checks that elements are present,
    are finite and nonnegative, and are not all zero.

    Arguments:
        raw_counts: anything np.array() can turn into a numeric array.

    Returns:
        counts (np.ndarray): 1-dimensional float array.

    Function:
        Raises a SortSeqError if the conversion fails or if any of the
        checks above fails.
    """
    try:
        counts = np.array(raw_counts).astype(float).flatten()
    except (ValueError, TypeError):
        raise SortSeqError('could not covernt counts to array of flots')

    # flatten() always yields a 1-d array, so emptiness has to be detected
    # from the element count, not from the number of dimensions.
    if counts.size == 0:
        raise SortSeqError('counts is empty or not array.')
    if not np.isfinite(counts).all():
        raise SortSeqError('counts are not all finite.')
    if not (counts >= 0.0).all():
        raise SortSeqError('counts are not nonnegative.')
    if (counts == 0.0).all():
        raise SortSeqError('counts are all equal to zero.')

    return counts
def test_profile_freq_bincounts(self):
    """ Test the ability of mpathic.profile_freq to count frequencies

    Runs profile_freq.main with a bin argument on every 'library_*.txt' and
    'dataset_*.txt' file found under self.input_dir. For each file two
    things are checked: the expected outcome with bin=2, and that bin=5
    always raises SortSeqError. Progress is printed as one line per file.
    """

    # NOTE: the print statements below are Python 2 syntax; a trailing comma
    # keeps the output on the same line.
    print '\nIn test_profile_freq_bincounts...'
    library_files = glob.glob(self.input_dir + 'library_*.txt')
    library_files += glob.glob(self.input_dir + 'dataset_*.txt')
    good_bin_num = 2
    bad_bin_num = 5
    for file_name in library_files:
        print '\t%s =' % file_name,
        # Text between the last '_' and the first following '.' in the
        # file name; used to name the output file.
        description = file_name.split('_')[-1].split('.')[0]
        # Deferred call, so that assertRaises can invoke it itself
        executable = lambda:\
            profile_freq.main(io.load_dataset(file_name),bin=good_bin_num)
        print '(bin=%d)' % good_bin_num,

        # If bad or library, then profile_freq.main should raise SortSeqError
        if ('_bad' in file_name) or ('library' in file_name):
            try:
                self.assertRaises(SortSeqError, executable)
                print 'badtype,',
            except:
                # Report the unexpected outcome, then let the test fail
                print 'good (ERROR).'
                raise

        # If good, then profile_freq.main should produce a valid df
        elif ('_good' in file_name) or ('dataset' in file_name):
            try:
                df = executable()
                qc.validate_profile_freq(df)
                # Round trip: write the profile and load it back
                out_file = self.output_dir+\
                    'profile_freq_bin_%s.txt'%description
                io.write(df, out_file)
                io.load_profile_freq(out_file)
                print 'good,',
            except:
                # Report the unexpected outcome, then let the test fail
                print 'bad (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')

        # Should always raise an error if bin num is too large
        executable = lambda:\
            profile_freq.main(io.load_dataset(file_name),bin=bad_bin_num)
        print '(bin=%d)' % bad_bin_num,
        try:
            self.assertRaises(SortSeqError, executable)
            print 'badtype.'
        except:
            print 'good (ERROR).'
            raise
def fix_counts_2d(raw_counts):
    """
    Converts to numpy array of floats. Checks that the array is 2d and that
    its elements are finite, nonnegative, and not all zero.

    Arguments:
        raw_counts: anything np.array() can turn into a numeric array.

    Returns:
        counts (np.ndarray): 2-dimensional float array.

    Function:
        Raises a SortSeqError if the conversion fails or if any of the
        checks above fails.
    """
    try:
        counts = np.array(raw_counts).astype(float)
    except (ValueError, TypeError):
        raise SortSeqError('could not covernt counts to array of flots')

    if len(counts.shape) != 2:
        raise SortSeqError('counts array is not 2d.')
    # The message now states the actual failure (it used to read
    # 'counts are all finite.')
    if not np.isfinite(counts).all():
        raise SortSeqError('counts are not all finite.')
    if not (counts >= 0.0).all():
        raise SortSeqError('counts are not nonnegative.')
    if (counts == 0.0).all():
        raise SortSeqError('counts are all equal to zero.')

    return counts
def _validate_lr_cols(df, fix=False):
    """
    Validates left/right columns in a given dataframe.

    The columns checked are whatever get_cols_from_df(df, 'lr') returns.
    Each must hold finite, nonnegative integers.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, an integer-valued column stored with a
            non-integer dtype is cast to int in place; if False, such a
            column raises SortSeqError.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError if a column cannot be converted to numbers, or
        holds non-integer, nonfinite, or negative values.
    """
    for col in get_cols_from_df(df, 'lr'):

        # Verify that positions are integers
        if not df[col].values.dtype == int:

            # Try to convert column to numbers
            try:
                int_vals = df[col].astype(int)
                float_vals = df[col].astype(float)
            except (ValueError, TypeError, OverflowError):
                raise SortSeqError(
                    'Cannot interptret left/right positions as numbers; column name = %s'
                    % col)

            # Convert to integers only if this doesn't change the values
            if not all(int_vals == float_vals):
                raise SortSeqError(
                    'Noninteger numbers found in left/right positions.')
            if not fix:
                raise SortSeqError(
                    'left/right positions are not integers; set fix=True to fix.')
            df[col] = int_vals

        # Make sure that all positions are finite
        if not all(np.isfinite(df[col])):
            raise SortSeqError('Nonfinite left/right positions encountered.')

        # Verify that positions are nonnegative
        if not all(df[col] >= 0):
            raise SortSeqError(
                'left/right positions must be nonnegative numbers.')

    return df
def load_text(file_arg):
    """
    General function used to load data from a text file.

    The file is parsed as whitespace-delimited text. '#' starts a comment
    and blank lines are skipped.

    Arguments:
        file_arg: file name or open file handle, as accepted by
            validate_file_for_reading().

    Returns:
        df (pd.DataFrame): the parsed table, without rows that are all NaN.

    Function:
        Raises a SortSeqError if the text cannot be parsed as a dataframe.
    """
    file_handle = validate_file_for_reading(file_arg)
    try:
        # sep=r'\s+' is the documented equivalent of delim_whitespace=True,
        # which pandas has deprecated.
        df = pd.read_csv(file_handle, sep=r'\s+', comment='#',
                         skip_blank_lines=True, engine='c')
    except Exception:
        raise SortSeqError(
            'Could not interpret text file %s as dataframe.' % repr(file_handle))
    finally:
        # Close the handle only if it was opened here (i.e. a file name was
        # given). A handle supplied by the caller stays open.
        if file_handle is not file_arg:
            file_handle.close()

    return df.dropna(axis=0, how='all')  # Drop rows with all NaNs
def dataset2seqarray(dataset_df, modeltype):
    """
    Encodes the sequences of a dataset with the encoder for a model type.

    Arguments:
        dataset_df (pd.DataFrame): dataset with a sequence column (the first
            one reported by qc.get_cols_from_df(dataset_df, 'seqs')).
        modeltype (str): 'MAT' or 'NBR'.

    Returns:
        The array produced by the selected mpathic.fast encoder.

    Function:
        Raises a SortSeqError for an unknown modeltype.
    """
    # Pick the encoder that goes with the requested model type
    if modeltype=='MAT':
        encoder = mpathic.fast.seqs2array_for_matmodel
    elif modeltype=='NBR':
        encoder = mpathic.fast.seqs2array_for_nbrmodel
    else:
        raise SortSeqError('Unknown model type: %s'%modeltype)

    # Locate the sequence column and look up its sequence type
    seq_col = qc.get_cols_from_df(dataset_df,'seqs')[0]
    seq_type = qc.colname_to_seqtype_dict[seq_col]

    # Encode every sequence in one call
    sequences = list(dataset_df[seq_col])
    return encoder(sequences, seq_type=seq_type)
def _validate_seqs_cols(df, fix=False):
    """
    Validates sequence columns in a given dataframe.

    The columns checked are whatever
    get_cols_from_df(df, ['seqs', 'tag', 'wts']) returns. Within a column
    all sequences must have the same length, be uppercase, and use only
    characters from the alphabet for that column's sequence type.

    Arguments:
        df (pd.DataFrame): dataframe to check
        fix (bool): if True, sequences are uppercased in place; if False,
            sequences that are not uppercase raise.

    Returns:
        df (pd.DataFrame): the dataframe that was passed in (possibly fixed)

    Function:
        Raises a SortSeqError if a column's sequence type is unknown or if
        any of the checks above fails.
    """
    seq_cols = get_cols_from_df(df, ['seqs', 'tag', 'wts'])
    for col in seq_cols:

        # Set alphabet
        try:
            seqtype = colname_to_seqtype_dict[col]
            alphabet = seqtype_to_alphabet_dict[seqtype]
        except KeyError:
            raise SortSeqError('Sequence column is of unkown type: %s.' % col)

        # Check that all sequences have the same length. The first row is
        # taken by position so this works whatever the index labels are.
        try:
            L = len(df[col].iloc[0])
        except (IndexError, TypeError):
            raise SortSeqError('Could not determine length of sequence.')
        if not all(len(seq) == L for seq in df[col]):
            raise SortSeqError('Not all sequences are the same length.')

        # Make sure sequences are uppercase
        if not all(seq == seq.upper() for seq in df[col]):
            if not fix:
                raise SortSeqError(
                    'Seqs are not all uppercase; set fix=True to fix.')
            df[col] = [seq.upper() for seq in df[col]]

        # Check that all characters are from the correct alphabet
        invalid_char = re.compile(r"[^%s]" % alphabet)
        if any(invalid_char.search(seq) is not None for seq in df[col]):
            raise SortSeqError('Invalid character found in sequences.')

    return df
def get_model_type(model_df):
    """
    Identifies the sequence type and model type of a model dataframe.

    The 'vals' columns of model_df are compared, as a set, with each entry
    of model_parameters_dict. The key of a matching entry supplies the
    result: key[0] is the model type and key[1] the sequence type. If
    several entries match, the last one iterated wins.

    Arguments:
        model_df (pd.DataFrame): model in dataframe format

    Returns:
        (seqtype, modeltype) tuple

    Function:
        Raises a SortSeqError if no entry matches.
    """
    wanted = set(get_cols_from_df(model_df, 'vals'))

    seqtype = None
    modeltype = None
    for key, val_cols in model_parameters_dict.items():
        if set(val_cols) != wanted:
            continue
        modeltype, seqtype = key[0], key[1]

    if seqtype is None or modeltype is None:
        raise SortSeqError('Could not identify seqtype or modeltype')
    return (seqtype, modeltype)