def test_profile_info(self):
    """
    Test that profile_info.main produces a valid information profile for
    every '_good' dataset and raises SortSeqError for every '_bad' one.

    Each dataset matching ``<input_dir>dataset_*.txt`` is processed twice,
    once with err=True and once with err=False. For good datasets the
    result is validated, written to ``<output_dir>`` and read back again.
    """
    # Local import: only this test needs sys, to write a line prefix
    # without a trailing newline in a way that works on Python 2 and 3.
    import sys

    # print(...) with a single parenthesised argument behaves the same as
    # a Python 2 statement and as a Python 3 function call.
    print('\nIn test_profile_info...')
    file_names = glob.glob(self.input_dir + 'dataset_*.txt')
    for err in [True, False]:
        for file_name in file_names:
            # Write the prefix first so the verdict lands on the same line.
            sys.stdout.write(
                '\t%s, err=%s =' % (file_name, str(err)) + ' ')
            description = file_name.split('_')[-1].split('.')[0]

            # Called within this iteration, so file_name and err are the
            # current loop values when it runs.
            def executable():
                return profile_info.main(
                    io.load_dataset(file_name), err=err)

            # NOTE(review): the markers are searched for in the full path,
            # so a directory name containing '_good' or '_bad' would also
            # match.

            # If good, then profile_info.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_info(df)
                    out_file = self.output_dir + \
                        'profile_info_%s_err_%s.txt' % (description, str(err))
                    io.write(df, out_file)
                    io.load_profile_info(out_file)
                    print('good.')
                except Exception:
                    print('bad (ERROR).')
                    raise

            # If bad, then profile_info.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError, executable)
                    print('badtype.')
                except Exception:
                    # Reached when no SortSeqError was raised, i.e. the
                    # bad file was treated as good.
                    print('good (ERROR).')
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
def test_profile_info(self):
    """
    Test that profile_info.main produces a valid information profile for
    every '_good' dataset and raises SortSeqError for every '_bad' one.

    Each dataset matching ``<input_dir>dataset_*.txt`` is processed twice,
    once with err=True and once with err=False. For good datasets the
    result is validated, written to ``<output_dir>`` and read back again.
    """
    # Local import: only this test needs sys, to write a line prefix
    # without a trailing newline in a way that works on Python 2 and 3.
    import sys

    # print(...) with a single parenthesised argument behaves the same as
    # a Python 2 statement and as a Python 3 function call.
    print('\nIn test_profile_info...')
    file_names = glob.glob(self.input_dir + 'dataset_*.txt')
    for err in [True, False]:
        for file_name in file_names:
            # Write the prefix first so the verdict lands on the same line.
            sys.stdout.write(
                '\t%s, err=%s =' % (file_name, str(err)) + ' ')
            description = file_name.split('_')[-1].split('.')[0]

            # Called within this iteration, so file_name and err are the
            # current loop values when it runs.
            def executable():
                return profile_info.main(
                    io.load_dataset(file_name), err=err)

            # NOTE(review): the markers are searched for in the full path,
            # so a directory name containing '_good' or '_bad' would also
            # match.

            # If good, then profile_info.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_info(df)
                    out_file = self.output_dir + \
                        'profile_info_%s_err_%s.txt' % (description, str(err))
                    io.write(df, out_file)
                    io.load_profile_info(out_file)
                    print('good.')
                except Exception:
                    print('bad (ERROR).')
                    raise

            # If bad, then profile_info.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError, executable)
                    print('badtype.')
                except Exception:
                    # Reached when no SortSeqError was raised, i.e. the
                    # bad file was treated as good.
                    print('good (ERROR).')
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
def main(dataset_df, err=False, method="naive", pseudocount=1.0, start=0, end=None):
    """
    Computes the mutual information (in bits), at each position, between
    the character and the bin number.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        err (bool): If True, an "info_err" column is added, holding the
            second value returned by info.estimate_mutualinfo(err=True).
        method (str): Which method to use to estimate mutual information.
            Passed unchanged to info.estimate_mutualinfo.
        pseudocount (float): Passed unchanged to info.estimate_mutualinfo.
        start (int): An integer specifying the sequence start position
            (inclusive). Must satisfy 0 <= start < sequence length.
        end (int): An integer specifying the sequence end position
            (exclusive). None means the full sequence length.

    Returns:
        info_df (pd.DataFrame): A dataframe containing results, with
            columns "pos", "info" and, if err is set, "info_err", after
            being passed through qc.validate_profile_info(fix=True).

    Raises:
        SortSeqError: if there are fewer than 2 count columns, there is
            not exactly one sequence column, the dataset has no rows, or
            start/end are out of range.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Get number of bins; each "ct_<n>" column supplies one bin number
    bin_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "ct_")]
    if len(bin_cols) < 2:
        raise SortSeqError("Information profile requires at least 2 bins.")
    bins = [int(c.split("_")[1]) for c in bin_cols]
    num_bins = len(bins)

    # Get number of characters
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "seqs")]
    if len(seq_cols) != 1:
        raise SortSeqError("Must be only one seq column.")
    seq_col = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[seq_col]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    ct_cols = ["ct_" + a for a in alphabet]
    num_chars = len(alphabet)

    # Get sequence length and check start, end numbers.
    # The first row is taken by position (.iloc) so that a dataframe whose
    # index does not contain the label 0 is still handled.
    if len(dataset_df) == 0:
        raise SortSeqError("Dataset contains no sequences.")
    num_pos = len(dataset_df[seq_col].iloc[0])
    if not (0 <= start < num_pos):
        raise SortSeqError("Invalid start==%d, num_pos==%d" % (start, num_pos))
    if end is None:
        end = num_pos
    elif end > num_pos:
        raise SortSeqError("Invalid end==%d, num_pos==%d" % (end, num_pos))
    elif end <= start:
        raise SortSeqError("Invalid: start==%d >= end==%d" % (start, end))

    # Record positions in new dataframe
    counts_df = profile_ct.main(dataset_df)
    # .loc slicing is label-based and inclusive, hence end - 1
    info_df = counts_df.loc[start : (end - 1), ["pos"]].copy()
    info_df["info"] = 0.0
    if err:
        info_df["info_err"] = 0.0

    # Fill in 3D array of counts, indexed as [position, character, bin]
    ct_3d_array = np.zeros([end - start, num_chars, num_bins])
    for i, bin_num in enumerate(bins):

        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin_num)

        # Fill in counts table
        ct_3d_array[:, :, i] = counts_df.loc[start : (end - 1), ct_cols].astype(float)

    # Compute mutual information for each position; iterating the array
    # yields one 2D (character x bin) count table per position.
    for i, nxy in enumerate(ct_3d_array):
        row_label = i + start

        # Compute mutual information
        if err:
            mi, mi_err = info.estimate_mutualinfo(
                nxy, err=True, method=method, pseudocount=pseudocount
            )
            info_df.loc[row_label, "info"] = mi
            info_df.loc[row_label, "info_err"] = mi_err
        else:
            mi = info.estimate_mutualinfo(
                nxy, err=False, method=method, pseudocount=pseudocount
            )
            info_df.loc[row_label, "info"] = mi

    # Validate info dataframe
    info_df = qc.validate_profile_info(info_df, fix=True)
    return info_df