def test_profile_freq_bincounts(self): """ Test the ability of mpathic.profile_freq to count frequencies """ print '\nIn test_profile_freq_bincounts...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') good_bin_num = 2 bad_bin_num = 5 for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda:\ profile_freq.main(io.load_dataset(file_name),bin=good_bin_num) print '(bin=%d)' % good_bin_num, # If bad or library, then profile_freq.main should raise SortSeqError if ('_bad' in file_name) or ('library' in file_name): try: self.assertRaises(SortSeqError, executable) print 'badtype,', except: print 'good (ERROR).' raise # If good, then profile_freq.main should produce a valid df elif ('_good' in file_name) or ('dataset' in file_name): try: df = executable() qc.validate_profile_freq(df) out_file = self.output_dir+\ 'profile_freq_bin_%s.txt'%description io.write(df, out_file) io.load_profile_freq(out_file) print 'good,', except: print 'bad (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.') # Should always raise an error if bin num is too large executable = lambda:\ profile_freq.main(io.load_dataset(file_name),bin=bad_bin_num) print '(bin=%d)' % bad_bin_num, try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise
def test_profile_freq_bincounts(self): """ Test the ability of mpathic.profile_freq to count frequencies """ print '\nIn test_profile_freq_bincounts...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') good_bin_num = 2 bad_bin_num = 5 for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda:\ profile_freq.main(io.load_dataset(file_name),bin=good_bin_num) print '(bin=%d)'%good_bin_num, # If bad or library, then profile_freq.main should raise SortSeqError if ('_bad' in file_name) or ('library' in file_name): try: self.assertRaises(SortSeqError,executable) print 'badtype,', except: print 'good (ERROR).' raise # If good, then profile_freq.main should produce a valid df elif ('_good' in file_name) or ('dataset' in file_name): try: df = executable() qc.validate_profile_freq(df) out_file = self.output_dir+\ 'profile_freq_bin_%s.txt'%description io.write(df,out_file) io.load_profile_freq(out_file) print 'good,', except: print 'bad (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.') # Should always raise an error if bin num is too large executable = lambda:\ profile_freq.main(io.load_dataset(file_name),bin=bad_bin_num) print '(bin=%d)'%bad_bin_num, try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise
def main(dataset_df, bin=None, start=0, end=None):
    """
    Build a per-position character frequency profile from a dataset.

    The dataset is validated, a counts profile is obtained from
    profile_ct.main, and every 'ct_<char>' column of that profile is
    divided row-wise by its 'ct' column. The result is passed through
    qc.validate_profile_freq with fix=True before being returned.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use; forwarded
            to profile_ct.main.
        start (int): The sequence start position; forwarded to
            profile_ct.main.
        end (int): The sequence end position; forwarded to
            profile_ct.main.

    Returns:
        freq_df (pd.DataFrame): One 'freq_<char>' column per 'ct_<char>'
            column of the counts profile, plus the 'pos' column copied
            from it, as returned by qc.validate_profile_freq.

    Raises:
        Whatever qc.validate_dataset, profile_ct.main or
        qc.validate_profile_freq raise (presumably SortSeqError for
        invalid input -- confirm against those modules).
    """
    # Reject malformed input before doing any work.
    qc.validate_dataset(dataset_df)

    # Per-position character counts for the requested bin and range.
    profile_counts = profile_ct.main(
        dataset_df, bin=bin, start=start, end=end)

    # Pick out the per-character count columns.
    count_columns = []
    for column in profile_counts.columns:
        if qc.is_col_type(column, 'ct_'):
            count_columns.append(column)

    # 'ct_X' -> 'freq_X'
    frequency_columns = [
        'freq_' + column.split('_')[1] for column in count_columns]

    # Divide each character count by the row's 'ct' value.
    per_char_counts = profile_counts[count_columns]
    row_totals = profile_counts['ct']
    freq_df = per_char_counts.div(row_totals, axis=0)
    freq_df.columns = frequency_columns
    freq_df['pos'] = profile_counts['pos']

    # Validate as a frequency profile; the validator's output is returned.
    return qc.validate_profile_freq(freq_df, fix=True)
def test_profile_freq_totalcounts(self): """ Test the ability of mpathic.profile_freq to compute frequencies based on total count values """ print '\nIn test_profile_freq_totalcounts...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_freq.main(io.load_dataset(file_name)) # If good, then profile_freq.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_freq(df) out_file = self.output_dir+\ 'profile_freq_total_%s.txt'%description io.write(df, out_file) io.load_profile_freq(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_freq.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_freq_totalcounts(self): """ Test the ability of mpathic.profile_freq to compute frequencies based on total count values """ print '\nIn test_profile_freq_totalcounts...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_freq.main(io.load_dataset(file_name)) # If good, then profile_freq.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_freq(df) out_file = self.output_dir+\ 'profile_freq_total_%s.txt'%description io.write(df,out_file) io.load_profile_freq(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_freq.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')