def test_profile_ct_bincounts(self): """ Test the ability of mpathic.profile_ct to count frequencies """ print '\nIn test_profile_ct_bincounts...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') good_bin_num = 2 bad_bin_num = 5 for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda:\ profile_ct.main(io.load_dataset(file_name),bin=good_bin_num) print '(bin=%d)'%good_bin_num, # If bad or library, then profile_ct.main should raise SortSeqError if ('_bad' in file_name) or ('library' in file_name): try: self.assertRaises(SortSeqError,executable) print 'badtype,', except: print 'good (ERROR).' raise # If good, then profile_ct.main should produce a valid df elif ('_good' in file_name) or ('dataset' in file_name): try: df = executable() qc.validate_profile_ct(df) out_file = self.output_dir+\ 'profile_ct_bin_%s.txt'%description io.write(df,out_file) io.load_profile_ct(out_file) print 'good,', except: print 'bad (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.') # Should always raise an error if bin num is too large executable = lambda:\ profile_ct.main(io.load_dataset(file_name),bin=bad_bin_num) print '(bin=%d)'%bad_bin_num, try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise
def test_profile_ct_totalcounts(self): """ Test the ability of mpathic.profile_ct to count frequencies based on total count values """ print '\nIn test_profile_ct_totalcounts...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_ct.main(io.load_dataset(file_name)) # If good, then profile_ct.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_ct(df) out_file = self.output_dir+\ 'profile_ct_total_%s.txt'%description io.write(df,out_file) io.load_profile_ct(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_ct.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character counts at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
            It must contain exactly one sequence column and at least one
            row; the sequence length is taken from the first row.
        bin (int): A bin number specifying which counts to use. If None,
            the 'ct' column is used; otherwise the 'ct_<bin>' column.
        start (int): An integer specifying the sequence start position
            (inclusive).
        end (int): An integer specifying the sequence end position
            (exclusive). If None, the full sequence length is used.

    Returns:
        counts_df (pd.DataFrame): A dataframe containing counts for each
            nucleotide/amino acid character at each position. It has a
            'pos' column followed by one 'ct_<char>' column per character
            of the alphabet, and is passed through
            qc.validate_profile_ct(..., fix=True) before being returned.

    Raises:
        SortSeqError: if the dataset does not have exactly one sequence
            column, has no rows, if start/end are out of range, or if the
            requested counts column is missing.
    """
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Retrieve type of sequence
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, 'seqs')]
    if len(seq_cols) != 1:
        raise SortSeqError('Dataset dataframe must have only one seq colum.')
    colname = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[colname]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    num_chars = len(alphabet)

    # Retrieve sequence length. A single row is sufficient: only the first
    # row is inspected, and the error message promises "at least one row".
    if dataset_df.shape[0] < 1:
        raise SortSeqError('Dataset dataframe must have at least one row.')
    seqs = dataset_df[colname]
    total_seq_length = len(seqs.iloc[0])

    # Validate start and end
    if start < 0:
        raise SortSeqError('start=%d is negative.' % start)
    elif start >= total_seq_length:
        raise SortSeqError('start=%d >= total_seq_length=%d' %
                           (start, total_seq_length))

    if end is None:
        end = total_seq_length
    elif end <= start:
        raise SortSeqError('end=%d <= start=%d.' % (end, start))
    elif end > total_seq_length:
        # Report the offending end value (not start) in the message
        raise SortSeqError('end=%d > total_seq_length=%d' %
                           (end, total_seq_length))

    # Set positions
    poss = pd.Series(range(start, end), name='pos')
    num_poss = len(poss)

    # Retrieve counts
    ct_col = 'ct' if bin is None else 'ct_%d' % bin
    if ct_col not in dataset_df.columns:
        raise SortSeqError('Column "%s" is not in columns=%s' %
                           (ct_col, str(dataset_df.columns)))
    counts = dataset_df[ct_col]

    # Compute counts profile: for each position, sum the counts of all
    # sequences carrying each alphabet character at that position
    counts_array = np.zeros([num_poss, num_chars])
    counts_cols = ['ct_' + a for a in alphabet]
    for i, pos in enumerate(range(start, end)):
        char_list = seqs.str.slice(pos, pos + 1)
        counts_array[i, :] = [np.sum(counts[char_list == a])
                              for a in alphabet]
    temp_df = pd.DataFrame(counts_array, columns=counts_cols)
    counts_df = pd.concat([poss, temp_df], axis=1)

    # Validate as counts dataframe
    counts_df = qc.validate_profile_ct(counts_df, fix=True)
    return counts_df