def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character frequencies (0.0 to 1.0) at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        freq_df (pd.DataFrame): A dataframe containing frequencies for
            each nucleotide/amino acid character at each position.
    """
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Compute counts
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Create columns for profile_freqs table
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c, 'ct_')]
    freq_cols = ['freq_' + c.split('_')[1] for c in ct_cols]

    # Compute frequencies from counts
    freq_df = counts_df[ct_cols].div(counts_df['ct'], axis=0)
    freq_df.columns = freq_cols
    freq_df['pos'] = counts_df['pos']

    # Validate as frequencies dataframe
    freq_df = qc.validate_profile_freq(freq_df, fix=True)
    return freq_df
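
# Example (illustrative sketch only): exercising main() above on a tiny
# hand-built dataset. The layout (a 'seq' column plus a 'ct' count column)
# follows the dataset conventions used in this module; whether
# qc.validate_dataset accepts this minimal form is an assumption.
def _example_profile_freq():
    dataset_df = pd.DataFrame({
        'seq': ['ACGT', 'ACGA', 'TCGT'],
        'ct':  [2, 1, 1],
    })
    freq_df = main(dataset_df)
    # Frequencies at each position should sum to ~1.0 across characters
    freq_cols = [c for c in freq_df.columns if c.startswith('freq_')]
    assert np.allclose(freq_df[freq_cols].sum(axis=1), 1.0)
    return freq_df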
def main(filelist_df, tags_df=None, indir='./', seq_type=None):
    """ Merges datasets listed in the filelist_df dataframe
    """
    # Validate filelist
    qc.validate_filelist(filelist_df)

    # Read datasets into dictionary indexed by bin number
    dataset_df_dict = {}
    for idx, row in filelist_df.iterrows():
        # Autodetect fasta, fastq, or text file based on file extension
        fn = indir + row['file']
        b = row['bin']
        if re.search(fasta_filename_patterns, fn):
            df = io.load_dataset(fn, file_type='fasta', seq_type=seq_type)
        elif re.search(fastq_filename_patterns, fn):
            df = io.load_dataset(fn, file_type='fastq', seq_type=seq_type)
        else:
            df = io.load_dataset(fn, file_type='text', seq_type=seq_type)
        dataset_df_dict[b] = df

    # Merge datasets into one
    out_df = merge_datasets(dataset_df_dict)

    # Add seqs if given tags_df
    if tags_df is not None:
        qc.validate_tagkey(tags_df)
        tag_col = 'tag'

        # Test to make sure all tags in dataset are a subset of tags
        data_tags = set(out_df[tag_col])
        all_tags = set(tags_df[tag_col])
        if not (data_tags <= all_tags):
            sys.stderr.write('Some tags probably could not be identified.')

        # Get name of seq column
        seq_cols = qc.get_cols_from_df(tags_df, 'seqs')
        if not len(seq_cols) == 1:
            raise SortSeqError('Multiple seq columns; exactly 1 required.')
        seq_col = seq_cols[0]

        # Set tag to be index column of dataframe
        tags_df = tags_df.set_index(tag_col)

        # Add seqs corresponding to each tag
        tags = out_df[tag_col]
        seqs = tags_df[seq_col][tags].values
        if not all([type(x) == str for x in seqs]):
            raise SortSeqError('Some looked-up seqs are not strings.')
        out_df[seq_col] = seqs

    qc.validate_dataset(out_df)
    return out_df
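# Example (illustrative sketch): calling main() above with a file list.
# The file names below are hypothetical; the filelist layout (columns
# 'bin' and 'file') is taken from the loop body above.
def _example_preprocess():
    filelist_df = pd.DataFrame({
        'bin':  [0, 1],
        'file': ['bin_0.fastq', 'bin_1.fastq'],  # hypothetical files
    })
    # Merges the per-bin counts into one dataset with ct_0, ct_1 columns
    return main(filelist_df, indir='./data/', seq_type='dna')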
def main(dataset_df, bin=None, start=0, end=None, err=False):
    """
    Computes the mutation rate (0.0 to 1.0) at each position. Mutation
    rate is defined as 1.0 minus the maximum character frequency at a
    position. Errors are estimated using binomial uncertainty.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        err (bool): Whether to include an error estimate column

    Returns:
        mut_df (pd.DataFrame): A dataframe containing results.
    """
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Compute counts
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Create columns for profile_freqs table
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c, "ct_")]

    # Record positions in new dataframe
    mut_df = counts_df[["pos"]].copy()

    # Compute mutation rate across counts
    max_ct = counts_df[ct_cols].max(axis=1)
    sum_ct = counts_df[ct_cols].sum(axis=1)
    mut = 1.0 - (max_ct / sum_ct)
    mut_df["mut"] = mut

    # Computation of error rate is optional
    if err:
        mut_err = np.sqrt(mut * (1.0 - mut) / sum_ct)
        mut_df["mut_err"] = mut_err

    # Figure out which alphabet the cts dataframe specifies
    alphabet = "".join([c.split("_")[1] for c in ct_cols])
    seqtype = qc.alphabet_to_seqtype_dict[alphabet]
    wt_col = qc.seqtype_to_wtcolname_dict[seqtype]

    # Compute WT base at each position
    mut_df[wt_col] = "X"
    for col in ct_cols:
        indices = (counts_df[col] == max_ct).values
        mut_df.loc[indices, wt_col] = col.split("_")[1]

    # Validate as mutation-rate profile dataframe
    mut_df = qc.validate_profile_mut(mut_df, fix=True)
    return mut_df
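
# Example (illustrative sketch): mutation-rate profile with binomial
# error bars, using the same minimal hand-built dataset layout as above.
def _example_profile_mut():
    dataset_df = pd.DataFrame({
        'seq': ['ACGT', 'ACGA', 'TCGT'],
        'ct':  [2, 1, 1],
    })
    mut_df = main(dataset_df, err=True)
    # Position 0 sees 'A' three times out of four counts, so mut = 0.25
    # there, with mut_err = sqrt(0.25 * 0.75 / 4)
    return mut_df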
def test_preprocess(self):
    """ Test the ability of mpathic.preprocess to collate data in
    multiple sequence files
    """
    print '\nIn test_preprocess...'
    file_names = glob.glob(self.input_dir + 'files_*.txt')

    # Make sure there are files to test
    self.assertTrue(len(file_names) > 0)

    for file_name in file_names:
        print '\t%s =' % file_name,
        description = file_name.split('_')[-1].split('.')[0]

        # If fasta or fastq, assume dna
        if ('fasta' in file_name) or ('fastq' in file_name):
            seq_type = 'dna'
        else:
            seq_type = None

        executable = lambda: preprocess.main(
            io.load_filelist(file_name),
            indir=self.input_dir,
            seq_type=seq_type)

        # If _good or _fix, then preprocess.main should produce a valid df
        if ('_good' in file_name) or ('_fix' in file_name):
            try:
                df = executable()
                qc.validate_dataset(df)
                out_file = self.output_dir + 'dataset_%s.txt' % description
                io.write(df, out_file)     # Test write
                io.load_dataset(out_file)  # Test loading
                print 'good.'
            except:
                print 'bad (ERROR).'
                raise

        # If _bad, then preprocess.main should raise SortSeqError
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError, executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
def main(dataset_df, model_df, left=None, right=None):
    # Validate dataframes
    qc.validate_dataset(dataset_df)
    qc.validate_model(model_df)

    # Detect model type based on columns
    seqtype, modeltype = qc.get_model_type(model_df)
    seqcol = qc.seqtype_to_seqcolname_dict[seqtype]

    # Set start and end based on left or right
    if not ((left is None) or (right is None)):
        raise SortSeqError('Cannot set both left and right at same time.')
    if left is not None:
        start = left
        end = start + model_df.shape[0] + (1 if modeltype == 'NBR' else 0)
    elif right is not None:
        end = right
        start = end - model_df.shape[0] - (1 if modeltype == 'NBR' else 0)
    else:
        start = model_df['pos'].values[0]
        end = model_df['pos'].values[-1] + (2 if modeltype == 'NBR' else 1)
    assert start < end

    # Validate start and end positions
    seq_length = len(dataset_df[seqcol][0])
    if start < 0:
        raise SortSeqError('Invalid start=%d' % start)
    if end > seq_length:
        raise SortSeqError('Invalid end=%d for seq_length=%d' % (end, seq_length))

    # Select target sequence region
    out_df = dataset_df.copy()
    out_df.loc[:, seqcol] = out_df.loc[:, seqcol].str.slice(start, end)

    # Create model object of correct type
    if modeltype == 'MAT':
        mymodel = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        mymodel = Models.NeighborModel(model_df)
    else:
        raise SortSeqError('Unrecognized model type %s' % modeltype)

    # Compute values
    out_df['val'] = mymodel.evaluate(out_df)

    # Validate dataframe and return
    return qc.validate_dataset(out_df, fix=True)
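
# Example (illustrative sketch): evaluating a matrix ('MAT') model over a
# dataset. The model dataframe layout shown here (a 'pos' column plus one
# val_X column per character) is an assumption inferred from the code
# above; real models are normally produced by io.load_model.
def _example_evaluate_model():
    dataset_df = pd.DataFrame({'seq': ['ACGT', 'TCGA'], 'ct': [1, 1]})
    model_df = pd.DataFrame({
        'pos':   [0, 1],
        'val_A': [0.0, 0.1], 'val_C': [0.2, 0.3],
        'val_G': [0.4, 0.5], 'val_T': [0.6, 0.7],
    })
    # Adds a 'val' column scoring positions 0-1 of each sequence
    return main(dataset_df, model_df)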
def wrapper(args):
    try:
        npar = args.noiseparam.strip("[").strip("]").split(",")
    except:
        npar = []
    nbins = args.nbins

    # Run function
    if args.i:
        df = pd.io.parsers.read_csv(
            args.i, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    else:
        df = pd.io.parsers.read_csv(
            sys.stdin, delim_whitespace=True, dtype={"seqs": str, "batch": int})

    # Make sure the library is not already sorted
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError("Library already sorted!")

    model_df = io.load_model(args.model)
    output_df = main(df, model_df, args.noisemodel, npar, nbins,
                     start=args.start, end=args.end)

    if args.out:
        outloc = open(args.out, "w")
    else:
        outloc = sys.stdout
    pd.set_option("max_colwidth", int(1e8))

    # Validate dataframe for writing
    output_df = qc.validate_dataset(output_df, fix=True)
    io.write(output_df, outloc)
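
# Example (illustrative sketch): driving wrapper() above without a real
# command line, via an argparse-style namespace. The attribute names are
# read from wrapper()'s body; the file names and the 'Normal' noise-model
# name are hypothetical stand-ins.
def _example_wrapper_cli():
    from argparse import Namespace
    args = Namespace(
        noiseparam='[0.2]',
        nbins=4,
        i='library.txt',    # hypothetical input dataset
        model='model.txt',  # hypothetical model file
        noisemodel='Normal',
        start=0, end=None,
        out='sorted_library.txt',
    )
    wrapper(args)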
def wrapper(args):
    T_LibCounts = args.totallibcounts
    T_mRNACounts = args.totalmRNAcounts
    if T_LibCounts <= 0 or T_mRNACounts <= 0:
        raise SortSeqError('Counts must be greater than zero')
    model_df = io.load_model(args.model)
    if args.i:
        df = pd.io.parsers.read_csv(args.i, delim_whitespace=True)
    else:
        df = pd.io.parsers.read_csv(sys.stdin, delim_whitespace=True)

    # Make sure the library is not already sorted
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError('Library already sorted!')
    libcounts, expcounts = main(df, model_df, T_LibCounts, T_mRNACounts,
                                start=args.start, end=args.end)

    # Add these counts to the input dataframe
    lc = pd.Series(libcounts, name='ct_0')
    ec = pd.Series(expcounts, name='ct_1')
    df['ct_0'] = lc
    df['ct_1'] = ec
    df['ct'] = df[['ct_0', 'ct_1']].sum(axis=1)

    if args.out:
        outloc = open(args.out, 'w')
    else:
        outloc = sys.stdout
    pd.set_option('max_colwidth', int(1e8))

    # Validate dataframe for writing
    df = qc.validate_dataset(df, fix=True)
    io.write(df, outloc)
def merge_datasets(dataset_df_dict):
    """
    Merges multiple datasets into one. Data from disparate files is
    merged via values in the 'tag', 'seq', 'seq_rna', or 'seq_pro'
    columns (in order of preference, chosen according to availability).
    Each value in the 'ct' column of each dataset is recorded in the
    'ct_[bin]' column of the final dataset. A total 'ct' column is then
    computed, and rows in the final dataset are sorted in descending
    order according to this.

    Arguments:
        dataset_df_dict (dict): Keys are bin numbers, values are
            dataset dataframes

    Returns:
        out_df (pd.DataFrame): A validated dataset dataframe
    """
    # Make sure datasets were loaded
    if not len(dataset_df_dict) >= 1:
        raise SortSeqError('No datasets were loaded')

    # Determine index column. Must be same for all files
    df = dataset_df_dict.values()[0]
    if 'tag' in df.columns:
        index_col = 'tag'
    elif 'seq' in df.columns:
        index_col = 'seq'
    elif 'seq_rna' in df.columns:
        index_col = 'seq_rna'
    elif 'seq_pro' in df.columns:
        index_col = 'seq_pro'
    else:
        raise SortSeqError('No tag or seq column found in first dataset.')

    # Concatenate dataset dataframes
    out_df = pd.DataFrame()
    for b in dataset_df_dict.keys():
        df = dataset_df_dict[b]

        # Verify that dataframe has the required columns
        if not index_col in df.columns:
            raise SortSeqError(
                'Dataframe does not contain index_col="%s"' % index_col)
        if not 'ct' in df.columns:
            raise SortSeqError('Dataframe does not contain a "ct" column')

        # Delete "ct_X" columns
        for col in df.columns:
            if qc.is_col_type(col, 'ct_'):
                del df[col]

        # Add bin number to name of counts column
        df = df.rename(columns={'ct': 'ct_%d' % b})

        # Index dataset by index_col
        df = df.groupby(index_col).sum()

        # Concatenate
        out_df = pd.concat([out_df, df], axis=1)

    # Rename index as index_col
    out_df.reset_index(inplace=True)
    out_df.rename(columns={'index': index_col}, inplace=True)

    # Fill undefined counts with zero
    out_df.fillna(value=0, inplace=True)

    # Add 'ct' column, with proper counts
    out_df['ct'] = 0
    for col in out_df.columns:
        if qc.is_col_type(col, 'ct_'):
            out_df['ct'] += out_df[col]

    # Sort by 'ct' column
    out_df.sort('ct', ascending=False, inplace=True)
    out_df.reset_index(drop=True, inplace=True)

    # Validate out_df as dataset and return it
    out_df = qc.validate_dataset(out_df, fix=True)
    return out_df
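
# Example (illustrative sketch): merging two single-bin datasets that
# share a 'seq' index column. Counts from bins 0 and 1 land in ct_0 and
# ct_1, with missing combinations filled with zero.
def _example_merge_datasets():
    df0 = pd.DataFrame({'seq': ['AAA', 'CCC'], 'ct': [3, 1]})
    df1 = pd.DataFrame({'seq': ['AAA', 'GGG'], 'ct': [2, 4]})
    out_df = merge_datasets({0: df0, 1: df1})
    # 'CCC' gets ct_1 == 0 and 'GGG' gets ct_0 == 0; rows come back
    # sorted by the total 'ct' column in descending order.
    return out_df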
def main(dataset_df, err=False, method="naive", pseudocount=1.0, start=0, end=None):
    """
    Computes the mutual information (in bits), at each position, between
    the character and the bin number.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        err (bool): Whether to include an error estimate column
        method (str): Which method to use to estimate mutual information
        pseudocount (float): Pseudocount used by the estimator
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        info_df (pd.DataFrame): A dataframe containing results.
    """
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Get number of bins
    bin_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "ct_")]
    if not len(bin_cols) >= 2:
        raise SortSeqError("Information profile requires at least 2 bins.")
    bins = [int(c.split("_")[1]) for c in bin_cols]
    num_bins = len(bins)

    # Get number of characters
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "seqs")]
    if not len(seq_cols) == 1:
        raise SortSeqError("Must be only one seq column.")
    seq_col = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[seq_col]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    ct_cols = ["ct_" + a for a in alphabet]
    num_chars = len(alphabet)

    # Get sequence length and check start, end numbers
    num_pos = len(dataset_df[seq_col][0])
    if not (0 <= start < num_pos):
        raise SortSeqError("Invalid start==%d, num_pos==%d" % (start, num_pos))
    if end is None:
        end = num_pos
    elif end > num_pos:
        raise SortSeqError("Invalid end==%d, num_pos==%d" % (end, num_pos))
    elif end <= start:
        raise SortSeqError("Invalid: start==%d >= end==%d" % (start, end))

    # Record positions in new dataframe
    counts_df = profile_ct.main(dataset_df)
    info_df = counts_df.loc[start:(end - 1), ["pos"]].copy()  # rows from start:end
    info_df["info"] = 0.0
    if err:
        info_df["info_err"] = 0.0

    # Fill in 3D array of counts
    ct_3d_array = np.zeros([end - start, num_chars, num_bins])
    for i, bin_num in enumerate(bins):
        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin_num)

        # Fill in counts table
        ct_3d_array[:, :, i] = counts_df.loc[start:(end - 1), ct_cols].astype(float)

    # Compute mutual information for each position
    for i in range(end - start):  # i only from start:end
        # Get 2D counts
        nxy = ct_3d_array[i, :, :]
        assert len(nxy.shape) == 2

        # Compute mutual information
        if err:
            mi, mi_err = info.estimate_mutualinfo(
                nxy, err=True, method=method, pseudocount=pseudocount)
            info_df.loc[i + start, "info"] = mi
            info_df.loc[i + start, "info_err"] = mi_err
        else:
            mi = info.estimate_mutualinfo(
                nxy, err=False, method=method, pseudocount=pseudocount)
            info_df.loc[i + start, "info"] = mi

    # Validate info dataframe
    info_df = qc.validate_profile_info(info_df, fix=True)
    return info_df
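
# Example (illustrative sketch): an information profile needs a dataset
# with at least two ct_X bin columns, per the check above. The dataset
# below is hand-built; the activity pattern is contrived so position 3
# perfectly separates the bins.
def _example_profile_info():
    dataset_df = pd.DataFrame({
        'seq':  ['ACGT', 'ACGA', 'TCGT', 'TCGA'],
        'ct':   [5, 5, 5, 5],
        'ct_0': [5, 0, 5, 0],
        'ct_1': [0, 5, 0, 5],
    })
    # Position 3 (T in bin 0, A in bin 1) carries nonzero mutual
    # information; position 0 is split equally between bins and does not.
    return main(dataset_df, err=False, method='naive')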
def main(wtseq=None, mutrate=0.10, numseq=10000, dicttype='dna',
         probarr=None, tags=False, tag_length=10):
    # Generate sequence dictionary
    seq_dict, inv_dict = utils.choose_dict(dicttype)

    mutrate = float(mutrate)
    if (mutrate < 0.0) or (mutrate > 1.0):
        raise SortSeqError('Invalid mutrate==%f' % mutrate)

    numseq = int(numseq)
    if numseq <= 0:
        raise SortSeqError('numseq must be positive. Is %d' % numseq)

    tag_length = int(tag_length)
    if tag_length <= 0:
        raise SortSeqError('tag_length must be positive. Is %d' % tag_length)

    if isinstance(probarr, np.ndarray):
        L = probarr.shape[1]
        # Generate bases according to provided probability matrix
        letarr = np.zeros([numseq, L])
        for z in range(L):
            letarr[:, z] = np.random.choice(
                range(len(seq_dict)), numseq, p=probarr[:, z])
    else:
        wtseq = wtseq.upper()
        L = len(wtseq)

        # Check to make sure the wtseq uses the correct bases
        lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype, modeltype='MAT')

        def check_sequences(s):
            return set(s).issubset(lin_seq_dict)

        if not check_sequences(wtseq):
            raise SortSeqError(
                'wtseq can only contain bases in ' + str(lin_seq_dict.keys()))

        # Find wtseq array
        wtarr = seq2arr(wtseq, seq_dict)
        mrate = mutrate / (len(seq_dict) - 1)  # prob of non-wildtype

        # Generate sequences by mutating away from wildtype. parr holds
        # probabilities of moves away from wildtype (0 = stays the same;
        # a 3, for example, means a C becomes an A, a 1 means C -> G).
        parr = np.array(
            [1 - (len(seq_dict) - 1) * mrate] +
            [mrate for i in range(len(seq_dict) - 1)])

        # Generate random movements from wtseq
        letarr = np.random.choice(
            range(len(seq_dict)), [numseq, len(wtseq)], p=parr)

        # Find sequences
        letarr = np.mod(letarr + wtarr, len(seq_dict))

    # Convert back to letters
    seqs = []
    for i in range(numseq):
        seqs.append(arr2seq(letarr[i, :], inv_dict))

    seq_col = qc.seqtype_to_seqcolname_dict[dicttype]
    seqs_df = pd.DataFrame(seqs, columns=[seq_col])

    # If simulating tags, each generated seq gets a unique tag
    if tags:
        tag_seq_dict, tag_inv_dict = utils.choose_dict('dna')
        tag_alphabet_list = tag_seq_dict.keys()

        # Make sure tag_length is long enough for the number of tags needed
        if len(tag_alphabet_list) ** tag_length < 2 * numseq:
            raise SortSeqError(
                'tag_length=%d is too short for num_tags_needed=%d' %
                (tag_length, numseq))

        # Generate a unique tag for each unique sequence
        tag_set = set([])
        while len(tag_set) < numseq:
            num_tags_left = numseq - len(tag_set)
            new_tags = [''.join(choice(tag_alphabet_list, size=tag_length))
                        for i in range(num_tags_left)]
            tag_set = tag_set.union(new_tags)

        df = seqs_df.copy()
        df.loc[:, 'ct'] = 1
        df.loc[:, 'tag'] = list(tag_set)

    # If not simulating tags, list only unique seqs w/ corresponding counts
    else:
        seqs_counts = seqs_df[seq_col].value_counts()
        df = seqs_counts.reset_index()
        df.columns = [seq_col, 'ct']

    # Convert into valid dataset dataframe and return
    return qc.validate_dataset(df, fix=True)
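
# Example (illustrative sketch): simulating a small mutagenized library
# around a wild-type sequence, with and without per-read tags. Parameter
# values here are arbitrary demo choices.
def _example_simulate_library():
    # Unique sequences with aggregated counts
    df = main(wtseq='ACGTACGT', mutrate=0.1, numseq=1000)
    # One row per read, each with a unique 10-bp DNA tag
    tagged_df = main(wtseq='ACGTACGT', mutrate=0.1, numseq=1000, tags=True)
    return df, tagged_df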
def main(dataset_df, bin=None, start=0, end=None, bins_df=None,
         pseudocounts=1, return_profile=False):
    """
    Computes an activity profile: the average bin activity of each
    nucleotide/amino acid character at each position, reported relative
    to the wild-type character.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        bins_df (pd.DataFrame): A dataframe assigning an activity to each
            bin; if None, each bin's number is used as its activity.
        pseudocounts (int): Pseudocount added to each character count
        return_profile (bool): If True, return one mutation-activity
            value per position instead of per-character activities.

    Returns:
        output_df (pd.DataFrame): A dataframe containing results.
    """
    seq_cols = qc.get_cols_from_df(dataset_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]
    seq_dict, inv_dict = utils.choose_dict(dicttype)

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # For each bin we need to find the character frequency profile, then
    # sum over all bins to get activity. First make sure we have the
    # activity of each bin:
    if bins_df is None:
        bins = utils.get_column_headers(dataset_df)
        # In this case no activity was specified, so just assume the
        # activity equals the bin number
        activity = [float(b.split('_')[-1]) for b in bins]
    else:
        bins = list(bins_df['bins'])
        activity = list(bins_df['activity'])

    # Initialize dataframe for total counts in all bins
    output_ct_df = pd.DataFrame()
    # Initialize dataframe for running activity calculation
    output_activity_df = pd.DataFrame()

    for i, b in enumerate(bins):
        bin_num = int(b.split('_')[-1])

        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin_num,
                                    start=start, end=end)

        # Create columns for profile_freqs table
        ct_cols = utils.get_column_headers(counts_df)

        # Add pseudocounts
        counts_df[ct_cols] = counts_df[ct_cols] + pseudocounts

        # Add to all previous bin counts
        if i == 0:
            output_ct_df = counts_df[ct_cols]
            output_activity_df = counts_df[ct_cols] * activity[i]
        else:
            output_ct_df = output_ct_df + counts_df[ct_cols]
            output_activity_df = (output_activity_df +
                                  counts_df[ct_cols] * activity[i])

    # Normalize by each character at each position; this is the
    # activity profile
    output_activity_df = output_activity_df[ct_cols].div(output_ct_df[ct_cols])

    mut_rate = profile_mut.main(dataset_df, bin=bin)
    freq = profile_freq.main(dataset_df, bin=bin)
    freq_cols = [x for x in freq.columns if 'freq_' in x]

    # Normalize by the wild-type activity
    wtseq = ''.join(mut_rate['wt'])
    wtarr = utils.seq2mat(wtseq, seq_dict)
    wt_activity = np.transpose(wtarr) * (np.array(output_activity_df[ct_cols]))

    # Sum this to get the total
    wt_activity2 = wt_activity.sum(axis=1)
    delta_activity = output_activity_df.subtract(pd.Series(wt_activity2), axis=0)

    if return_profile:
        # First find mutation rate according to the formula in the SI text
        profile_delta_activity = mut_rate['mut'] * np.sum(
            (1 - np.transpose(wtarr)) * np.array(freq[freq_cols]) *
            np.array(delta_activity), axis=1)

        # Format into dataframe
        output_df = pd.DataFrame()
        output_df['pos'] = range(start, start + len(profile_delta_activity.index))
        output_df['mut_activity'] = profile_delta_activity
        return output_df
    else:
        # Just add pos column and rename counts columns to activity columns
        output_df = pd.DataFrame(delta_activity)
        output_df.insert(0, 'pos', range(start, start + len(delta_activity.index)))
        activity_col_dict = {x: 'activity_' + x.split('_')[-1]
                             for x in delta_activity.columns if 'ct_' in x}
        output_df = output_df.rename(columns=activity_col_dict)
        return output_df
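
# Example (illustrative sketch): computing an activity profile from a
# two-bin dataset. The bins_df layout ('bins' and 'activity' columns)
# follows the access pattern above; the activity values are hypothetical.
def _example_activity_profile():
    dataset_df = pd.DataFrame({
        'seq':  ['ACGT', 'ACGA', 'TCGT', 'TCGA'],
        'ct':   [5, 5, 5, 5],
        'ct_0': [5, 0, 5, 0],
        'ct_1': [0, 5, 0, 5],
    })
    bins_df = pd.DataFrame({'bins': ['ct_0', 'ct_1'],
                            'activity': [0.0, 1.0]})
    # Per-character activities relative to wild type...
    full_df = main(dataset_df, bins_df=bins_df)
    # ...or one mutation-activity value per position
    prof_df = main(dataset_df, bins_df=bins_df, return_profile=True)
    return full_df, prof_df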
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character counts at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        counts_df (pd.DataFrame): A dataframe containing counts for each
            nucleotide/amino acid character at each position.
    """
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Retrieve type of sequence
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, 'seqs')]
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataset dataframe must have only one seq column.')
    colname = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[colname]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    num_chars = len(alphabet)

    # Retrieve sequence length
    if not dataset_df.shape[0] >= 1:
        raise SortSeqError('Dataset dataframe must have at least one row.')
    total_seq_length = len(dataset_df[colname].iloc[0])

    # Validate start and end
    if start < 0:
        raise SortSeqError('start=%d is negative.' % start)
    elif start >= total_seq_length:
        raise SortSeqError('start=%d >= total_seq_length=%d' %
                           (start, total_seq_length))
    if end is None:
        end = total_seq_length
    elif end <= start:
        raise SortSeqError('end=%d <= start=%d.' % (end, start))
    elif end > total_seq_length:
        raise SortSeqError('end=%d > total_seq_length=%d' %
                           (end, total_seq_length))

    # Set positions
    poss = pd.Series(range(start, end), name='pos')
    num_poss = len(poss)

    # Retrieve counts
    if bin is None:
        ct_col = 'ct'
    else:
        ct_col = 'ct_%d' % bin
    if not ct_col in dataset_df.columns:
        raise SortSeqError('Column "%s" is not in columns=%s' %
                           (ct_col, str(dataset_df.columns)))
    counts = dataset_df[ct_col]

    # Compute counts profile
    counts_array = np.zeros([num_poss, num_chars])
    counts_cols = ['ct_' + a for a in alphabet]
    for i, pos in enumerate(range(start, end)):
        char_list = dataset_df[colname].str.slice(pos, pos + 1)
        counts_array[i, :] = [np.sum(counts[char_list == a]) for a in alphabet]
    temp_df = pd.DataFrame(counts_array, columns=counts_cols)
    counts_df = pd.concat([poss, temp_df], axis=1)

    # Validate as counts dataframe
    counts_df = qc.validate_profile_ct(counts_df, fix=True)
    return counts_df
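
# Example (illustrative sketch): counting characters over a sub-range of
# positions, using the same minimal dataset layout as the examples above.
def _example_profile_ct():
    dataset_df = pd.DataFrame({
        'seq': ['ACGT', 'ACGA', 'TCGT'],
        'ct':  [2, 1, 1],
    })
    counts_df = main(dataset_df, start=1, end=3)
    # Two rows (pos 1 and 2); each row's ct_* values sum to the total
    # count of 4.
    return counts_df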