def df_list0(self, col_name, sep1=None, sep2=None):
    """Return the distinct entries found in column ``col_name`` of ``self.df``.

    Args:
        col_name: key used to look the column up in ``self.df``.
        sep1: optional separator; when given, every cell is converted to
            ``str`` and split on it.
        sep2: optional second separator; when given together with ``sep1``,
            only the text before the first ``sep2`` of each piece is kept.

    Returns:
        With ``sep1`` unset, the raw cell values de-duplicated through a
        ``set`` (order is arbitrary).  Otherwise whatever
        ``myList.basic(<distinct pieces>).sort_list()`` returns.

    Unlike ``df_list`` a failing column lookup is not caught here: whatever
    ``self.df[col_name]`` raises propagates to the caller.
    """
    inlist = list(self.df[col_name])
    if sep1 is None:
        return list(set(inlist))

    if sep2 is not None:
        # Debug trace kept from the original so stdout stays unchanged.
        print('sep12')

    # Dict keys de-duplicate while keeping first-seen order.
    out_dict = {}
    for cell in inlist:
        # str() so non-string cells (e.g. NaN floats) cannot break split();
        # the sep1-only path previously called .split on the raw value.
        for piece in str(cell).split(sep1):
            item = piece if sep2 is None else piece.split(sep2)[0]
            out_dict[item] = 0
    return myList.basic(out_dict.keys()).sort_list()
def permute_taxon_blast(self, hits_num):
    """Return the row-wise mean of permuted hit counts for ``hits_num`` hits.

    The permutation table is cached in a tab-separated file whose path is
    the value returned by ``myIO.dir_os(self.par['dir_out']).create_dir()``
    followed by ``'<hits_num>.txt'``.  When that file exists it is read
    back; otherwise the table is computed and written there.

    Args:
        hits_num: number of hits, forwarded to
            ``myList.basic(...).permute_list`` and used in the cache name.

    Returns:
        For every row of the count table, the mean of the floored counts
        across all permutation columns, rounded.
    """
    print('permutation of viral blast:{}\t{}'.format(self.par['type'], hits_num))
    counts_df = pd.DataFrame()
    outfile = '{}{}.txt'.format(
        myIO.dir_os(self.par['dir_out']).create_dir(), hits_num)

    if os.path.isfile(outfile):
        print('Read file: ', outfile)
        counts_df = pd.read_csv(outfile, header=0, index_col=0, sep="\t",
                                low_memory=False)
    else:
        binary_aln_df = self.par['binary_aln_df']
        # 1: permuted peptide names drawn from the alignment index
        pep_names = list(binary_aln_df.index)
        pep_df = myList.basic(pep_names).permute_list(
            self.par['permutation_times'], hits_num)
        # 2: one count column per permutation
        for col, perm_pep in pep_df.items():
            # Label-based row selection; DataFrame.ix no longer exists in
            # pandas >= 1.0.  Assumes perm_pep holds index labels -- they
            # are drawn from binary_aln_df.index above.
            perm_zb = binary_aln_df.loc[perm_pep]
            p_collapse_zb, p_sim_tag = myDataframe.basic(perm_zb).unispecie(
                self.par['sim_threshold'])
            counts_df[col] = p_collapse_zb.apply(sum, axis=0) + p_sim_tag
        # export so the next call can reuse the table
        counts_df.to_csv(outfile, sep='\t', header=True,
                         index_label=self.par['type'])

    # combine the permuted counts
    perm_mean = counts_df.apply(
        lambda x: np.mean(np.floor(x)), axis=1).round()
    return perm_mean
def df_list(self, col_name, sep1=None, sep2=None):
    """Return the distinct entries found in column ``col_name`` of ``self.df``.

    Args:
        col_name: key used to look the column up in ``self.df``.
        sep1: optional separator; when given, every cell is converted to
            ``str`` and split on it.
        sep2: optional second separator; when given together with ``sep1``,
            only the text before the first ``sep2`` of each piece is kept.

    Returns:
        ``[]`` (after printing a message) when the column cannot be read.
        With ``sep1`` unset, the raw cell values de-duplicated through a
        ``set`` (order is arbitrary).  Otherwise whatever
        ``myList.basic(<distinct pieces>).sort_list()`` returns.
    """
    try:
        inlist = list(self.df[col_name])
    except Exception:
        # Best-effort lookup as before, but KeyboardInterrupt/SystemExit
        # are no longer swallowed the way the bare ``except:`` did.
        print('No column name in the data frame:', col_name)
        return []

    if sep1 is None:
        return list(set(inlist))

    # Dict keys de-duplicate while keeping first-seen order.
    out_dict = {}
    for cell in inlist:
        # str() avoids failures on null / non-string cells
        for piece in str(cell).split(sep1):
            item = piece if sep2 is None else piece.split(sep2)[0]
            out_dict[item] = 0
    return myList.basic(out_dict.keys()).sort_list()
def combine_fa(self, fa_dict):
    """Write one FASTA record per entry of ``fa_dict`` into ``self.biofile``.

    Args:
        fa_dict: mapping of record name -> FASTA file path.  Records are
            written in the order returned by
            ``myList.basic(fa_dict.keys()).sort_list()``; the sequence of
            each record is ``genome(path).read_fa_first()``.

    ``self.biofile`` is opened in text write mode, so an existing file is
    overwritten.
    """
    chrs = myList.basic(fa_dict.keys()).sort_list()
    # ``with`` closes the output even when reading one of the inputs fails.
    with open(self.biofile, 'wt') as out_obj:
        for chr_name in chrs:
            fa_file = fa_dict[chr_name]
            seq = genome(fa_file).read_fa_first()
            out_obj.write('>{}\n{}\n'.format(chr_name, seq))
            print('\t{}:{}'.format(chr_name, fa_file))
    print('Combine fa files into ', self.biofile)
def hits_func(x, peps, threshold, pro_id):
    """Summarise the entries of ``x`` that reach ``threshold``.

    Args:
        x: indexable by a boolean mask and exposing ``.index`` on the
            result (presumably a pandas Series -- confirm with callers).
        peps: iterable of peptide identifiers; each is converted to ``str``.
        threshold: entries with a value ``>= threshold`` count as hits.
        pro_id: not used by the computation.

    Returns:
        A 3-tuple: the number of hits, the value returned by
        ``myList.basic(peps).un_neighbours(hit_peps, return_type='hits_num')``,
        and the hit identifiers joined with ``','``.
    """
    # significant hits
    significant = x[x >= threshold]
    # identifiers as strings, both for the full set and for the hits
    all_pep_names = list(map(str, peps))
    hit_peps = list(map(str, significant.index))
    non_overlapped = myList.basic(all_pep_names).un_neighbours(
        hit_peps, return_type='hits_num')
    hits_total = len(list(significant.index))
    return hits_total, non_overlapped, ','.join(hit_peps)
def __init__(self, par):
    """Store plot data and drawing options taken from the dict ``par``.

    Data source, checked in this order:
      * ``par['df']``: kept as ``self.data``; ``nrow``, ``ncol`` and
        ``colnames`` come from its ``shape`` / ``columns``.  Axis labels
        default to the first and second column names, and axis limits to
        ``myList.basic(<column>).min_max()`` of the first and second
        column.  The current matplotlib figure is cleared.
      * ``par['list']``: kept as ``self.data``; labels default to
        ``'x'`` / ``'y'`` and limits to ``None``.
      * neither: ``self.data`` is ``None``, an error line is printed, and
        the label / limit attributes are left undefined.

    Optional keys with their defaults: ``picfile`` (None), ``title``
    ('Plot'), ``text`` (None), ``legend`` (False), ``pch`` ('o'),
    ``lty`` ('solid'), ``lwd`` (1).
    """
    if 'df' in par:
        self.data = par['df']
        self.nrow = self.data.shape[0]
        self.ncol = self.data.shape[1]
        self.colnames = self.data.columns
        # Conditional expressions (not dict.get) so the fallbacks are only
        # evaluated when the key is really missing.
        self.xlabel = par['xlabel'] if 'xlabel' in par else list(self.data)[0]
        self.ylabel = par['ylabel'] if 'ylabel' in par else list(self.data)[1]
        # Positional column access; DataFrame.ix no longer exists in
        # pandas >= 1.0.
        self.xlim = (par['xlim'] if 'xlim' in par
                     else myList.basic(self.data.iloc[:, 0]).min_max())
        self.ylim = (par['ylim'] if 'ylim' in par
                     else myList.basic(self.data.iloc[:, 1]).min_max())
        # initiate plot window
        plt.clf()
    elif 'list' in par:
        self.data = par['list']
        self.xlabel = par.get('xlabel', 'x')
        self.ylabel = par.get('ylabel', 'y')
        self.xlim = par.get('xlim')
        self.ylim = par.get('ylim')
    else:
        self.data = None
        print('Error:No data frame input as long as drawing a plot!')

    # output file
    self.picfile = par.get('picfile')
    # colors
    self.col = 'bgrcmykw'
    # line styles: solid, dashed, dotted, dashdot
    self.title = par.get('title', 'Plot')
    self.text = par.get('text')
    self.legend = par.get('legend', False)
    self.pch = par.get('pch', 'o')
    self.lty = par.get('lty', 'solid')
    self.lwd = par.get('lwd', 1)
def taxon_blast2(self, file_aln, zscore_file):
    """Count taxon hits per sample and export counts, peptides and p-values.

    For every column (sample) of the z-score table, values
    ``>= self.par['specieZ_threshold']`` are kept, sorted in descending
    order and reduced by ``myList.basic(hits).gen_ind_hits(...)``.  The
    alignment rows of the remaining peptides are passed to
    ``myDataframe.basic(...).binom_unispecie(...)``, whose three results
    fill the per-sample columns of the count, peptide and p-value tables.

    Four tab-separated files are written, all prefixed with
    ``<zscore file prefix>_<taxon_type>_``: ``counting.txt``,
    ``peptides.txt``, ``p-values.txt`` and ``padjusted.txt`` (p-values
    adjusted per sample with ``multipletests(..., method='fdr_bh')``).

    Args:
        file_aln: alignment file read with ``myDataframe.basic().aln_df``.
        zscore_file: z-score file read with
            ``myDataframe.basic().standard_df``.
    """
    taxon_type = myIO.file_os(file_aln).name_prefix()
    print("\n{}:{}\n".format(taxon_type, zscore_file))
    # read zscore_df
    zdf = myDataframe.basic().standard_df(zscore_file)
    # read the alignment table, match its row order to zdf, replace NA
    binary_b = myDataframe.basic().aln_df(file_aln, self.par['align_score'])
    binary_b = binary_b.reindex(zdf.index).fillna(0)

    # result tables: sample names in columns, alignment columns in rows
    species = list(binary_b)
    samples = list(zdf)
    sum_df = pd.DataFrame(0, index=species, columns=samples)
    pep_df = pd.DataFrame(np.nan, index=species, columns=samples)
    p_df = pd.DataFrame(index=species, columns=samples)

    # .items() replaces iteritems(), which was removed in pandas 2.0
    for sample_name, column in zdf.items():
        # 1: select peptides at or above the threshold, strongest first
        hits = column[column >= self.par['specieZ_threshold']].copy()
        hits.sort_values(axis=0, ascending=False, inplace=True)
        # remove overlapped hits
        nonoverlap_hits = myList.basic(hits).gen_ind_hits(
            self.par['dependent_pep'])
        input_num = len(nonoverlap_hits)
        print("{}:\thits={}, nonoverlapped={}".format(
            sample_name, len(hits), input_num))
        if input_num == 0:
            continue

        # 2: remove overlap hits between species
        zb_df = binary_b.loc[nonoverlap_hits.index]
        collapse_zb, sim_tag, p_series = myDataframe.basic(
            zb_df).binom_unispecie(self.par['dir_ref_seq'], input_num,
                                   self.par['p_threshold'],
                                   self.par['x_threshold'])
        # counts of hits
        sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
        # all peptide_id list
        pep_df[sample_name] = collapse_zb.apply(
            lambda x: myList.basic(x).names_string(0.001), axis=0)
        p_df[sample_name] = p_series

    # export to file
    file_head = myIO.file_os(
        zscore_file).file_prefix() + '_' + taxon_type + '_'
    sum_df.to_csv(file_head + 'counting.txt', sep='\t', header=True,
                  index_label='Specie')
    pep_df.to_csv(file_head + 'peptides.txt', sep='\t', header=True,
                  index_label='Specie')
    p_df.to_csv(file_head + 'p-values.txt', sep='\t', header=True,
                index_label='Specie')

    # Adjusted p-values using B-H, one sample (column) at a time
    padjust_df = pd.DataFrame(index=species, columns=samples)
    for sample_name in p_df.columns:
        # Force float: columns of samples without hits are still the
        # object-dtype NaN placeholders, on which np.isnan/np.isfinite
        # raise TypeError.
        pvals = np.asarray(p_df[sample_name].values, dtype=float)
        finite = np.isfinite(pvals)
        if not finite.any():
            # nothing to adjust; the column stays empty
            continue
        pval_corrected = np.full(pvals.shape, np.nan)
        pval_corrected[finite] = multipletests(pvals[finite],
                                               method='fdr_bh')[1]
        padjust_df[sample_name] = pval_corrected
    padjust_df.to_csv(file_head + 'padjusted.txt', sep='\t', header=True,
                      index_label='Specie')
def taxon_blast(self, file_aln, zscore_file):
    """Count taxon hits per sample and export the counts and peptide lists.

    For every column (sample) of the z-score table, values
    ``>= self.par['specieZ_threshold']`` are kept, sorted in descending
    order and reduced by ``myList.basic(hits).remove_overlap(...)``.  The
    second value returned by ``remove_overlap`` is written to
    ``<dir_result><sample>/<taxon_type>.csv``; if that raises
    ``FileNotFoundError`` the sample is recorded through
    ``myIO.file_os(self.par['file_err'], "\\t").line_replace``.  The
    alignment rows of the remaining peptides go through
    ``myDataframe.basic(...).unispecie(...)`` to fill the result tables.

    Two tab-separated files are written, prefixed with
    ``<zscore file prefix>_<taxon_type>_``: ``counting.txt`` and
    ``peptides.txt``.

    Args:
        file_aln: alignment file read with ``myDataframe.basic().aln_df``.
        zscore_file: z-score file read with
            ``myDataframe.basic().standard_df``.
    """
    print(
        '###Signficant taxon by removing overlapped hits based on blast alignment.'
    )
    taxon_type = myIO.file_os(file_aln).name_prefix()
    print('{}: {}'.format(taxon_type, zscore_file))
    # read zscore_df
    zdf = myDataframe.basic().standard_df(zscore_file)
    # read the alignment table, match its row order to zdf, replace NA
    binary_b = myDataframe.basic().aln_df(file_aln, self.par['align_score'])
    binary_b = binary_b.reindex(zdf.index).fillna(0)

    # result tables: sample names in columns, alignment columns in rows
    species = list(binary_b)
    samples = list(zdf)
    sum_df = pd.DataFrame(0, index=species, columns=samples)
    pep_df = pd.DataFrame(np.nan, index=species, columns=samples)

    for sample_name, column in zdf.items():
        # 1: select peptides at or above the threshold, strongest first
        hits = column[column >= self.par['specieZ_threshold']].copy()
        hits.sort_values(axis=0, ascending=False, inplace=True)
        # remove overlapped hits
        nonoverlap_hits, overlap_debug = myList.basic(hits).remove_overlap(
            self.par['dependent_pep'])
        input_num = len(nonoverlap_hits)
        print('{}: hits={}, nonoverlapped={}'.format(
            sample_name, len(hits), input_num))
        if input_num == 0:
            continue

        # 2-1: export peptides
        try:
            outfile = '{}{}/{}.csv'.format(self.par['dir_result'],
                                           sample_name, taxon_type)
            overlap_debug.to_csv(outfile, header=True,
                                 index_label='peptides')
        except FileNotFoundError:
            # the sample directory is missing: log it and carry on
            myIO.file_os(self.par['file_err'], "\t").line_replace(
                {'taxon_blast': sample_name})

        # 2-2: species-specific hits based on the non-overlapped hits.
        # Label-based row selection; DataFrame.ix no longer exists in
        # pandas >= 1.0 and taxon_blast2 already uses .loc here.
        zb_df = binary_b.loc[nonoverlap_hits.index]
        collapse_zb, sim_tag = myDataframe.basic(zb_df).unispecie(
            self.par['sim_threshold'])
        # counts of hits
        sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
        # all peptide_id list
        pep_df[sample_name] = collapse_zb.apply(
            lambda x: myList.basic(x).names_string(0.001), axis=0)

    # export to file
    file_head = '{}_{}_'.format(
        myIO.file_os(zscore_file).file_prefix(), taxon_type)
    sum_df.to_csv(file_head + 'counting.txt', sep='\t', header=True,
                  index_label='Specie')
    pep_df.to_csv(file_head + 'peptides.txt', sep='\t', header=True,
                  index_label='Specie')
def count_reads(self):
    """Count reads per reference from the gzipped SAM file of one sample.

    Every line of ``<sample_sam_file>.gz`` is parsed with
    ``self.analyze_SAM``; the returned mapping is read through its
    ``'qname'``, ``'ref'``, ``'aligned'`` and ``'seq'`` entries:

      * ``aligned == '1'`` (unique alignment): the reference counter is
        incremented and the read sequence is stored for that reference.
        Whenever a counter reaches 1, 5 or 10 a saturation snapshot keyed
        by the current raw-read number is recorded.
      * ``aligned == '3'`` (multiple alignment): the reference is appended
        to the list kept for the query name.
      * anything else: the read is written as a FASTA record to
        ``<sample_dir><sample_name>_unknown.fa.gz``.

    Afterwards the read statistics are added to ``sample_log``, unique and
    multiple counts are merged by ``self.multiple_counts``, and the read
    counts, saturation table and per-reference sequence frequencies are
    written through ``myDict.basic(...)``.
    """
    # key is ref name, value is the list of uniquely aligned read sequences
    unique_seq = {ref_name: [] for ref_name in self.par['ref_dict']}
    unique = {}    # key is ref name, value is counts
    multiple = {}  # key is query name, value is the list of refs
    # Counting statistics.  Pre-seeding the raw counter keeps the key
    # order of a non-empty run and avoids a KeyError on an empty SAM file.
    num = {'raw_reads_num': 0}
    # running snapshots for the saturation analysis, keyed by raw-read number
    saturation = {0: {1: 0, 5: 0, 10: 0, 'max': 0}}
    last_index = 0
    max_rc = 0

    print('\tread sam file: {}.gz'.format(self.par['sample_sam_file']))
    sam_path = self.par['sample_sam_file'] + '.gz'
    unknown_path = (self.par['sample_dir'] + self.par['sample_name']
                    + '_unknown.fa.gz')
    # ``with`` closes both gzip handles even when parsing fails midway.
    with gzip.open(sam_path, 'rt') as sam_in, \
            gzip.open(unknown_path, 'wt') as unknown_out:
        for line in sam_in:
            num['raw_reads_num'] += 1
            # analyze sam line
            info = self.analyze_SAM(line)
            qname, ref = info['qname'], info['ref']

            if info['aligned'] == '1':
                # unique alignment
                unique[ref] = unique.get(ref, 0) + 1
                if unique[ref] > max_rc:
                    max_rc = unique[ref]
                # counting of saturation
                if unique[ref] in (1, 5, 10):
                    # copy() is essential: each snapshot must be independent
                    last_counts = saturation[last_index].copy()
                    last_counts[unique[ref]] += 1
                    # the maximum RC at this number of raw reads
                    last_counts['max'] = max_rc
                    saturation[num['raw_reads_num']] = last_counts
                    last_index = num['raw_reads_num']
                # keep the aligned sequence of the read
                unique_seq[ref].append(info['seq'])
                num['unique_aligned_reads_num'] = num.get(
                    'unique_aligned_reads_num', 0) + 1
            elif info['aligned'] == '3':
                # multiple alignment; append in place instead of
                # rebuilding the list for every read
                multiple.setdefault(qname, []).append(ref)
                num['multialigned_reads_num'] = num.get(
                    'multialigned_reads_num', 0) + 1
            else:
                # unaligned
                unknown_out.write('>' + qname + '\n' + info['seq'] + '\n')
                num['unaligned_reads_num'] = num.get(
                    'unaligned_reads_num', 0) + 1

    # close the saturation table at the total number of raw reads
    if num['raw_reads_num'] > last_index:
        saturation[num['raw_reads_num']] = saturation[last_index].copy()

    # update num statistics
    myIO.file_os(self.par['sample_log'], '=').line_add(num)

    print('\tcombine RCs from unique and multiple alignments of ',
          self.par['sample_name'])
    # reversed multiple
    rev_multiple = myDict.basic(multiple).counting_reversed_dict()
    RC_dict = self.multiple_counts(unique, rev_multiple)

    # export
    print('\tSave read counts into ', self.par['sample_RC_file'])
    myDict.basic(RC_dict).dict2_to_file(self.par['sample_RC_file'],
                                        pattern='\t')
    myDict.basic(saturation).dict2_to_file(
        self.par['sample_saturation_file'], pattern='\t')

    # per-reference frequencies of the uniquely aligned read sequences
    seq_counts = {}
    for ref, reads_list in unique_seq.items():
        key = (ref + '\t' + self.par['ref_dict'][ref] + '\t'
               + str(len(reads_list)))
        if reads_list:
            freq_dict = myList.basic(reads_list).elements_frequency0()
            seq_counts[key] = ';'.join(
                str(a) + ':' + str(b) for a, b in freq_dict.items())
        else:
            seq_counts[key] = 'NA'
    myDict.basic(seq_counts).dict_to_file(
        self.par['sample_dir'] + 'unique_aligned_reads.txt', pattern='\t')
def __init__(self, dictionary=None, dict_keys=None):
    """Keep ``dictionary`` and, when it is a ``dict``, a list of its keys.

    Args:
        dictionary: stored as ``self.dict`` whatever its type.
        dict_keys: explicit key list; only consulted when ``dictionary``
            is a ``dict``.  When omitted the keys are taken from
            ``myList.basic(dictionary.keys()).sort_list()``.

    ``self.dict_keys`` is left undefined when ``dictionary`` is not a
    ``dict``.  ``self.out_dict`` always starts as an empty dict.
    """
    self.dict = dictionary
    if isinstance(dictionary, dict):
        if dict_keys is not None:
            self.dict_keys = dict_keys
        else:
            self.dict_keys = myList.basic(dictionary.keys()).sort_list()
    self.out_dict = {}
def __init__(self, dictionary=None):
    """Keep ``dictionary`` and, when it is a ``dict``, its sorted keys.

    Args:
        dictionary: stored as ``self.dict`` whatever its type.  When it is
            a ``dict``, ``self.sorted_keys`` is set to
            ``myList.basic(dictionary.keys()).sort_list()``; otherwise
            that attribute is left undefined.

    ``self.out_dict`` always starts as an empty dict.
    """
    self.dict = dictionary
    is_mapping = isinstance(dictionary, dict)
    if is_mapping:
        key_helper = myList.basic(dictionary.keys())
        self.sorted_keys = key_helper.sort_list()
    self.out_dict = {}
def QC_saturation(self):
    """Plot saturation and read-count dynamics curves for every sample.

    For each name in ``self.par['sample_names']`` the tab-separated file
    ``<dir_result><sample>/QC_saturation.txt`` is read.  Its rows are
    thinned to the index values returned by
    ``myList.basic(...).interval_list()`` and the ``row_name`` column is
    divided by 1e6.  Two per-sample figures are drawn with
    ``myPlot.plot(...).lineP()``:

      * ``QC_saturation_analysis.png`` from columns ``row_name``, ``1``,
        ``5`` and ``10``;
      * ``QC_read_counts_dynamics.png`` from columns ``row_name`` and
        ``max``.

    The per-sample tables are then combined across samples and drawn
    into ``self.par['dir_QC']`` as ``saturation_cuttoff_<cutoff>.png``
    (one per cutoff) and ``saturation_dynamics.png``.
    """
    print("###saturation analysis\n")
    cutoffs = ['1', '5', '10']
    combined_df = {}
    combined_dynamics = {}

    def thin_and_scale(frame):
        # Keep only the rows picked by interval_list() and express the
        # raw-read column in millions.  .loc/.copy() replace DataFrame.ix
        # (gone in pandas >= 1.0) and avoid writing into a slice of df;
        # 'row_name' is always the first column of the frames passed in.
        keep = myList.basic(list(frame.index)).interval_list()
        subset = frame.loc[keep].copy()
        subset['row_name'] = subset['row_name'] / 1e6
        return subset

    # plot saturation curves per sample
    for sample_name in self.par['sample_names']:
        file_head = '{}{}/'.format(self.par['dir_result'], sample_name)
        # read saturation file
        df = pd.read_table(file_head + 'QC_saturation.txt', sep="\t",
                           index_col=False)

        # saturation curves
        sample_df = thin_and_scale(df[['row_name', '1', '5', '10']])
        plot_par = {
            'df': sample_df,
            'legend': 'upper left',
            'title': 'Saturation analysis (Sequencing depth)',
            'picfile': file_head + 'QC_saturation_analysis.png',
            'xlabel': 'Number of raw reads (million)',
            'ylabel': 'Number of references'
        }
        myPlot.plot(plot_par).lineP()

        # combine data frames across samples, aligned on row position
        sample_df.index = range(sample_df.shape[0])
        for cutoff in cutoffs:
            sub_df = sample_df[['row_name', cutoff]].copy()
            sub_df.columns = ['raw_reads:' + sample_name, sample_name]
            if cutoff in combined_df:
                combined_df[cutoff] = pd.merge(combined_df[cutoff], sub_df,
                                               left_index=True,
                                               right_index=True,
                                               how='outer')
            else:
                combined_df[cutoff] = sub_df.copy()

        # dynamics analysis
        sample_df = thin_and_scale(df[['row_name', 'max']])
        sample_df.reset_index(drop=True, inplace=True)
        combined_dynamics[sample_name] = sample_df
        plot_par = {
            'df': sample_df,
            'legend': 'upper left',
            'title': 'Saturation analysis:dynamics of read conts',
            'picfile': file_head + 'QC_read_counts_dynamics.png',
            'xlabel': 'Number of raw reads (million)',
            'ylabel': 'Maximum read counts'
        }
        myPlot.plot(plot_par).lineP()

    # export saturation curves, one figure per cutoff
    for cutoff in cutoffs:
        plot_par = {
            'df': combined_df[cutoff],
            'legend': None,
            'title': 'samples={}, RC-cutoff={}'.format(
                len(self.par['sample_names']), cutoff),
            'picfile': '{}saturation_cuttoff_{}.png'.format(
                self.par['dir_QC'], cutoff),
            'xlabel': 'Number of raw reads (million)',
            'ylabel': 'Number of references'
        }
        myPlot.plot(plot_par).lineP(x_value=1)

    # export dynamics curves; flatten (sample, column) pairs to 'a:b'
    combined_dynamics = pd.concat(combined_dynamics, axis=1)
    combined_dynamics.columns = [
        ':'.join(x) for x in list(combined_dynamics)
    ]
    plot_par = {
        'df': combined_dynamics,
        'legend': None,
        'title': 'Sequencing depth,sample={}'.format(
            len(self.par['sample_names'])),
        'picfile': '{}saturation_dynamics.png'.format(self.par['dir_QC']),
        'xlabel': 'Number of raw reads (million)',
        'ylabel': 'Maximum read counts'
    }
    myPlot.plot(plot_par).lineP(x_value=1)