def QC_statistics(self):
    print("###Quality control: statistics summary")
    stat_dict = collections.defaultdict(dict)
    for sample_name in self.par['sample_names']:
        sample_log = '{}{}/{}.log'.format(self.par['dir_result'], sample_name, sample_name)
        stat_dict[sample_name] = myIO.file_os(sample_log, '=').to_dict()
    # convert to a data frame: one row per sample
    stat_df = pd.DataFrame(stat_dict)
    stat_df = stat_df.transpose()

    # 1: scatter plot of raw reads vs uniquely aligned reads (in millions)
    sub_df = stat_df[['raw_reads_num', 'unique_aligned_reads_num']].astype(float) / 1e6
    plot_par = {'df': sub_df,
                'title': 'raw_reads_vs_aligned_reads',
                'picfile': self.par['dir_QC'] + 'raw_reads_vs_aligned_reads.png',
                'pch': 'o',
                'text': 'million reads'}
    myPlot.plot(plot_par).dotP()

    # 2: scatter plot of raw reads vs percentage of uniquely aligned reads
    stat_df['unique_aligned_percentage'] = sub_df['unique_aligned_reads_num'] * 100 / sub_df['raw_reads_num']
    plot_par['df'] = stat_df[['raw_reads_num', 'unique_aligned_percentage']].astype(float)
    plot_par['title'] = 'percentage_aligned_reads'
    plot_par['picfile'] = self.par['dir_QC'] + 'percentage_aligned_reads.png'
    myPlot.plot(plot_par).dotP()

    # 3: export the summary to a csv file
    print('\tSave statistical summary into {}.'.format(self.par['file_stat']))
    stat_df.to_csv(self.par['file_stat'], index_label='sample_names')
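
# A minimal sketch (an assumption, not the shipped implementation) of what
# myIO.file_os(sample_log, '=').to_dict() is expected to produce above: the
# per-sample log is read as 'key=value' lines and returned as a plain dict,
# so that keys such as raw_reads_num become columns of stat_df.
def _read_log_as_dict(path, sep='='):
    """Parse a key<sep>value log file into a dict of strings (hypothetical helper)."""
    stats = {}
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if sep in line:
                key, value = line.split(sep, 1)
                stats[key.strip()] = value.strip()
    return stats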
def QC_hits(self, infile, threshold=None):
    print('###Relationship between significant hits and raw read num of ', infile)
    file_prefix = '{}{}_'.format(self.par['dir_QC'], myIO.file_os(infile).name_prefix())
    if threshold is None:
        threshold = float(self.par['zscore_threshold'])

    # read the statistics file
    stat_df = pd.read_csv(self.par['file_stat'], sep=',', index_col=0, low_memory=False)
    stat_df.index = stat_df['sample_name']  # assign row names
    stat_df = stat_df.loc[self.par['sample_names']]  # order rows by sample_names
    raw_reads = stat_df['raw_reads_num'] / 1e6

    # read the values file: row names and column names included
    in_df = pd.read_csv(infile, sep='\t', index_col=0, low_memory=False)
    order_df = in_df[self.par['sample_names']].copy()  # order columns

    # plot: raw reads vs number of significant hits per sample
    def count_hits(x, y=threshold):
        return len(x[x >= y])
    hits_num = order_df.apply(count_hits, axis=0)
    comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_num})
    comp_df.to_csv(file_prefix + 'raw_vs_sighits.csv', sep=',')
    plot_par = {'df': comp_df, 'legend': None,
                'title': 'Effects of sequencing depth on significant hits',
                'picfile': file_prefix + 'raw_vs_sighits.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Number of significant hits'}
    myPlot.plot(plot_par).dotP()

    # plot: raw reads vs mean value of significant hits per sample
    def mean_hits(x, y=threshold):
        sig = pd.Series(x)
        sig = sig[sig >= y]
        return np.mean(sig)
    hits_mean = order_df.apply(mean_hits, axis=0)
    comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_mean})
    outfile = file_prefix + 'raw_vs_mean_significant_hits.csv'
    print('\texport QC to {}.'.format(outfile))
    comp_df.to_csv(outfile, sep=',')
    plot_par = {'df': comp_df, 'legend': None,
                'title': 'Effects of sequencing depth on significant hits',
                'picfile': file_prefix + 'raw_vs_mean_significant_hits.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Mean values of significant hits'}
    myPlot.plot(plot_par).dotP()
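
# Toy illustration (not project code) of the column-wise thresholding used in
# QC_hits: DataFrame.apply with axis=0 walks over columns, so each sample
# column is reduced to a single count / mean of its values at or above the
# threshold.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'s1': [0.5, 3.2, 7.1], 's2': [4.0, 4.5, 0.1]})
demo_threshold = 3.0
print(toy.apply(lambda x: len(x[x >= demo_threshold]), axis=0))      # s1: 2, s2: 2
print(toy.apply(lambda x: np.mean(x[x >= demo_threshold]), axis=0))  # s1: 5.15, s2: 4.25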
def main():
    prepareInput.createInput(logName)
    scores = []

    # ----------start Trace2Vec
    Trace2Vec.learn(logName, vectorsize)
    y = Trace2Vec.getY(logName)
    vectors, corpus = Trace2Vec.startCluster(logName, vectorsize)
    printMatrix(vectors, "Trace2Vec", "vectors")
    for alg in clustering:
        assigned_clusters = cluster(alg, vectors, y)
        printVector(assigned_clusters, "Trace2Vec", "clusters", alg)
        Trace2Vec.endCluster(logName, assigned_clusters, vectorsize, alg, corpus)
    # ----------end Trace2Vec

    # ----------start Node2Vec
    args = Node2Vec.parse_args()
    args.input = "input/" + logName + ".graph"
    args.output = "output/" + logName + "N2VVS" + str(vectorsize) + ".node2vec"
    nx_G = Node2Vec.read_graph(args)
    G = node2vec.Graph(nx_G, True, args.p, args.q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(args.num_walks, args.walk_length)
    Node2Vec.learn_embeddings(args, logName, vectorsize, walks)
    Node2Vec.extract(logName, vectorsize)
    y = Node2Vec.getY(logName)
    vectors, corpus = Node2Vec.startCluster(logName, vectorsize)
    printMatrix(vectors, "Node2Vec", "vectors")
    for alg in clustering:
        assigned_clusters = cluster(alg, vectors, y)
        printVector(assigned_clusters, "Node2Vec", "clusters", alg)
        Node2Vec.endCluster(logName, assigned_clusters, vectorsize, alg, corpus)
    # ----------end Node2Vec

    # ----------start NGrams
    vectors, y = NGrams.ngrams_BPI_2015(logName, vectorsize)
    printMatrix(vectors, "NGrams", "vectors")
    for alg in clustering:
        assigned_clusters = cluster(alg, vectors, y)
        printVector(assigned_clusters, "NGrams", "clusters", alg)
        NGrams.endCluster(logName, assigned_clusters, vectorsize, alg, [0] * len(vectors))
    # ----------end NGrams

    # collect and report clustering scores for each embedding
    scores.append(get_scores("Trace2Vec"))
    scores.append(get_scores("Node2Vec"))
    scores.append(get_scores("NGrams"))
    for score in scores:
        print_scores(score)

    # 2-D embeddings can be plotted directly
    if vectorsize == 2:
        for emb in embed:
            myPlot.plot(emb)
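
# Hypothetical sketch of the cluster() helper invoked by main() above; the
# real project code may differ. The assumption: given an algorithm name and
# the embedding vectors, fit a clusterer with as many clusters as there are
# distinct labels in y, and return one assigned cluster id per trace.
from sklearn.cluster import KMeans
import numpy as np

def cluster_sketch(alg, vectors, y):
    n_clusters = len(set(y))
    if alg == 'kmeans':
        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
        return model.fit_predict(np.asarray(vectors))
    raise ValueError('unknown algorithm: {}'.format(alg))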
def NC_whole_std(self):
    print('\tPolynomial regression of std~median across ALL BEADS-ONLY.')
    file_prefix = '{}{}_'.format(self.par['dir_result'],
                                 myIO.file_os(self.par['file_NC']).name_prefix())
    norm_ncfile = file_prefix + 'scalingRC.txt'
    if os.path.isfile(norm_ncfile):
        phip_nc = pd.read_csv(norm_ncfile, sep='\t', index_col=0, low_memory=False)
    else:
        phip_nc = normalization(self.par, self.par['file_NC'], norm_ncfile).RC_scaling()

    # summary of the negative controls: mean, median, std, and sum per row
    NC = pd.DataFrame({'mean': phip_nc.mean(axis=1),
                       'median': phip_nc.median(axis=1),
                       'std': phip_nc.std(axis=1),
                       'sum': phip_nc.sum(axis=1)})
    # zeros would break the log transform, so mask them
    NC.loc[NC['median'] == 0, 'median'] = np.nan
    NC.loc[NC['std'] == 0, 'std'] = np.nan
    NC['logmedian'] = np.log10(NC['median'])
    NC['logstd'] = np.log10(NC['std'])

    # initiate reg_df for regression, dropping outliers
    reg_df = NC.loc[NC['median'] > 0, :].copy()
    # order by x for the polynomial regression plot
    reg_df = reg_df.sort_values(['logmedian'], ascending=True)

    # cubic polynomial regression of log(std) on log(median)
    formula = 'logstd~logmedian+I(logmedian**2)+I(logmedian**3)'
    pn_model = smf.ols(formula, data=reg_df)
    pn_fit = pn_model.fit()
    reg_df['pred_logstd'] = pn_fit.predict()
    reg_df['pred_std'] = 10**pn_fit.predict()
    NC['pred_logstd'] = pn_fit.predict({'logmedian': NC['logmedian']})
    NC['pred_std'] = 10**NC['pred_logstd']

    # export the fitted std
    NC.to_csv(file_prefix + 'polynomial_std.csv', header=True, index_label='row_names')

    # draw the graph
    xm = round(np.nanmax(list(NC['logmedian'])))
    ym = round(np.nanmax(list(NC['logstd'])))
    plot_par = {'df': NC[['logmedian', 'logstd']],
                'xlim': (-.5, xm), 'ylim': (-.5, ym),
                'picfile': file_prefix + 'polynomial_std.png',
                'text': pn_fit.params}
    try:
        myPlot.plot(plot_par).regressionP(reg_df['logmedian'], reg_df['pred_logstd'])
    except ValueError:
        print('Failed to draw pic and save into {}'.format(plot_par['picfile']))

    # return the summary table and the fitted model
    return NC, pn_fit
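
# Stand-alone illustration (synthetic data, not the pipeline's negative
# controls) of the cubic fit used in NC_whole_std: statsmodels' formula API
# regresses log10(std) on a third-degree polynomial of log10(median), and
# predict() evaluates the fitted curve on new x values.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
demo = pd.DataFrame({'logmedian': np.linspace(0.1, 3, 200)})
demo['logstd'] = 0.5 + 0.8 * demo['logmedian'] - 0.1 * demo['logmedian']**2 \
                 + rng.normal(scale=0.05, size=len(demo))
demo_fit = smf.ols('logstd~logmedian+I(logmedian**2)+I(logmedian**3)', data=demo).fit()
print(demo_fit.params)
pred = demo_fit.predict(pd.DataFrame({'logmedian': [0.5, 1.5, 2.5]}))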
def export_df(self, outfile, threshold=10, index_label='row_names'):
    print('\texport data frame to ', outfile)
    outsep = ',' if outfile.endswith('.csv') else '\t'
    self.df.to_csv(outfile, sep=outsep, index_label=index_label)

    # draw a horizontal bar plot: number of hits (values >= threshold) per column
    counts = self.df.apply(lambda x, y=threshold: len(x[x >= y]), axis=0)
    plot_par = {'list': counts,
                'ylabel': 'Sample_names',
                'xlabel': 'Number of hits',
                'picfile': myIO.file_os(outfile).file_prefix() + '.png',
                'title': 'Number of hits, threshold=' + str(threshold)}
    myPlot.plot(plot_par).simple_barh()
def export_regress(self, file_prefix):
    # export the fitted std; ylim spans the y variable, not x
    plot_par = {'df': self.data[[self.x_name, self.y_name]],
                'xlim': (-.5, np.nanmax(self.data[self.x_name])),
                'ylim': (-.5, np.nanmax(self.data[self.y_name])),
                'text': self.lm['params']}
    if self.outdir is not None:
        file_prefix = '{}{}_{}'.format(file_prefix, self.x_name, self.y_name)
    self.data.to_csv(file_prefix + '.csv', header=True, index_label='row_names')
    plot_par['picfile'] = file_prefix + '.png'

    # draw the graph
    try:
        myPlot.plot(plot_par).regressionP(self.reg_df[self.x_name], self.reg_df[self.py_name])
    except ValueError:
        print('Failed to draw {}'.format(file_prefix))
def QC_saturation(self):
    print("###saturation analysis\n")
    combined_df = {}
    combined_dynamics = {}

    # plot the saturation curve per sample
    for sample_name in self.par['sample_names']:
        file_head = '{}{}/'.format(self.par['dir_result'], sample_name)
        # read the per-sample saturation file
        df = pd.read_csv(file_head + 'QC_saturation.txt', sep='\t', index_col=False)

        # saturation curves at read-count cutoffs of 1, 5, and 10
        saturation_df = df[['row_name', '1', '5', '10']]
        # thin the rows so the curves stay readable
        shrinked_index = myList.basic(list(saturation_df.index)).interval_list()
        sample_df = saturation_df.loc[shrinked_index].copy()  # select rows
        sample_df.iloc[:, 0] = sample_df.iloc[:, 0] / 1e6
        # scatter plot
        plot_par = {'df': sample_df,
                    'legend': 'upper left',
                    'title': 'Saturation analysis (sequencing depth)',
                    'picfile': file_head + 'QC_saturation_analysis.png',
                    'xlabel': 'Number of raw reads (million)',
                    'ylabel': 'Number of references'}
        myPlot.plot(plot_par).lineP()

        # combine the data frames across samples
        sample_df.index = range(sample_df.shape[0])
        for cutoff in ['1', '5', '10']:
            sub_df = sample_df[['row_name', cutoff]].copy()
            sub_df.columns = ['raw_reads:' + sample_name, sample_name]
            if cutoff in combined_df:
                combined_df[cutoff] = pd.merge(combined_df[cutoff], sub_df,
                                               left_index=True, right_index=True,
                                               how='outer')
            else:
                combined_df[cutoff] = sub_df.copy()

        # dynamics analysis: maximum read counts vs sequencing depth
        dynamics_df = df[['row_name', 'max']]
        shrinked_index = myList.basic(list(dynamics_df.index)).interval_list()
        sample_df = dynamics_df.loc[shrinked_index].copy()  # select rows
        sample_df.iloc[:, 0] = sample_df.iloc[:, 0] / 1e6  # divided by millions
        sample_df.reset_index(drop=True, inplace=True)
        combined_dynamics[sample_name] = sample_df
        # plot
        plot_par = {'df': sample_df,
                    'legend': 'upper left',
                    'title': 'Saturation analysis: dynamics of read counts',
                    'picfile': file_head + 'QC_read_counts_dynamics.png',
                    'xlabel': 'Number of raw reads (million)',
                    'ylabel': 'Maximum read counts'}
        myPlot.plot(plot_par).lineP()

    # export the combined saturation curves
    for cutoff in ['1', '5', '10']:
        plot_par = {'df': combined_df[cutoff],
                    'legend': None,
                    'title': 'samples={}, RC-cutoff={}'.format(len(self.par['sample_names']), cutoff),
                    'picfile': '{}saturation_cutoff_{}.png'.format(self.par['dir_QC'], cutoff),
                    'xlabel': 'Number of raw reads (million)',
                    'ylabel': 'Number of references'}
        myPlot.plot(plot_par).lineP(x_value=1)

    # export the combined dynamics curves
    combined_dynamics = pd.concat(combined_dynamics, axis=1)
    combined_dynamics.columns = [':'.join(x) for x in list(combined_dynamics)]
    plot_par = {'df': combined_dynamics,
                'legend': None,
                'title': 'Sequencing depth, samples={}'.format(len(self.par['sample_names'])),
                'picfile': '{}saturation_dynamics.png'.format(self.par['dir_QC']),
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Maximum read counts'}
    myPlot.plot(plot_par).lineP(x_value=1)
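
# A guess at what myList.basic(indexes).interval_list() does in QC_saturation
# (the helper itself is not shown here): keep only evenly spaced row indexes so
# that long saturation curves are plotted with a manageable number of points.
# A minimal sketch under that assumption:
def interval_list_sketch(indexes, max_points=100):
    """Thin a list of row indexes to at most max_points evenly spaced entries."""
    step = max(1, len(indexes) // max_points)
    return indexes[::step]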