def qcrun(opt):
    """Run the expression QC pipeline and write the HTML summary report.

    Parses the sample sheet and the expression matrix, draws the QC figures
    through ``statplot``, writes the sample correlation matrix to
    ``corr_matrix.xls`` and finally writes ``exprs_samples_qc.html``; both
    files are created in the current working directory.

    Parameters:
        opt: options object; the attributes read here are ``sample_info``,
            ``matrixdata``, ``log2tr`` (must be 0 or 1), ``addbg``, ``noise``
            and ``normalize``.

    Returns:
        0 on success, 1 if the sample sheet, the matrix, or the optional
        normalisation step reports a non-zero status.
    """
    # init
    fsample = opt.sample_info
    fmatrix = opt.matrixdata
    sampleinfo = mutilstats.SampleInfo()
    ret = sampleinfo.parse_sampleinfo(fsample)
    if ret != 0:
        sys.stderr.write("[ERROR] Sample information failed to parse, please check the sampleinfo file\n")
        return 1

    # process
    data = mutilstats.MatrixAnno()
    if opt.log2tr == 1:
        ret = data.parse_matrix_anno(fmatrix, addtolog=opt.addbg, log2tr=1, cutoff=opt.noise)
    else:
        assert opt.log2tr == 0
        ret = data.parse_matrix_anno(fmatrix, cutoff=opt.noise)
    # Check the parse status *before* normalising: previously the status was
    # overwritten by normalize(), which could hide a failed parse.
    if ret != 0:
        sys.stderr.write("[ERROR] Data parse failed, please check the matrix file\n")
        return 1
    if opt.normalize:
        # NOTE(review): assumes mutilstats.normalize returns 0 on success,
        # as the original status check implied -- confirm.
        ret = mutilstats.normalize(data.data)
        if ret != 0:
            sys.stderr.write("[ERROR] Data normalization failed, please check the matrix file\n")
            return 1

    samplenames = sampleinfo.samplenames
    classcolors = sampleinfo.classcolors

    # 1 CDF
    statplot.exprs_density(data.data, classcolors, sampleinfo.classlabels, "exprs_CDF", "expression level", "cumulative distribution", "cdf")
    # 2 PDF
    statplot.exprs_density(data.data, classcolors, sampleinfo.classlabels, "exprs_pdf", "expression level", "probability density distribution", "pdf")
    # 3 boxplot
    statplot.plot_boxplot(data.data, "exprs_boxplot", "", "expression level", samplenames, colors=classcolors, ylim=0)
    # 4 RLE
    statplot.exprs_RLE(data.data, "mean", "RLE_plot", samplenames, colors=classcolors)
    # 5 corr
    corr_matrix = statplot.exprs_corrarray(data.data, samplenames, "corrarray")
    # Dump the correlation matrix; it is re-read below to build the HTML table.
    with open("corr_matrix.xls", "w") as foutcorr:
        foutcorr.write("## correlation coefficient matrix for samples' expression level\n")
        foutcorr.write("\t".join(["#correlation"] + samplenames) + "\n")
        for i, samplename in enumerate(samplenames):
            cells = [fmtout(value) for value in corr_matrix[i, :].tolist()]
            foutcorr.write("\t".join([samplename] + cells) + "\n")
    # 6 cluster
    statplot.hcluster(data.data, samplenames, "hcluster")

    # 7 MDS and HTML report
    html_main = mhtml.simple_main(title="样本表达质量控制结果", css="../CSS")
    html_main.add_head("样本表达质量控制结果")
    html_main.add_enter()

    html_main.add_head("1. 样本信息列表", 2)
    html_main.add_line()
    html_main.add_enter()
    tmptable, tmpnote = mhtml.xls2table("%s" % fsample)
    html_main.add_content(tmptable)
    html_main.add_precontent(tmpnote)
    html_main.add_enter()

    html_main.add_head("2. 样本表达质量控制结果,数据可靠性分析", 2)
    html_main.add_line()
    html_main.add_enter()

    html_main.add_head("a. 样本表达水平概率密度分布", 3)
    html_main.add_enter()
    # Phrase inserted into the figure descriptions when data were log2-transformed.
    if opt.log2tr == 1:
        strlog = "采用log<sub>2</sub>变换,并"
    else:
        strlog = ""
    html_main.add_content("""对各样本表达水平,%s计算概率密度。查看各样本及各组间表达水平的分布情况。概率密度估计采用Kernel density estimation, implementation in python with scipy(http://www.scipy.org/)""" % strlog)
    html_main.add_content("""<img src="./exprs_pdf.png" width="50%" /><a href="./exprs_pdf.svg">SVG矢量图版本</a>""")
    html_main.add_enter()

    html_main.add_head("b. 样本表达水平累积概率密度分布", 3)
    html_main.add_enter()
    html_main.add_content("""对各样本表达水平,%s计算累积概率密度。查看各样本及各组间表达水平的分布情况。""" % strlog)
    html_main.add_content("""<img src="./exprs_CDF.png" width="50%" /><a href="./exprs_CDF.svg">SVG矢量图版本</a>""")
    html_main.add_enter()

    html_main.add_head("c. 样本表达水平箱式图", 3)
    html_main.add_content("""对各样本表达水平,%s绘制箱式图。查看各样本及各组间表达水平的分布情况。""" % strlog)
    html_main.add_content("""<img src="./exprs_boxplot.png" width="50%" /><a href="./exprs_boxplot.svg">SVG矢量图版本</a>""")
    html_main.add_enter()

    html_main.add_head("d. 样本间表达相关性分析", 3)
    html_main.add_enter()
    html_main.add_content("""计算样本两两间的fisher 相关系数, 将相关系数矩阵按实验分组形式,绘制成热图。样本处理组间,大部分表达具有相关性,主要是因为维持生命基本活动的大部分基因均不差异表达,只有少部分为差异表达(当处理条件十分剧烈时,实验组和处理组间可能并不满足此假设,但组内样本应满足此假设)。因此,各样本间,表达水平相关性应较高。若图中存在特异性的样本,或实验条件不统一且未做校正时,该特殊样本与其他样本的表达相关性会非常低。""")
    html_main.add_content("""<img src="./corrarray.png" width="50%" /><a href="./corrarray.svg">SVG矢量图版本</a>""")
    tmptable, tmpnote = mhtml.xls2table("corr_matrix.xls")
    html_main.add_content(tmptable)
    html_main.add_enter()

    html_main.add_head("e. 样本相对表达水平比较", 3)
    html_main.add_enter()
    html_main.add_content("""在同一组实验中,即使是相互比较的对照组与实验组之间,大部分基因的表达量还是应该保持一致的。当使用相对对数表达水平(Relative Log Expression(RLE))的箱线图来控制不同组之间的实验质量时,箱线图应该在垂直中央相类的位置(通常接近0)。如果有一个样本的表现和其它的平行组都很不同,那说明它可能出现了质量问题。""")
    html_main.add_content("""<img src="./RLE_plot.png" height="450" width="550" /><a href="./RLE_plot.svg">SVG矢量图版本</a>""")
    html_main.add_enter()

    html_main.add_head("f. 样本聚类结果", 3)
    html_main.add_enter()
    html_main.add_content("""基于表达水平数据的样本聚类,计算样本间欧式距离,采用离差平方和法(wald法)进行层次聚类,验证聚类结果是否同实验设计基本一致。若聚类结果明显不一致,则样本间存在着明显的其他未知的因素,而不仅仅是实验处理效应。""")
    html_main.add_content("""<img src="./hcluster_hcluster.png" width="50%" /><a href="./hcluster_hcluster.svg">SVG矢量图版本</a>""")
    html_main.add_enter()

    html_main.add_head("g. 样本多维尺度分析", 3)
    html_main.add_content("多维尺度分析(Multi Dimensional Scaling, MDS)是一种将多维空间的研究对象简化到低维空间进行定位、分析和归类,同时又保留对象间原始关系的数据分析方法。此处我们采用样本间欧式距离反映样本间的差异,选择前3个本征值最大的维度,绘制样本在前三个维度上的分布,若实验处理因素为表达差异的主要因素,则一般而言,样本组内差异应小于组间差异。")
    sinfo = sampleinfo
    snnum = len(sinfo.sns)
    n_dims = 10
    mdsout = mds.mds_ps(data.data, n_dims)
    xlabel = "Number of dimensions"
    ylabel = "Variation percentage"
    # Prepend 0 so the cumulative-percentage curve starts at the origin.
    statplot.plotline(np.arange(0, len(mdsout.p) + 1), np.asarray([[0, ] + mdsout.p.tolist(), ]), "Variation_percentage", xlabel, ylabel, ['r^-'], xlimmax=n_dims + 1, ylimmax=102)
    # The score plot is only drawn when len(sinfo.sns) is 2 (2-D) or >= 3 (3-D).
    if snnum == 2:
        statplot.plot_Xscore(mdsout.v, sinfo.classnums, sinfo.uniqclassnum, sinfo.uniqcolor, sinfo.uniqmarker, sinfo.uniqclasslabel, "MDS_samples_distribution", "1st dimension", "2nd dimension", "3rd dimension", dim=2)
        html_main.add_content("前n个维度累积解释变异的百分比图(见下图)。其中,前两个维度,累积解释变异的百分比: %.2f%%, %.2f%%" % (mdsout.p[0], mdsout.p[1]))
    elif snnum >= 3:
        statplot.plot_Xscore(mdsout.v, sinfo.classnums, sinfo.uniqclassnum, sinfo.uniqcolor, sinfo.uniqmarker, sinfo.uniqclasslabel, "MDS_samples_distribution", "1st dimension", "2nd dimension", "3rd dimension", dim=3)
        html_main.add_content("前n个维度累积解释变异的百分比图(见下图)。其中,前三个维度,累积解释变异的百分比: %.2f%%, %.2f%%, %.2f%%" % (mdsout.p[0], mdsout.p[1], mdsout.p[2]))
    html_main.add_content("""<img src="./Variation_percentage.png" width="50%" /><a href="./Variation_percentage.svg">SVG矢量图版本</a>""")
    html_main.add_enter()
    html_main.add_content("""样本在前三个维度中的空间分布图""")
    html_main.add_content("""<img src="./MDS_samples_distribution.png" width="50%" /><a href="./MDS_samples_distribution.svg">SVG矢量图版本</a>""")
    html_main.add_enter()

    with open("exprs_samples_qc.html", "w") as fhtml:
        fhtml.write(str(html_main))
    return 0
def mds_ps(X_raw, nvs_output=10):
    """Multidimensional-scaling style decomposition of a data matrix.

    A copy of ``X_raw`` is passed through ``mutilstats.centring`` and
    ``mutilstats.normalize``, pairwise Euclidean distances between its rows
    are computed, the distance matrix is double-centred, and its eigenpairs
    are returned sorted by decreasing eigenvalue.

    Input:
        X_raw: 2-D numpy array of dtype float32 or float64; any other dtype
            prints an error and exits the process with status 1.
            Rows are presumably samples (the warning text below says so).
        nvs_output: number of leading dimensions to keep; silently reduced
            to the number of rows when larger.

    Output:
        an ``mdsoutput`` object with
        p : cumulative percentage of the eigenvalue sum (first nvs_output)
        w : the nvs_output largest eigenvalues
        v : matrix whose columns are the corresponding eigenvectors
    """
    X_SNPs = X_raw.copy()
    if X_SNPs.dtype not in (np.float64, np.float32):
        sys.stderr.write("""The format of X_SNPs matrix should be numpy.float32 or numpy.float64, please check it. If the memory is sufficient, we suggest you use the numpy.float64\n""")
        sys.exit(1)
    nx, px = X_SNPs.shape
    X_SNPs = np.asmatrix(X_SNPs)
    if nvs_output > nx:
        sys.stderr.write('too many nvs_output, it must be smaller than number of samples, we have changed auto\n')
    nvs_output = min(nx, nvs_output)
    # Both helpers are called for their effect on the copy, as before.
    mutilstats.centring(X_SNPs)
    mutilstats.normalize(X_SNPs)

    # Pairwise Euclidean distances, one column at a time to bound memory.
    dist = np.asmatrix(np.zeros((nx, nx)))
    for i in range(nx):
        diff = X_SNPs - X_SNPs[i, :]
        diff = np.power(diff, 2)
        dist[:, i] = np.power(np.sum(diff, axis=1), 0.5)

    # Double centring: -1/2 * J * D * J with J = I - (1/n) * ones.
    # NOTE(review): classical (Torgerson) MDS double-centres the *squared*
    # distances; this code uses the unsquared ones. Left unchanged to keep
    # existing results -- confirm the intent.
    centering = np.asmatrix(np.eye(nx)) - (1.0 / nx) * np.asmatrix(np.ones((nx, nx)))
    dist = -0.5 * (centering * dist * centering)
    del centering

    # The matrix is symmetric in exact arithmetic; remove rounding asymmetry
    # and use the symmetric solver so eigenvalues/vectors are always real
    # (the general solver can return spurious complex pairs).
    dist = (dist + dist.T) / 2.0
    w, v = np.linalg.eigh(dist)
    del dist

    order = np.argsort(w)[::-1]  # largest eigenvalue first
    w = w[order]
    v = v[:, order]
    precent = np.cumsum(w) / np.sum(w) * 100

    mds_output = mdsoutput()
    mds_output.p = precent[0:nvs_output]
    mds_output.w = w[0:nvs_output]
    mds_output.v = v[:, 0:nvs_output]
    return mds_output
# Substitution classes in report order; complementary-strand changes are pooled.
_SUBSTITUTION_LABELS = [
    "C->A/G->T", "C->T/G->A", "C->G/G->C",
    "T->A/A->T", "T->C/A->G", "T->G/A->C",
]

# (ref, alt) -> index into _SUBSTITUTION_LABELS.
_SUBSTITUTION_INDEX = {
    ("C", "A"): 0, ("G", "T"): 0,
    ("C", "T"): 1, ("G", "A"): 1,
    ("C", "G"): 2, ("G", "C"): 2,
    ("T", "A"): 3, ("A", "T"): 3,
    ("T", "C"): 4, ("A", "G"): 4,
    ("T", "G"): 5, ("A", "C"): 5,
}


def _count_substitutions(annofn):
    """Count SNPs per substitution class in one tab-separated annotation file.

    The reference allele is read from the 4th column and the alternate allele
    from the 5th. A first line starting with "Chr" or "#" is treated as a
    header, later "#" lines are skipped and a line starting with "Note:" ends
    the scan. Rows with "-" as an allele or alleles of different length are
    indels and are not counted.

    Returns (snp_count, class_counts) with class_counts ordered like
    _SUBSTITUTION_LABELS, or None when the file is empty.
    """
    import sys  # local import: this block did not previously depend on sys

    snp_count = 0
    class_counts = [0] * len(_SUBSTITUTION_LABELS)
    with open(annofn, "r") as handle:
        # next() with a default: an empty file must not raise StopIteration.
        first = next(handle, "")
        if not first:
            return None
        if not (first[0:3] == "Chr" or first[0] == "#"):
            # No header: the first line is data, so rewind and re-read it.
            handle.seek(0)
        for line in handle:
            if line.startswith("#"):
                continue
            if line.startswith("Note:"):
                break
            if not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) < 5:
                # Previously such rows were counted with the alleles of the
                # preceding row (or raised NameError on the first row).
                sys.stderr.write("[WARN] skipped malformed line in %s: %r\n" % (annofn, line))
                continue
            ref = fields[3]
            alt = fields[4]
            if ref == alt:
                # Reported for diagnosis; still counted as a SNP, as before.
                sys.stderr.write("[WARN] ref equals alt in %s: %r\n" % (annofn, line))
            if ref == "-" or alt == "-" or len(ref) != len(alt):
                continue  # indel
            snp_count += 1
            class_index = _SUBSTITUTION_INDEX.get((ref, alt))
            if class_index is not None:
                class_counts[class_index] += 1
    return snp_count, class_counts


def MutSubPattern(annofns, outdir="./", target_region=None):
    """Summarise and plot the SNP substitution spectrum of several samples.

    For every file in ``annofns`` the SNPs are counted per substitution class
    (see ``_count_substitutions``); empty files are skipped. The counts are
    written to ``<outdir>/Mutation_pattern.xls`` and passed to ``bar_dict``,
    ``stackv_bar_plot`` and, when more than one sample is present,
    ``statplot.cluster_heatmap``.

    Parameters:
        annofns: sequence of annotation file paths. The sample name is the
            file's base name up to the first "." and up to "_vs_".
        outdir: directory receiving Mutation_pattern.xls.
        target_region: currently unused; kept for interface compatibility.

    Returns:
        0
    """
    labels = list(_SUBSTITUTION_LABELS)
    sample_arr = []
    snp_c_arr = []
    class_rows = [[] for _ in labels]  # one row of per-sample counts per class

    for annofn in annofns:
        counted = _count_substitutions(annofn)
        if counted is None:
            continue
        snp_count, class_counts = counted
        sample_arr.append(annofn.split(os.sep)[-1].split(".")[0].split("_vs_")[0])
        snp_c_arr.append(snp_count)
        for row, count in zip(class_rows, class_counts):
            row.append(count)

    # InDel counts are intentionally not part of the report.
    with open(os.path.join(outdir, "Mutation_pattern.xls"), "w") as mut_stat_xls:
        mut_stat_xls.write("#Variant\t" + "\t".join(sample_arr) + "\n")
        mut_stat_xls.write("SNP_count\t" + "\t".join(map(str, snp_c_arr)) + "\n")
        for label, row in zip(labels, class_rows):
            mut_stat_xls.write(label + "\t" + "\t".join(map(str, row)) + "\n")

    # Totals over all samples, one bar per substitution class.
    h = {}
    for label, row in zip(labels, class_rows):
        h[label] = sum(row)
    bar_dict(h, "total_snp_substitution", "Substitution", "Counts", fmt="%d")

    # classes x samples
    plot_data = np.asmatrix(np.float64(np.asarray(class_rows)))
    stackv_bar_plot(plot_data, sample_arr, "SNP_substitution_pattern", "", "Percentage", width=0.5, legends=labels, scale=1, orientation="horizontal", rotation=0)

    # Per-sample fractions (samples x classes) for the clustered heatmap.
    counts = np.asarray(plot_data)
    sample_totals = np.sum(counts, axis=0)
    # A sample without SNPs would give 0/0 = NaN; use 1 so its fractions are 0.
    sample_totals[sample_totals == 0] = 1.0
    plot_2new = (counts / sample_totals).T
    centring(plot_2new)
    normalize(plot_2new)
    if len(sample_arr) > 1:
        statplot.cluster_heatmap(np.asmatrix(plot_2new), sample_arr, labels, fig_prefix="SNP_substitution_Mutation_Spectrum", colornorm=1, nosample=False, nogene=True, plotxlabel=1, plotylabel=1, cbarlabel="Normalized Frequency", trees=3)
    return 0