def tu_Gussian(self, dataname="None", X=None, TrainData=None, choice=12):
    # Test data set
    if X is None:
        X, y = make_blobs(n_samples=100, n_features=3,
                          centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                          cluster_std=[0.2, 0.1, 0.2, 0.2], random_state=9)
    n = X.shape[1]
    m = X.shape[0]
    if TrainData is None:
        # Default validation labels: 98 normal samples (0) followed by 2 anomalies (1)
        col = np.random.randint(0, 1, (98, 1))
        col2 = np.random.randint(1, 2, (2, 1))
        TrainData = np.column_stack((X, np.row_stack((col, col2))))
    # Gaussian model
    mu1, sigma1 = self.estimateGaussion(X)
    if choice == 1:
        # Independent per-feature Gaussians
        px_one = gaussian(sigma1, X, mu1)
        if n <= 10:
            scatter = Scatter("feature")
            for j in range(0, n):
                scatter.add(str(j), X[:, j], px_one[:, j])
    else:
        # Multivariate Gaussian
        px_one = self.multivariateGaussian(X, mu1, sigma1)
        scatter = Scatter("feature")

        def f(x):
            # product of the feature values of one sample
            y = 1
            for i in range(n):
                y = y * x[i]
            return y

        scatter.add("总体分布", list(map(f, X)), px_one)
    # Cross validation: pick the best epsilon
    n_col = TrainData.shape[1]
    Xval = TrainData[:, 0:-1]
    Yval = TrainData[:, -1]
    pvals = []
    # Multiply the probabilities of the individual features
    if choice == 1:
        pval = gaussian(sigma1, Xval, mu1)
        for i in range(0, m):
            pvals.append(reduce(mul, pval[i, :]))
    else:
        pvals = self.multivariateGaussian(Xval, mu1, sigma1)
    epsilon, F1 = self.selectThreshold(Yval, pvals)
    yc = [0]  # anomalies are labelled 0

    def filteryc(x):
        return x[n - 1] in yc

    newdata = list(filter(filteryc, X))
    save_helper.save_txt_helper(newdata, dataname)
    outliers = np.where(px_one < epsilon)
    scatter2 = ksh.ksh_scatter("离散点异常分布图", "正常点", X, "FG", "异常点", X[outliers])
    self.page.add(scatter)
    self.page.add(scatter2)
    save_helper.save_tu_helper(self.page, dataname)
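
# Hedged sketch (not part of the original class): a minimal, self-contained version of the
# Gaussian anomaly-detection idea used above -- estimate per-feature mean/variance, score each
# sample by the product of its feature densities, and pick the epsilon that maximises F1 on a
# labelled validation set. Only numpy is assumed; all names below are illustrative.
def _gaussian_anomaly_sketch():
    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(0.0, 1.0, size=(200, 3))            # mostly normal points
    X_val = np.vstack([rng.normal(0.0, 1.0, (48, 3)),
                       rng.normal(6.0, 1.0, (2, 3))])  # validation set with 2 anomalies
    y_val = np.array([0] * 48 + [1] * 2)               # 1 marks an anomaly

    mu = X.mean(axis=0)
    var = X.var(axis=0)

    def density(points):
        # product of independent per-feature Gaussian densities
        p = np.exp(-(points - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
        return p.prod(axis=1)

    p_val = density(X_val)
    best_eps, best_f1 = 0.0, -1.0
    for eps in np.linspace(p_val.min(), p_val.max(), 1000):
        pred = (p_val < eps).astype(int)
        tp = np.sum((pred == 1) & (y_val == 1))
        fp = np.sum((pred == 1) & (y_val == 0))
        fn = np.sum((pred == 0) & (y_val == 1))
        if tp == 0:
            continue
        prec, rec = tp / (tp + fp), tp / (tp + fn)
        f1 = 2 * prec * rec / (prec + rec)
        if f1 > best_f1:
            best_eps, best_f1 = eps, f1
    return best_eps, best_f1
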
def mst(self, data=None, dataN=None, dataname="None", choice="prim"):
    page = Page()
    max_value = 9999999  # sentinel meaning "no edge"
    if data is None:
        row0 = [0, 7, max_value, max_value, max_value, 5]
        row1 = [7, 0, 9, max_value, 3, max_value]
        row2 = [max_value, 9, 0, 6, max_value, max_value]
        row3 = [max_value, max_value, 6, 0, 8, 10]
        row4 = [max_value, 3, max_value, 8, 0, 4]
        row5 = [5, max_value, max_value, 10, 4, 0]
        data = [row0, row1, row2, row3, row4, row5]
        dataN = ["节点1", "节点2", "节点3", "节点4", "节点5", "节点6"]
    # Visualise the raw graph
    link = []
    node = []
    n = len(data)
    m = len(data[0])
    for i in range(n):
        for j in range(m):
            if data[i][j] == max_value:
                continue
            link.append({"source": dataN[i], "target": dataN[j]})
        # node size: sum of the weights of this node's real edges
        fdata = [x for x in data[i] if x != max_value]
        big = reduce(lambda x, y: x + y, fdata)
        node.append({"name": dataN[i], "symbolSize": big})
    tu_graph = Tu_Graph("总关系图")
    tu_graph.add("", node, link)
    page.add(tu_graph)
    # Minimum spanning tree
    graph = tree.Graph(data)
    if choice == "prim":
        res = graph.prim()
    else:
        res = graph.kruskal()
    print(res)
    n1 = len(res)
    m1 = len(res[0])

    def sum2(x, y):
        # accumulate edge weights; x is an edge tuple on the first call, an int afterwards
        if type(x) == int:
            return x + y[2]
        return x[2] + y[2]

    big = reduce(sum2, res)
    link2 = []
    for i in res:
        link2.append({"source": i[0], "target": i[1]})
    tu_graph2 = Tu_Graph("最小生成树图")
    tu_graph2.add("权重和为:" + str(big), node, link2)
    print(link2)
    page.add(tu_graph2)
    sh.save_tu_helper(page, dataname)
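
# Hedged sketch (illustrative, not the project's tree.Graph implementation): Prim's algorithm
# over the same adjacency-matrix convention used above, where max_value marks a missing edge.
# Returns edges as (u, v, weight) index triples; on the 6-node demo matrix it yields 5 edges
# with total weight 26.
def _prim_sketch(matrix, max_value=9999999):
    n = len(matrix)
    in_tree = {0}
    edges = []
    while len(in_tree) < n:
        best = None
        # pick the cheapest edge leaving the current tree
        for u in in_tree:
            for v in range(n):
                if v in in_tree or matrix[u][v] == max_value:
                    continue
                if best is None or matrix[u][v] < best[2]:
                    best = (u, v, matrix[u][v])
        if best is None:  # graph is disconnected
            break
        in_tree.add(best[1])
        edges.append(best)
    return edges
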
def GuanJianCi(self, data_name="None", num=20, text=None):
    page = Page()
    if text is None:
        text = "SimHash是一种局部敏感hash,它也是Google公司进行海量网页去重使用的主要算法。传统的Hash算法只负责将原始内容尽量均匀随机地映射为一个签名值,原理上仅相当于伪随机数产生算法。传统的hash算法产生的两个签名,如果原始内容在一定概率下是相等的;如果不相等,除了说明原始内容不相等外,不再提供任何信息,因为即使原始内容只相差一个字节,所产生的签名也很可能差别很大。所以传统的Hash是无法在签名的维度上来衡量原内容的相似度,而SimHash本身属于一种局部敏感哈希算法,它产生的hash签名在一定程度上可以表征原内容的相似度。我们主要解决的是文本相似度计算,要比较的是两个文章是否相似,当然我们降维生成了hash签名也用于这个目的。看到这里估计大家就明白了,我们使用的simhash就算把文章中的字符串变成 01 串也还是可以用于计算相似度的,而传统的hash却不行。"
    # Extract the top `num` keywords with their TF-IDF weights
    tags = jieba.analyse.extract_tags(text, topK=num, withWeight=True, withFlag=True)
    name = []
    value = []
    for tag in tags:
        name.append(tag[0])
        value.append(tag[1])
    print(value)
    wordCloud = WordCloud(data_name)
    wordCloud.add("", name, value)
    pie = Pie('前十个词汇占重', "", title_pos='center')
    style = Style()
    pie_style = style.add(label_pos="center", is_label_show=True, label_text_color=None)
    # Lay the ten small pies out on a 5 x 2 grid; pos_x / pos_y are percentages of the canvas
    pos_x = 10
    pos_y = 30
    sum_weight = sum(value)
    for index, (n, v) in enumerate(zip(name, value)):
        if index == 5:
            pos_x = 10
            pos_y = pos_y + 40
        if index < 10:
            pie.add("", [n, ""], [v / sum_weight, 1 - v / sum_weight],
                    center=[pos_x, pos_y], radius=[18, 24], **pie_style)
            pos_x = pos_x + 20
            print(pos_x, pos_y)
            print(index)
    page.add(pie)
    page.add(wordCloud)
    save_helper.save_tu_helper(page, data_name)
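
# Hedged sketch: minimal standalone keyword extraction with jieba's TF-IDF interface, the same
# call the method above relies on (assumes the `jieba` package is installed; the helper name
# and top_k value are illustrative only).
def _keyword_sketch(text, top_k=10):
    import jieba.analyse

    # returns [(word, weight), ...] sorted by descending weight
    tags = jieba.analyse.extract_tags(text, topK=top_k, withWeight=True)
    total = sum(w for _, w in tags) or 1.0
    # normalise the weights so the shares can be fed to a pie chart
    return [(word, w / total) for word, w in tags]
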
def tu_spca(self, dataname="kong", components_n=1, data=None):
    # Test data
    X, y = make_blobs(n_samples=10000, n_features=3,
                      centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                      cluster_std=[0.2, 0.1, 0.2, 0.2], random_state=9)
    if data is None:
        data = X
    message = []
    # Fit the model
    spca = SparsePCA(n_components=components_n, normalize_components=True, random_state=0)
    spca.fit(data)
    # Save the transformed data
    value = spca.transform(data)
    save_helper.save_txt_helper(value, dataname)
    components = spca.components_
    error = spca.error_
    page2 = Page()
    # Plot one bar chart per sparse component
    for j in range(0, components.shape[0]):
        bar1 = Bar("稀疏组建" + str(j))
        bar1.add("", ["components_" + str(i) for i in range(0, components.shape[1])],
                 components[j])
        page2.add(bar1)
    message.append("我们仅提供稀疏组建和数据误差供给分析")
    print(error)
    bar2 = Bar("数据误差分析")
    bar2.add("", ["error" + str(i) for i in range(0, len(error))], error)
    page2.add(bar2)
    save_helper.save_tu_helper(page2, dataname)
    return message
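
# Hedged sketch: standalone SparsePCA usage mirroring the call above (assumes scikit-learn;
# note that `normalize_components` was only accepted by some older scikit-learn releases, so
# it is omitted here and this sketch should not be read as the project's exact configuration).
def _sparse_pca_sketch():
    import numpy as np
    from sklearn.decomposition import SparsePCA

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 5))
    spca = SparsePCA(n_components=2, random_state=0)
    X_low = spca.fit_transform(X)   # (100, 2) projection onto the sparse components
    return spca.components_, X_low
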
def corr_m(self, dataname, data_place):
    if data_place is not None and os.path.exists(data_place):
        df = pd.read_excel(data_place, header=0, index_col=None)
    else:
        message = "数据不存在,或者数据格式错误"
        return message
    col_name = df.columns.values
    ax = df.corr()
    n_col = ax.shape[1]
    array = np.array(ax)
    # For every column, find the name of the most strongly correlated other column
    rank = array.argsort()
    first_rank = rank[:, -2]
    first_name = []
    for j in range(n_col - 1):
        first_name.append(col_name[first_rank[j]])
    page = Page()
    bar = Bar("相关性分析")
    page.add(bar)
    for j in range(n_col - 1):
        bar.add(col_name[j], col_name, array[j], is_more_utils=True,
                is_datazoom_show=True, datazoom_type="both",
                is_datazoom_extra_show=True, datazoom_extra_type="slider")
    save_helper.save_tu_helper(page, dataname)
    message = []
    message.append("如果出现NAN数据说明该列不是数值型,请删除")
    return message
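
# Hedged sketch: the same "most correlated partner column" idea with plain pandas/numpy,
# independent of the Excel input and the charting; the column names and data are illustrative.
def _corr_partner_sketch():
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    df = pd.DataFrame({"a": rng.rand(50)})
    df["b"] = df["a"] * 2 + rng.rand(50) * 0.1   # strongly tied to "a"
    df["c"] = rng.rand(50)                        # unrelated
    corr = df.corr()
    # argsort each row; the last entry is the column itself (corr == 1), so take the second last
    partner_idx = np.argsort(corr.values, axis=1)[:, -2]
    return {col: corr.columns[i] for col, i in zip(corr.columns, partner_idx)}
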
def tu_pca(self, dataname="None", components_ratio=None, components_n=None, data=None):
    if components_n is None:
        components_n = 1
    # Messages returned to the caller
    message = []
    # 3D scatter parameters
    page = Page()
    range_color = [
        '#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
        '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026'
    ]
    # Test data
    X, y = make_blobs(n_samples=10000, n_features=3,
                      centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                      cluster_std=[0.2, 0.1, 0.2, 0.2], random_state=9)
    if data is None:
        data = X
    # If the data is 3-dimensional, draw a 3D scatter of it
    if data.shape[1] == 3:
        scatter3D = Scatter3D(dataname)
        scatter3D.add("", data, is_visualmap=True, visual_range_color=range_color)
        html_name = dataname + ".html"
        page.add(scatter3D)
    # Full PCA to get the variance explained by every component
    pca_ = PCA(n_components=data.shape[1])
    pca_.fit(data)
    ratio = pca_.explained_variance_ratio_
    variance = pca_.explained_variance_
    attr = ["成分" + str(i) for i in range(0, data.shape[1])]
    pie = Pie("PCA成分图", width=1000, height=600)
    pie.add("百分比", attr, ratio, radius=[50, 55], center=[25, 50], is_random=True)
    pie.add("最大方差", attr, variance, radius=[0, 45], center=[25, 50], rosetype="radius")
    page.add(pie)
    if components_ratio is not None:
        # Keep enough components to explain at least `components_ratio` of the variance
        pca = PCA(n_components=components_ratio)
        pca.fit(data)
        value = pca.transform(data)
        save_helper.save_txt_helper(value, dataname)
        # Hint for the caller
        ratio_ = np.sum(pca.explained_variance_ratio_)
        if ratio_ > components_ratio:
            message.append("所选百分比可能过小,为保证充分利用信息可以选择稍微向上调整")
        # Plot the share of the components that were dropped
        acc = 0
        bar_data = None
        for i in range(0, data.shape[1]):
            acc = acc + ratio[i]
            if np.isclose(acc, ratio_):
                bar_data = list(ratio[i + 1:])
                bar = Bar("剩余成分百分比")
                bar.add("", ["剩余成分" + str(i) for i in range(i + 1, data.shape[1])],
                        bar_data)
                page.add(bar)
                break
            else:
                print(1)
    # Fixed number of components
    pca2 = PCA(n_components=components_n)
    pca2.fit(data)
    value = pca2.transform(data)
    save_helper.save_txt_helper(value, dataname + "2")
    ratio_ = np.sum(pca2.explained_variance_ratio_)
    pie_data = [ratio_] + list(ratio[components_n:])
    attr = ["s"] + ["s" + str(i) for i in range(components_n, data.shape[1])]
    pie2 = Pie("选择成分百分比")
    pie2.add("", attr, pie_data, radius=[40, 75], label_text_color=None,
             is_label_show=True, legend_orient="vertical", legend_pos="right")
    page.add(pie2)
    # Save everything
    save_helper.save_tu_helper(page, dataname)
    return message
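
# Hedged sketch: scikit-learn's PCA accepts either an integer number of components or a float
# in (0, 1) meaning "keep enough components to explain this fraction of the variance" -- the
# two modes the method above exposes as components_n and components_ratio. Values below are
# illustrative.
def _pca_ratio_sketch():
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=500, n_features=3, centers=3, random_state=9)
    pca_fixed = PCA(n_components=2).fit(X)      # fixed number of components
    pca_ratio = PCA(n_components=0.95).fit(X)   # enough components for 95% of the variance
    return (pca_fixed.explained_variance_ratio_.sum(),
            pca_ratio.n_components_,
            pca_ratio.explained_variance_ratio_.sum())
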
def china_city(self, dataname, data=None):
    if data is None:
        data = [
            (u"海门", 9), (u"鄂尔多斯", 12), (u"招远", 12), (u"舟山", 12), (u"齐齐哈尔", 14),
            (u"盐城", 15), (u"赤峰", 16), (u"青岛", 18), (u"乳山", 18), (u"金昌", 19),
            (u"泉州", 21), (u"莱西", 21), (u"日照", 21), (u"胶南", 22), (u"南通", 23),
            (u"拉萨", 24), (u"云浮", 24), (u"梅州", 25), (u"文登", 25), (u"上海", 25),
            (u"攀枝花", 25), (u"威海", 25), (u"承德", 25), (u"厦门", 26), (u"汕尾", 26),
            (u"潮州", 26), (u"丹东", 27), (u"太仓", 27), (u"曲靖", 27), (u"烟台", 28),
            (u"福州", 29), (u"瓦房店", 30), (u"即墨", 30), (u"抚顺", 31), (u"玉溪", 31),
            (u"张家口", 31), (u"阳泉", 31), (u"莱州", 32), (u"湖州", 32), (u"汕头", 32),
            (u"昆山", 33), (u"宁波", 33), (u"湛江", 33), (u"揭阳", 34), (u"荣成", 34),
            (u"连云港", 35), (u"葫芦岛", 35), (u"常熟", 36), (u"东莞", 36), (u"河源", 36),
            (u"淮安", 36), (u"泰州", 36), (u"南宁", 37), (u"营口", 37), (u"惠州", 37),
            (u"江阴", 37), (u"蓬莱", 37), (u"韶关", 38), (u"嘉峪关", 38), (u"广州", 38),
            (u"延安", 38), (u"太原", 39), (u"清远", 39), (u"中山", 39), (u"昆明", 39),
            (u"寿光", 40), (u"盘锦", 40), (u"长治", 41), (u"深圳", 41), (u"珠海", 42),
            (u"宿迁", 43), (u"咸阳", 43), (u"铜川", 44), (u"平度", 44), (u"佛山", 44),
            (u"海口", 44), (u"江门", 45), (u"章丘", 45), (u"肇庆", 46), (u"大连", 47),
            (u"临汾", 47), (u"吴江", 47), (u"石嘴山", 49), (u"沈阳", 50), (u"苏州", 50),
            (u"茂名", 50), (u"嘉兴", 51), (u"长春", 51), (u"胶州", 52), (u"银川", 52),
            (u"张家港", 52), (u"三门峡", 53), (u"锦州", 54), (u"南昌", 54), (u"柳州", 54),
            (u"三亚", 54), (u"自贡", 56), (u"吉林", 56), (u"阳江", 57), (u"泸州", 57),
            (u"西宁", 57), (u"宜宾", 58), (u"呼和浩特", 58), (u"成都", 58), (u"大同", 58),
            (u"镇江", 59), (u"桂林", 59), (u"张家界", 59), (u"宜兴", 59), (u"北海", 60),
            (u"西安", 61), (u"金坛", 62), (u"东营", 62), (u"牡丹江", 63), (u"遵义", 63),
            (u"绍兴", 63), (u"扬州", 64), (u"常州", 64), (u"潍坊", 65), (u"重庆", 66),
            (u"台州", 67), (u"南京", 67), (u"滨州", 70), (u"贵阳", 71), (u"无锡", 71),
            (u"本溪", 71), (u"克拉玛依", 72), (u"渭南", 72), (u"马鞍山", 72), (u"宝鸡", 72),
            (u"焦作", 75), (u"句容", 75), (u"北京", 79), (u"徐州", 79), (u"衡水", 80),
            (u"包头", 80), (u"绵阳", 80), (u"乌鲁木齐", 84), (u"枣庄", 84), (u"杭州", 84),
            (u"淄博", 85), (u"鞍山", 86), (u"溧阳", 86), (u"库尔勒", 86), (u"安阳", 90),
            (u"开封", 90), (u"济南", 92), (u"德阳", 93), (u"温州", 95), (u"九江", 96),
            (u"邯郸", 98), (u"临安", 99), (u"兰州", 99), (u"沧州", 100), (u"临沂", 103),
            (u"南充", 104), (u"天津", 105), (u"富阳", 106), (u"泰安", 112), (u"诸暨", 112),
            (u"郑州", 113), (u"哈尔滨", 114), (u"聊城", 116), (u"芜湖", 117), (u"唐山", 119),
            (u"平顶山", 119), (u"邢台", 119), (u"德州", 120), (u"济宁", 120), (u"荆州", 127),
            (u"宜昌", 130), (u"义乌", 132), (u"丽水", 133), (u"洛阳", 134), (u"秦皇岛", 136),
            (u"株洲", 143), (u"石家庄", 147), (u"莱芜", 148), (u"常德", 152), (u"保定", 153),
            (u"湘潭", 154), (u"金华", 157), (u"岳阳", 169), (u"长沙", 175), (u"衢州", 177),
            (u"廊坊", 193), (u"菏泽", 194), (u"合肥", 229), (u"武汉", 273), (u"大庆", 279)]
    geo = Geo(
        "全国主要城市空气质量", "data from pm2.5",
        title_color="#fff", title_pos="center",
        width=1200, height=600, background_color="#404a59",
    )
    page = Page()
    page.add(geo)
    # Split the (city, value) pairs into the two lists the chart expects
    attr, value = geo.cast(data)
    geo.add("", attr, value, visual_range=[0, 200], visual_text_color="#fff",
            symbol_size=10, is_visualmap=True)
    save_helper.save_tu_helper(page, dataname)
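
# Hedged sketch: the same Geo usage as above on a tiny data set, following the pyecharts
# 0.5-style API this file already uses (Geo / cast / add). With pyecharts >= 1.0 the calls are
# named differently, so this is illustrative only and the demo values are made up.
def _geo_sketch():
    from pyecharts import Geo

    data = [(u"北京", 79), (u"上海", 25), (u"广州", 38)]
    geo = Geo("demo", "data from pm2.5", title_color="#fff",
              width=1200, height=600, background_color="#404a59")
    attr, value = geo.cast(data)   # split (city, value) pairs into two lists
    geo.add("", attr, value, visual_range=[0, 200], visual_text_color="#fff",
            symbol_size=10, is_visualmap=True)
    return geo
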
def tu_kmeans(self, v=None, n_c=4, dataname="None"):
    def label_(ny, num, labels):
        # Group the samples of v by their cluster label
        num = 0
        for i in labels:
            for j in range(0, n_c):
                if i == j:
                    ny[j] = np.vstack((ny[j], np.array(v[num])))
            num = num + 1
        return ny

    # Test data
    if v is None:
        v = np.random.random((190, 2))
    scatter_all = Scatter("数据平面图")
    # kmeans
    kmeans = KMeans(n_clusters=n_c, random_state=9).fit(v)
    y_pred = kmeans.labels_
    print(kmeans.cluster_centers_[1])
    scatter_all.add("center", kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1])
    nn = {}
    for i in range(0, n_c):
        nn[i] = np.array([0, 0])  # placeholder row, removed again below
    nn = label_(nn, n_c, y_pred)
    arr = np.array([[1, 3], [2, 3], [3, 2]])
    bar = Bar("方差分析")
    td = ["cul" + str(i) for i in range(0, n_c)]
    td.append("u")
    td.append("sum")
    for i in range(0, n_c):
        nn[i] = np.delete(nn[i], 0, 0)
        scatter_all.add("cul" + str(i), nn[i][:, 0], nn[i][:, 1])

    def manhattan_distance(x, y):
        # total Manhattan (L1) distance from centre x to every point in cluster y
        return np.sum(np.abs(x - y))

    dis = {}
    for i in range(0, n_c):
        dis[i] = manhattan_distance(kmeans.cluster_centers_[i], nn[i])
    dis_list = [dis[i] for i in dis]
    dis_sum = reduce(lambda x, y: x + y, dis_list)
    print(dis_sum)
    radar = Radar("簇点误差分析.html")
    # dbscan
    dis_db = 0.1
    num_simple = 5
    dbscan = skc.DBSCAN(eps=dis_db, min_samples=num_simple).fit(v)
    clu_lab = dbscan.labels_
    # number of clusters found, not counting the noise label -1
    n_clusters = len(set(clu_lab)) - (1 if -1 in clu_lab else 0)
    scatter = Scatter("噪声分析")
    for i in range(n_clusters):
        print(i)
        one_clu = v[clu_lab == i]
        scatter.add("scan" + str(i), one_clu[:, 0], one_clu[:, 1])
    zaosheng = v[clu_lab == -1]
    if zaosheng.size > 0:
        scatter.add("噪声点", zaosheng[:, 0], zaosheng[:, 1])
    radar.config([("clu" + str(i), max(dis_list)) for i in range(0, n_c)])
    dis_list.append(dis_sum / n_c)
    dis_list.append(dis_sum)
    bar.add("", td, dis_list, is_stack=True, label_pos='inside')
    radar.add("bia", [dis_list], is_splitline=True, is_axisline_show=True)
    page = Page()
    page.add(scatter_all)
    page.add(bar)
    page.add(radar)
    page.add(scatter)
    save_helper.save_tu_helper(page, dataname)
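
# Hedged sketch: a minimal standalone comparison of the two clusterers used above -- KMeans
# with a fixed cluster count and DBSCAN, which also marks noise points with the label -1
# (assumes scikit-learn and numpy; the parameter values are illustrative, not the project's).
def _clustering_sketch():
    import numpy as np
    from sklearn.cluster import KMeans, DBSCAN

    rng = np.random.RandomState(9)
    v = rng.random_sample((190, 2))
    km = KMeans(n_clusters=4, random_state=9).fit(v)
    # total L1 distance of every point to its assigned centre, per cluster
    l1_error = [np.sum(np.abs(v[km.labels_ == i] - km.cluster_centers_[i]))
                for i in range(4)]
    db = DBSCAN(eps=0.1, min_samples=5).fit(v)
    n_noise = int(np.sum(db.labels_ == -1))
    return l1_error, n_noise
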