def request_area_personcont(school: str, schooltype: str, buildings: list):
    """Count the 0-17 population (Shenzhen-registered vs. migrant) for the given building codes."""
    # Chunk the building codes (999 per query) to stay under the database's IN-list limit.
    num_chunks = int(len(buildings) / 999) + 1
    split_buildings = np.array_split(buildings, num_chunks)
    hjrk_dict = {}
    ldrk_dict = {}
    hjrk_list = []
    ldrk_list = []
    for split_building in split_buildings:
        try:
            hjrk_sql = "SELECT age,sum(num) NUM,RKXZ FROM FT_RKJZT WHERE lddm IN(%s) AND age <18 AND AGE >=0 AND rkxz='深圳户籍人口' GROUP BY AGE,RKXZ"
            ldrk_sql = "SELECT age,sum(num) NUM,RKXZ FROM FT_RKJZT WHERE lddm IN(%s) AND age <18 AND AGE >=0 AND rkxz='流动人口' GROUP BY AGE,RKXZ"
            in_p = ', '.join("'%s'" % x for x in split_building)
            hjrk_sql = hjrk_sql % in_p
            ldrk_sql = ldrk_sql % in_p
            hjrk_df = db_util.execute2Dataframe(hjrk_sql)
            ldrk_df = db_util.execute2Dataframe(ldrk_sql)
            # Accumulate per-age totals across chunks.
            for index, row in hjrk_df.iterrows():
                age = row['AGE']
                hjrk_dict[age] = hjrk_dict.get(age, 0) + row['NUM']
            for index, row in ldrk_df.iterrows():
                age = row['AGE']
                ldrk_dict[age] = ldrk_dict.get(age, 0) + row['NUM']
        except Exception as e:
            print(e)
    for age in hjrk_dict:
        hjrk_list.append({
            'AGE': age,
            'NUM': hjrk_dict[age],
            'RKXZ': '深圳户籍人口',
            'SCHOOLNAME': school,
            'SCHOOLTYPE': schooltype
        })
    for age in ldrk_dict:
        ldrk_list.append({
            'AGE': age,
            'NUM': ldrk_dict[age],
            'RKXZ': '流动人口',
            'SCHOOLNAME': school,
            'SCHOOLTYPE': schooltype
        })
    hjrk_list.extend(ldrk_list)
    return DataFrame(hjrk_list)
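
# db_util.execute2Dataframe is used throughout these examples but its implementation
# is not shown. A minimal sketch of what it might do, assuming an Oracle database
# reached through SQLAlchemy (the DSN and the _sketch name are placeholders):
def execute2Dataframe_sketch(sql):
    """Run a SQL query and return the result set as a pandas DataFrame."""
    import pandas as pd
    from sqlalchemy import create_engine
    engine = create_engine('oracle+cx_oracle://user:password@host:1521/orcl')  # placeholder DSN
    return pd.read_sql(sql, engine)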
Example #2
def fenci():
	"""Segment the complaint text (SQNR) with jieba and count word frequencies."""
	sql = """
	select qymc,sqnr,dfnr from ENT_REQUIRENTMENT
	"""
	df = db_util.execute2Dataframe(sql)
	# df = DataFrame(pd.read_excel("E:\\svn仓库\\svnrepo\\python\\03data\\data\\各委办局直接过来的数据\\外迁企业相关数据\\企业诉求\\2018-4.xlsx"))
	wordsCount = {}
	wordsList = []
	stopwords = get_stopWords()
	for index, row in df.iterrows():
		qymc = row['QYMC']
		question = row['SQNR']
		answer = row['DFNR']
		# Full-mode segmentation of the request text; skip blanks and stopwords.
		seg_list = jieba.cut(question, cut_all=True)
		for word in seg_list:
			if word.strip() == '':
				continue
			if word in stopwords:
				continue
			wordsCount[word] = wordsCount.get(word, 0) + 1
	for k, v in wordsCount.items():
		wordsList.append({'单词': k, '出现次数': v})
	result = DataFrame(wordsList).sort_values(by='出现次数', na_position='first')
	gen_word_cloud(wordsCount)
	print(result)
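
# gen_word_cloud and get_stopWords above are project helpers that are not shown.
# A minimal sketch of the word-cloud step, assuming the third-party `wordcloud`
# package and a locally installed CJK-capable font (the font path is a placeholder;
# wordcloud's bundled default font cannot render Chinese):
def gen_word_cloud_sketch(wordsCount):
	from wordcloud import WordCloud
	import matplotlib.pyplot as plt
	wc = WordCloud(font_path='C:\\Windows\\Fonts\\simhei.ttf',  # placeholder font path
	               width=800, height=600, background_color='white')
	wc.generate_from_frequencies(wordsCount)
	plt.imshow(wc, interpolation='bilinear')
	plt.axis('off')
	plt.show()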
def request_area_building():
    """
    Fetch the 0-18 population for every school district on the district map.
    :return: DataFrame
    """
    file = 'D:\\pypy\\pythonresult\\教育学位\\学校人口信息.xls'
    if os.path.isfile(file):
        area_data = DataFrame(pd.read_excel(file))
        if not area_data.empty:  # reuse the cached result if it holds data
            return area_data
    areas = db_util.execute2Dataframe('SELECT\
				WWYJFX.T_JY_SCHOOLAREA.SCHOOLNAME,\
				WWYJFX.T_JY_SCHOOLAREA.SCHOOL_FULLNAME,\
				WWYJFX.T_JY_SCHOOLAREA.SCHOOLTYPE,\
				WWYJFX.T_JY_SCHOOLAREA.POLYGON_84\
				FROM\
				WWYJFX.T_JY_SCHOOLAREA\
			')
    # areas = DataFrame(pd.read_excel('D:\\pypy\\pythonresult\\教育学位\\学区信息.xls'))
    data = {
        'f': 'json',
        'returnGeometry': 'false',
        'spatialRel': 'esriSpatialRelIntersects',
        'geometryType': 'esriGeometryPolygon',
        'inSR': 4490,
        'outFields': 'BLDG_NO,NOWNAME',
        'outSR': 4490
    }
    url_prefix = 'http://10.190.55.55:8080/arcgis/rest/services/FTKSJ/JZWDLM_CGCS2000/MapServer/1/query'
    person_data = DataFrame()
    for index, row in areas.iterrows():
        polygon_84 = row['POLYGON_84']
        schoolname = row['SCHOOLNAME']
        schooltype = row['SCHOOLTYPE']
        if pd.notna(polygon_84) and polygon_84 != '':
            geometry = split_point_to_geometry(polygon_84)
            data['geometry'] = geometry
            result = spider_util.open_url(url_prefix, 5, 20,
                                          data=data)  # 20-second timeout
            jsondata = demjson.decode(result)
            buildings = get_building(jsondata)
            if buildings is None or len(buildings) == 0:
                print('School ' + schoolname + ' has no building ids')
                continue
            childinfo = request_area_personcont(schoolname, schooltype,
                                                buildings)
            person_data = pd.concat([person_data, childinfo])
    df = DataFrame(person_data)
    df.to_excel(file, index=False)
    return df
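
# get_building above is a project helper that is not shown. An ArcGIS REST `query`
# response returns the matched records under "features", each carrying an
# "attributes" dict, so a minimal sketch (field name taken from the outFields
# parameter above) might look like:
def get_building_sketch(jsondata):
    """Extract the BLDG_NO values from an ArcGIS query result."""
    if not jsondata or not jsondata.get('features'):
        return []
    return [f['attributes']['BLDG_NO'] for f in jsondata['features']]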
Example #4
def address_format(table, lonField, latField):
	"""Reverse-geocode each row's coordinates and fill in its STREET column."""
	sql = "select * from " + table
	delete_sql = "delete from " + table
	df = db_util.execute2Dataframe(sql)
	length = len(df)
	for i in range(length):
		lon = df.at[i, lonField]
		lat = df.at[i, latField]
		# Validate before converting: float(None) or float('') would raise.
		if lon is None or lon == '' or lat is None or lat == '':
			continue
		lon = float(lon)
		lat = float(lat)
		if math.isnan(lon) or math.isnan(lat):
			continue
		addressComponent = address_standardization.location2normaladdress(lon, lat, coordtype='gcj02ll')
		street = addressComponent['town']
		df.at[i, 'STREET'] = street
		spider_util.log_progress(i, length, detailedLog=True)
	# db_util.delete(delete_sql)
	df.to_csv('C:\\Users\\admin\\Desktop\\' + table + '.csv', index=False, sep=',')
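
# address_standardization.location2normaladdress above is a project helper that is
# not shown. The coordtype value 'gcj02ll' and the 'town' field match Baidu Maps'
# reverse-geocoding API, so a minimal sketch of such a wrapper (the AK below is a
# placeholder) might look like:
def location2normaladdress_sketch(lon, lat, coordtype='gcj02ll'):
	import requests
	params = {'ak': 'YOUR_BAIDU_AK', 'output': 'json',
	          'coordtype': coordtype, 'location': '%s,%s' % (lat, lon)}
	resp = requests.get('http://api.map.baidu.com/reverse_geocoding/v3/', params=params, timeout=20)
	return resp.json()['result']['addressComponent']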
def format():
	"""Parse CONST_LOCATION into district/street/road and coordinate columns."""
	df = db_util.execute2Dataframe('select * from T_OPEN_SGXKZXX ')
	dflen = len(df.index)  # total row count
	for x in range(dflen):
		addr = df['CONST_LOCATION'].iloc[x]
		try:
			addressComponent = address_standardization.formatAddress(addr)
			# DataFrame.set_value was removed in pandas 1.0; .at is the replacement.
			df.at[x, 'QU'] = addressComponent['district']
			df.at[x, 'STREET'] = addressComponent['town']
			df.at[x, 'DL'] = addressComponent['street']
			df.at[x, 'BD_X'] = addressComponent['bd_x']
			df.at[x, 'BD_Y'] = addressComponent['bd_y']
			df.at[x, 'LON84'] = addressComponent['lon84']
			df.at[x, 'LAT84'] = addressComponent['lat84']
		except Exception as e:
			print('Address conversion error:', addr, e)
		spider_util.log_progress(x, dflen)
	print(df)
	df.to_excel('D:\\011111111111111111111111\\00临时文件\\T_OPEN_SGXKZXX.xlsx', index=False)
Example #6
def loadSimpDat():
	# simpDat = [['r', 'z', 'h', 'j', 'p'],
	#            ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
	#            ['z'],
	#            ['r', 'x', 'n', 'o', 's'],
	#            ['y', 'r', 'x', 'z', 'q', 't', 'p'],
	#            ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]

	# sql = """
	#    select a.qymc,a.sqnr,a.dfnr,b.hydm from ENT_REQUIRENTMENT a left join T_SJZX_SSZTJBXX b on a.qymc=b.qymc
	#    """
	sql="""
	select a.qymc,a.sqnr,a.dfnr,c.GBHY from ENT_REQUIRENTMENT a inner join T_YW_ZZ_FR b on a.qymc=b.jgmc 
		inner join OPENDATA_SY_INFO c on b.TYSHXYDM =c.TYSHXYDM
	"""
	simpDat=[]
	df = db_util.execute2Dataframe(sql)
	# df = DataFrame(pd.read_excel("E:\\svn仓库\\svnrepo\\python\\03data\\data\\各委办局直接过来的数据\\外迁企业相关数据\\企业诉求\\2018-4.xlsx"))
	wordsCount = {}
	wordsList = []
	stopwords = get_stopWords()
	# Keyword-extraction interface from jieba.analyse (TextRank)
	textrank = analyse.textrank

	for index, row in df.iterrows():
		hydm = row['GBHY']
		qymc = row['QYMC']
		question = row['SQNR']
		answer = row['DFNR']
		# Extract the top keywords with the TextRank algorithm
		keywords = textrank(question, topK=10)
		print('Keywords:')
		# Print the extracted keywords
		print('/'.join(keywords))
		seg_list = jieba.cut(question, cut_all=True)
		keywordsFilter=[]
		for word in keywords:
			if word.strip() == '':
				continue
			if word in stopwords:
				continue
			keywordsFilter.append(word)
		if hydm is not None:
			keywordsFilter.append(hydm)
			simpDat.append(keywordsFilter)
		for word in seg_list:
			if word.strip() == '':
				continue
			if word in stopwords:
				continue
			wordsCount[word] = wordsCount.get(word, 0) + 1
	for k, v in wordsCount.items():
		wordsList.append({'单词': k, '出现次数': v})
	result = DataFrame(wordsList).sort_values(by='出现次数', na_position='first')
	gen_word_cloud(wordsCount)

	top100Df = result[-20:]  # the 20 most frequent words (result is sorted ascending)
	top100List = top100Df['单词'].tolist()

	# for index, row in df.iterrows():
	# 	keywords = []
	# 	hydm = row['GBHY']
	# 	question = row['SQNR']
	# 	answer = row['DFNR']
	# 	seg_list = jieba.cut(question, cut_all=True)
	# 	for word in seg_list:
	# 		if word.strip() == '':
	# 			continue
	# 		if word in stopwords:
	# 			continue
	# 		if word in top100List:
	# 			if word in keywords:
	# 				continue
	# 			keywords.append(word)
	# 	if len(keywords)!=0 and hydm is not None:
	# 		keywords.append(hydm)
	# 		simpDat.append(keywords)
	return simpDat
Example #7
def baseinfo():
	sql = """
	SELECT DISTINCT a.DWMC ,b.cjzs,b.zjzs,c.total_wqzs,d.tdjycs,d.zjtdjyrq,
	CASE WHEN e.stockcode IS NOT NULL THEN '已上市' ELSE '' END AS isListed,
	e.stockcode,e.stockname,e.companylistingdate,e.phone,e.employeenum,
	CASE WHEN f.qymc IS NOT NULL THEN '准备上市' ELSE '' END AS PREPARELIST,
	f.*
	FROM ENTERPRISE_INFO_ZDGZ  a
	LEFT JOIN 
	(SELECT qymc,sum(cjzs) cjzs,sum(zjzs) zjzs FROM  ENT_DX_CZJXX GROUP BY qymc) b ON a.dwmc=b.qymc
	LEFT JOIN
	(SELECT qymc,sum(TOTAL_WQZS) TOTAL_WQZS FROM  ENT_DX_WQXX GROUP BY qymc ) c ON a.dwmc=c.qymc
	LEFT JOIN 
	(SELECT gsmc,count(*) TDJYCS,max(jyrq) ZJTDJYRQ FROM LAND_EXCHANGE GROUP BY gsmc)d ON a.dwmc=d.gsmc
	LEFT JOIN ent_listed_company e ON a.dwmc=e.COMPANYNAME
	LEFT JOIN ENT_IPO f ON a.dwmc=f.qymc
	WHERE b.cjzs IS NOT NULL OR b.zjzs IS NOT NULL OR c.total_wqzs IS NOT NULL OR d.tdjycs IS NOT NULL 
	OR d.zjtdjyrq IS NOT NULL OR f.qymc IS NOT NULL
	"""
	df = db_util.execute2Dataframe(sql)

	# Tally each company's monthly social-insurance contributor counts
	# s = """
	# select a.dwmc,c.YJNY,count(*) JNRS from ENTERPRISE_INFO_ZDGZ a inner join T_SJZX_RKSBXX b
	# on a.dwmc=b.UNIT_NAME inner JOIN T_SJZX_SBMXXX_2017TO2018 c on b.SI_NO=c.shbxh
	# GROUP BY a.dwmc,c.YJNY
	# order by c.YJNY
	# """
	s = """
	SELECT
		a.dwmc ,yjny,count(*) JNRS
	FROM
		ENTERPRISE_INFO_ZDGZ A
	INNER JOIN LGL_UNITSOCIAL_SECURITY b ON A .dwmc = b.DWMC
	INNER JOIN T_SJZX_SBMXXX_2017TO2018 c ON b.dwbm=c.sbdwbh
	GROUP BY a.dwmc,yjny
	ORDER BY yjny
	"""

	qysb_df = db_util.execute2Dataframe(s)
	qysb_dict = {}
	for index, row in qysb_df.iterrows():
		dwmc = row['DWMC']

		dw_data = qysb_dict.get(dwmc)
		if dw_data is None:
			dw_data = []
			qysb_dict[dwmc] = dw_data
		dw_data.append(row['JNRS'])

	company_list = []
	for k, v in qysb_dict.items():
		v = v[:-1]  # drop the most recent month (likely still incomplete)
		if len(v) == 0:
			continue
		v_series = pd.Series(v)
		std = v_series.std()
		mean = v_series.mean()
		trend = '上升'
		isdown = False
		if v[-1] - mean < 0:
			trend = '下降'
			isdown = True
		if v[-1] - mean == 0:
			trend = '不变'
		cov = std / mean  # coefficient of variation: relative spread of the monthly headcounts
		# if isdown and cov>0.1 and mean>10:

		company_data = {'DWMC': k, 'SBJNRS': round(mean), 'ZJFD': trend, 'cov': cov}
		company_list.append(company_data)
		if isdown:
			print(k, 'data:', v, 'std:', std, 'mean:', mean, 'cov:', cov, 'trend:', trend)

	company_df = DataFrame(company_list)
	merge_df = df.merge(company_df, how='left', left_on='DWMC', right_on='DWMC')
	print(merge_df)

	merge_df.to_excel('D:\\python\\企业社保信息.xlsx')
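
# A worked illustration of the variability screen above, with synthetic numbers: a
# steady monthly headcount yields a small coefficient of variation, a shrinking one
# a large cov.
import pandas as pd
demo_stable = pd.Series([100, 101, 99, 100])
demo_shrinking = pd.Series([100, 80, 55, 30])
print(demo_stable.std() / demo_stable.mean())        # ≈ 0.008
print(demo_shrinking.std() / demo_shrinking.mean())  # ≈ 0.46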
Example #8




if __name__ == "__main__":
	sql="""
	select GBHY from OPENDATA_SY_INFO GROUP BY gbhy 
	"""
	df = db_util.execute2Dataframe(sql)
	gbhylist=df["GBHY"].tolist()
	dataSet=loadSimpDat()
	L, suppData = AssociationRulesUtil.apriori(dataSet, minSupport=0.02)
	print('Frequent itemsets:', L)

	filterItems = []
	for items in L:
		for itemset in items:  # 'itemset' avoids shadowing the builtin frozenset
			for item in itemset:
				if len(itemset) > 1 and item in gbhylist:
					filterItems.append(itemset)
					break
	print('Filtered itemsets:', filterItems)
	# main()
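	# Note on the structures consumed above: in the common "Machine Learning in
	# Action"-style apriori implementation, L is a list of levels, each level a
	# list of frozensets (the itemsets of that size), and suppData maps each
	# itemset to its support fraction, e.g. suppData[frozenset({'a', 'b'})] -> 0.05.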