def main(directory='D:\\pypy\\pythonresult\\上市企业\\'):
    """Request data for every listed company, then export four CSV reports.

    Reads the company list from ``上市企业信息.csv`` inside *directory*, calls
    ``doRequest`` once per ``StockCode`` with a 0.2 s pause between calls, and
    finally writes the module-level result tables to CSV files in the same
    directory.

    Args:
        directory: Folder that holds the input CSV and receives the output
            CSVs. Defaults to the location that used to be hard-coded.
    """
    companies, _csv_head = spider_util.readCSV2List(join(directory, '上市企业信息.csv'))
    total = len(companies)
    for i, company in enumerate(companies):
        doRequest(company['StockCode'])
        time.sleep(0.2)  # throttle between consecutive requests
        spider_util.log_progress(i, total, start_from_zero=True)

    # NOTE(review): these module-level tables are presumably filled by
    # doRequest -- confirm against its definition.
    # The position of each entry selects its header row in table_maping.
    reports = (
        (main_economic_indicators, '主要经济指标.csv'),
        (profitability, '盈利能力.csv'),
        (solvency, '偿债能力.csv'),
        (cost, '成本费用.csv'),
    )
    for position, (rows, filename) in enumerate(reports):
        DataFrame(rows).to_csv(join(directory, filename), header=table_maping[position], index=False)
Пример #2
0
def query_location(df: DataFrame):
    """Fill a '号码归属地' column by looking up each row's formatted number.

    The frame is modified in place. Rows whose '格式化电话' value is missing
    (None, NaN or an empty string) are skipped and keep ``None``. Rows whose
    '号码类型' equals '手机号' are resolved with
    ``query_mobile_phone_location``; every other row goes through
    ``query_telphone_location``.

    Args:
        df: Frame providing the '格式化电话' and '号码类型' columns. Any index
            is accepted; rows are addressed by their own index labels.
    """
    # Local import: only the DataFrame name is known to be imported at file level.
    import pandas as pd

    df['号码归属地'] = None
    length = len(df)
    for position, label in enumerate(df.index):
        format_tel = df.at[label, '格式化电话']
        # pandas represents missing cells as NaN, which is neither None nor ''.
        if pd.isna(format_tel) or format_tel == '':
            continue
        number_type = df.at[label, '号码类型']
        if number_type == '手机号':
            text = query_mobile_phone_location(format_tel)
        else:
            text = query_telphone_location(format_tel)
        df.at[label, '号码归属地'] = text
        spider_util.log_progress(position, length)
def query_location(df: DataFrame):
    """Write company phone numbers and their location images to the worksheet.

    A header row is written first; each frame row then goes to worksheet row
    ``index label + 1``: company name, raw phone, the image returned by
    ``download_pic`` and the formatted phone. Rows without a formatted phone
    only get the first two cells. A failing download is retried indefinitely
    with a 120 s pause between attempts; any other error skips the row.
    Processing stops after the row whose index label reaches 2000, and
    ``test_book`` is closed at the end.

    The '号码归属地' column is created on ``df`` but never filled here.

    Args:
        df: Frame with '纳税人名称', '财务固话' and '格式化电话' columns.
    """
    df['号码归属地'] = None
    length = len(df)
    worksheet.write_row(0, 0, ['企业名称', '企业电话', '电话归属地', '格式化后的电话'])
    for i, row in df.iterrows():
        try:
            sheet_row = i + 1  # worksheet row 0 holds the header
            company, tel, format_tel = (
                row[column] for column in ('纳税人名称', '财务固话', '格式化电话')
            )
            worksheet.write(sheet_row, 0, company)
            worksheet.write(sheet_row, 1, tel)
            if format_tel is None or format_tel == '':
                continue

            # Keep retrying the download until it succeeds.
            pic_path = None
            downloaded = False
            while not downloaded:
                try:
                    pic_path = download_pic(format_tel)
                    downloaded = True
                except Exception as err:
                    print('发生连接错误,睡眠一段时间后后尝试重新连接')
                    print(repr(err))
                    time.sleep(120)

            if pic_path is not None and pic_path != '':
                worksheet.insert_image(sheet_row, 2, pic_path)
            worksheet.write(sheet_row, 3, format_tel)
            spider_util.log_progress(i, length)
            # Random 2-4 s pause between numbers.
            time.sleep(random.randint(2, 4))
            if i >= 2000:
                break
        except Exception as err:
            print('发生异常信息,跳过该号码', repr(err))

    test_book.close()
Пример #4
0
def address_format(table, lonField, latField):
    """Reverse-geocode every row of *table* and export the result as CSV.

    Loads the whole table, converts each row's longitude/latitude through
    ``address_standardization.location2normaladdress`` (``coordtype='gcj02ll'``),
    stores the returned ``'town'`` value in the ``STREET`` column and writes
    the frame to ``<table>.csv`` on the admin desktop.

    Rows whose longitude or latitude is missing (None, empty string or NaN)
    are skipped; a value that is present but not numeric still raises
    ``ValueError`` from ``float()``.

    Args:
        table: Table name. It is concatenated into the SQL text, so it must
            come from trusted code, never from user input.
        lonField: Name of the longitude column.
        latField: Name of the latitude column.
    """
    sql = "select * from " + table
    df = db_util.execute2Dataframe(sql)
    length = len(df)
    for i in range(length):
        raw_lon = df.at[i, lonField]
        raw_lat = df.at[i, latField]
        # Check for missing values before float(): float(None) and float('')
        # raise, which would make this guard unreachable.
        if raw_lon is None or raw_lon == '' or raw_lat is None or raw_lat == '':
            continue
        lon = float(raw_lon)
        lat = float(raw_lat)
        if math.isnan(lon) or math.isnan(lat):
            continue
        addressComponent = address_standardization.location2normaladdress(lon, lat, coordtype='gcj02ll')
        df.at[i, 'STREET'] = addressComponent['town']
        spider_util.log_progress(i, length, detailedLog=True)
    df.to_csv('C:\\Users\\admin\\Desktop\\' + table + '.csv', index=False, sep=',')
def format():
    """Standardise the addresses of T_OPEN_SGXKZXX and export them to Excel.

    Each row's ``CONST_LOCATION`` is passed to
    ``address_standardization.formatAddress`` and the returned components are
    copied into the QU, STREET, DL, BD_X, BD_Y, LON84 and LAT84 columns. A row
    that fails is reported and left as it is. The resulting frame is printed
    and written to an ``.xlsx`` file.
    """
    # Target column -> key in the dict returned by formatAddress.
    column_sources = (
        ('QU', 'district'),
        ('STREET', 'town'),
        ('DL', 'street'),
        ('BD_X', 'bd_x'),
        ('BD_Y', 'bd_y'),
        ('LON84', 'lon84'),
        ('LAT84', 'lat84'),
    )
    df = db_util.execute2Dataframe('select * from T_OPEN_SGXKZXX ')
    dflen = len(df.index)  # total number of rows
    for x in range(dflen):
        addr = df['CONST_LOCATION'].iloc[x]
        try:
            addressComponent = address_standardization.formatAddress(addr)
            for column, key in column_sources:
                # DataFrame.set_value was removed in pandas 1.0; .at is the
                # label-based scalar setter that replaces it.
                df.at[x, column] = addressComponent[key]
        except Exception as e:
            print('地址转换错误:', addr, e)
        spider_util.log_progress(x, dflen)
    print(df)
    df.to_excel('D:\\011111111111111111111111\\00临时文件\\T_OPEN_SGXKZXX.xlsx', index=False)
Пример #6
0
def query_location2(df: DataFrame):
    """Fill a '号码归属地' column by scraping www.00cha.com for each number.

    The frame is modified in place. Rows whose '格式化电话' value is missing
    (None, NaN or an empty string) are skipped and keep ``None``. Numbers
    starting with '0769' are always recorded as '广东 东莞', so no request is
    made for them. For the remaining numbers, the stripped text of the
    matching ``<font>`` tags is joined with single spaces; the value stays
    ``None`` when the page has no matching tag.

    Args:
        df: Frame providing the '格式化电话' column. Any index is accepted;
            rows are addressed by their own index labels.
    """
    # Local import: only the DataFrame name is known to be imported at file level.
    import pandas as pd

    df['号码归属地'] = None
    length = len(df)
    for position, label in enumerate(df.index):
        format_tel = df.at[label, '格式化电话']
        # pandas represents missing cells as NaN, which is neither None nor ''.
        if pd.isna(format_tel) or format_tel == '':
            continue
        if format_tel.startswith('0769'):
            # The scraped result was always overwritten for this prefix.
            text = '广东 东莞'
        else:
            url = 'https://www.00cha.com/114.asp?t=' + format_tel
            # 20-second timeout (per the original note). The page declares
            # gb2312 but uses gbk characters, and BeautifulSoup would then
            # detect windows-1252, so the encoding is forced to gbk.
            bsObj = spider_util.open_url_return_bsobj(url, 5, 20, from_encoding='gbk')
            tags = bsObj.find_all('font', {'size': 4})
            # find_all returns an empty result, never None, when nothing matches.
            if not tags:
                tags = bsObj.find_all('font', {'color': '#008080'})
            text = ' '.join(tag.get_text().strip() for tag in tags) if tags else None
        df.at[label, '号码归属地'] = text
        spider_util.log_progress(position, length)
Пример #7
0
def get_list():
    """Scrape 51job result pages 1-1999 and return the new postings.

    Every posting URL is added to the Redis set ``redis_job_set``; a posting
    whose URL was already in the set is skipped. For salaries containing '/',
    the text before the slash is split into an amount (or a lower-upper range
    separated by '-') and its trailing unit character, and the text after the
    slash becomes the time unit. Otherwise those four fields stay ``None``.

    Returns:
        DataFrame with one row per newly seen posting.
    """
    url_template = 'https://search.51job.com/list/040000,000000,0000,00,9,99,%2520,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    records = []
    redis_client = RedisUtil().get_redis_instance()
    for page in range(1, 2000):
        soup = spider_util.open_url_return_bsobj(url_template.format(page=page))
        # The first '.el' element is dropped -- presumably the table header; confirm.
        for entry in soup.select('#resultList .el')[1:]:
            link = entry.select_one('a')
            job = link.get_text().strip()
            job_url = link.get('href')
            # sadd returns 0 when the URL is already in the set.
            if redis_client.sadd(redis_job_set, job_url) == 0:
                continue
            company = entry.select_one('.t2 a').get_text().strip()
            address = entry.select_one('.t3').get_text().strip()
            salary = entry.select_one('.t4').get_text().strip()

            upper = lower = unit = period = None
            if '/' in salary:
                pieces = salary.split('/')
                amount = pieces[0]
                unit = amount[-1]
                amount = amount[:-1]
                lower = upper = amount
                if '-' in amount:  # split the range into lower and upper bound
                    bounds = amount.split('-')
                    lower, upper = bounds[0], bounds[1]
                period = pieces[1]

            push_time = entry.select_one('.t5').get_text().strip()
            records.append({
                '职位名': job,
                '公司名': company,
                '工作地点': address,
                '薪资': salary,
                '发布时间': push_time,
                '职位详细URL': job_url,
                '金额上限': upper,
                '金额下限': lower,
                '时间单位': period,
                '金额单位': unit,
            })
        spider_util.log_progress(page, 2000, start_from_zero=False)

    return DataFrame(records)
Пример #8
0
def change_zb(filename):
    """Add Baidu and GCJ-02 coordinate columns to a CSV file, in place.

    Reads *filename* as an all-string frame, sends every row's ``LOG``/``LAT``
    pair to the internal ``local2wgs`` service, stores the returned pair in
    ``bd_x``/``bd_y`` and its ``coordinate_util.bd09togcj02`` conversion in
    ``gd_x``/``gd_y``, then overwrites *filename* with the enriched frame and
    prints it. Rows with a missing or NaN coordinate are skipped; a row whose
    conversion fails is reported and left with empty result columns.

    Args:
        filename: Path of the UTF-8 CSV file to read and overwrite.
    """
    # NOTE(review): session credentials are hard-coded here; consider loading
    # them from configuration instead of keeping them in source control.
    header = {'Cookie': 'BCE54B84-5407-41FD-9D16-C8A09E5DA2A0=YWRtaW4%3D; YWRtaW4==a2RpZiNzaWM4RGpbY216; JSESSIONID=1BA5932F6535DFDEAA2E63C9AAD3040C'}
    url = 'http://10.169.11.195:7020/tjfxpt/gis/local2wgs.xhtml'

    # Close the input handle before the file is rewritten below.
    with open(filename, "r", encoding='utf-8', newline='') as file:
        df = pd.read_csv(file, dtype=str)

    length = len(df.index)
    for column in ('bd_x', 'bd_y', 'gd_x', 'gd_y'):
        df[column] = None

    for x in range(length):
        try:
            raw_x = df['LOG'].iloc[x]
            raw_y = df['LAT'].iloc[x]
            if raw_x is None or raw_y is None:
                continue
            ABSX = float(raw_x)
            ABSY = float(raw_y)
            if math.isnan(ABSX) or math.isnan(ABSY):  # skip non-numeric rows
                continue
            html = spider_util.open_url(url, data={'lng': ABSX, 'lat': ABSY}, header=header)
            bsObj = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
            zb_arr = bsObj.get_text().strip().split(',')
            lon = float(zb_arr[0])  # Baidu longitude
            lat = float(zb_arr[1])
            # DataFrame.set_value was removed in pandas 1.0; .at replaces it.
            df.at[x, 'bd_x'] = lon
            df.at[x, 'bd_y'] = lat
            gcj02 = coordinate_util.bd09togcj02(lon, lat)  # Baidu -> GCJ-02 ("Mars") coordinates
            df.at[x, 'gd_x'] = gcj02[0]
            df.at[x, 'gd_y'] = gcj02[1]
            spider_util.log_progress(x, length, start_from_zero=True, detailedLog=True)
        except Exception as e:
            # Best effort: report the failing row and carry on with the next.
            print('跳过该条数据', repr(e))
    df.to_csv(filename, index=False, sep=',')
    print(df)