def main():
    """Crawl per-stock financial tables for every listed company, then dump
    the accumulated module-level result lists to CSV files.

    Reads the company list from '上市企业信息.csv', calls doRequest() once per
    stock code (presumably appending rows to the module-level lists
    main_economic_indicators / profitability / solvency / cost — TODO confirm),
    and finally writes each list out with its header row from table_maping.
    """
    directory = 'D:\\pypy\\pythonresult\\上市企业\\'
    companies, csvHead = spider_util.readCSV2List(join(directory, '上市企业信息.csv'))
    total = len(companies)  # hoisted out of the loop
    for i, company in enumerate(companies):
        # NOTE(review): the original also read company['stockType'] into an
        # unused local; dropped.
        doRequest(company['StockCode'])
        time.sleep(0.2)  # throttle requests
        spider_util.log_progress(i, total, start_from_zero=True)
    # Persist the accumulated tables; table_maping[n] supplies the CSV header row.
    DataFrame(main_economic_indicators).to_csv(join(directory, '主要经济指标.csv'), header=table_maping[0], index=False)
    DataFrame(profitability).to_csv(join(directory, '盈利能力.csv'), header=table_maping[1], index=False)
    DataFrame(solvency).to_csv(join(directory, '偿债能力.csv'), header=table_maping[2], index=False)
    DataFrame(cost).to_csv(join(directory, '成本费用.csv'), header=table_maping[3], index=False)
def query_location(df: DataFrame):
    """Fill the '号码归属地' (number location) column of *df* in place.

    For each row with a non-empty '格式化电话' (formatted phone number), looks
    the location up via query_mobile_phone_location() when the '号码类型'
    column says '手机号' (mobile), otherwise via query_telphone_location().

    :param df: DataFrame with a default integer index (rows are addressed
               with df.at[i, ...]) — TODO confirm at call sites.
    """
    df['号码归属地'] = None
    length = len(df)
    for i in range(length):
        format_tel = df.at[i, '格式化电话']
        tel_type = df.at[i, '号码类型']  # renamed from `type`: don't shadow the builtin
        if format_tel is None or format_tel == '':
            continue  # nothing to look up; note progress is not logged for skipped rows
        if tel_type == '手机号':
            text = query_mobile_phone_location(format_tel)
        else:
            text = query_telphone_location(format_tel)
        df.at[i, '号码归属地'] = text
        spider_util.log_progress(i, length)
def query_location(df: DataFrame):
    """Write company/phone rows into the module-level xlsxwriter *worksheet*,
    inserting a downloaded location picture for each formatted phone number,
    then close the module-level *test_book* workbook.

    NOTE(review): this redefines query_location and silently replaces the
    earlier definition of the same name in this module — rename one of them.
    (The earlier per-row lookup loop that was kept here as a large
    commented-out block has been deleted as dead code.)
    """
    df['号码归属地'] = None
    length = len(df)
    worksheet.write_row(0, 0, ['企业名称', '企业电话', '电话归属地', '格式化后的电话'])
    for i, row in df.iterrows():
        try:
            index = i + 1  # worksheet row 0 is the header
            company = row['纳税人名称']
            tel = row['财务固话']
            format_tel = row['格式化电话']
            worksheet.write(index, 0, company)
            worksheet.write(index, 1, tel)
            if format_tel is None or format_tel == '':
                continue
            # Retry the picture download until the connection succeeds.
            while True:
                try:
                    pic_path = download_pic(format_tel)
                    break
                except Exception as e:
                    print('发生连接错误,睡眠一段时间后后尝试重新连接')
                    print(repr(e))
                    time.sleep(120)  # back off before retrying
            if pic_path is not None and not pic_path == '':
                worksheet.insert_image(index, 2, pic_path)
            worksheet.write(index, 3, format_tel)
            spider_util.log_progress(i, length)
            sleep_time = random.randint(2, 4)  # fixed typo: was `sellp_time`
            time.sleep(sleep_time)  # random pause between rows
            if i >= 2000:  # hard cap on processed rows
                break
        except Exception as e:
            print('发生异常信息,跳过该号码', repr(e))
            continue
    test_book.close()
def address_format(table, lonField, latField):
    """Reverse-geocode every row of DB table *table* and store the 'town'
    component in its STREET column, then dump the frame to a desktop CSV.

    :param table: table name (interpolated into SQL — must come from trusted
                  code, never from user input).
    :param lonField: column holding the longitude (gcj02ll).
    :param latField: column holding the latitude (gcj02ll).
    """
    sql = "select * from " + table
    delete_sql = "delete from " + table  # kept for the commented-out delete below
    df = db_util.execute2Dataframe(sql)
    length = len(df)
    for i in range(length):
        lon = df.at[i, lonField]
        lat = df.at[i, latField]
        # Bug fix: validate BEFORE converting — the original called float()
        # first, which raises on None/'' instead of skipping the row.
        if lon is None or lon == '' or lat is None or lat == '':
            continue
        lon = float(lon)
        lat = float(lat)
        if math.isnan(lon) or math.isnan(lat):  # bug fix: original only checked lon
            continue
        addressComponent = address_standardization.location2normaladdress(lon, lat, coordtype='gcj02ll')
        df.at[i, 'STREET'] = addressComponent['town']
        spider_util.log_progress(i, length, detailedLog=True)
    # db_util.delete(delete_sql)
    df.to_csv('C:\\Users\\admin\\Desktop\\' + table + '.csv', index=False, sep=',')
def format():
    """Geocode every CONST_LOCATION of T_OPEN_SGXKZXX, write the parsed
    district/street/coordinate components back into the frame, and export
    the result to an Excel file.

    NOTE(review): the name shadows the builtin format(); kept unchanged
    because callers may rely on it.
    """
    df = db_util.execute2Dataframe('select * from T_OPEN_SGXKZXX ')
    dflen = len(df.index)  # total number of rows
    for x in range(dflen):
        addr = df['CONST_LOCATION'].iloc[x]
        try:
            addressComponent = address_standardization.formatAddress(addr)
            # DataFrame.set_value was deprecated in pandas 0.21 and removed
            # in 1.0; .at is the supported scalar setter.
            df.at[x, 'QU'] = addressComponent['district']
            df.at[x, 'STREET'] = addressComponent['town']
            df.at[x, 'DL'] = addressComponent['street']
            df.at[x, 'BD_X'] = addressComponent['bd_x']
            df.at[x, 'BD_Y'] = addressComponent['bd_y']
            df.at[x, 'LON84'] = addressComponent['lon84']
            df.at[x, 'LAT84'] = addressComponent['lat84']
        except Exception as e:
            print('地址转换错误:', addr, e)
        spider_util.log_progress(x, dflen)
    print(df)
    df.to_excel('D:\\011111111111111111111111\\00临时文件\\T_OPEN_SGXKZXX.xlsx', index=False)
def query_location2(df: DataFrame):
    """Fill the '号码归属地' column of *df* in place by scraping 00cha.com
    for each row's formatted phone number.
    """
    df['号码归属地'] = None
    length = len(df)
    for i in range(length):
        format_tel = df.at[i, '格式化电话']
        if format_tel is None or format_tel == '':
            continue
        url = 'https://www.00cha.com/114.asp?t=' + format_tel
        # 20 s timeout. The page declares gb2312 but uses gbk-only characters,
        # which makes BeautifulSoup mis-detect windows-1252 — force gbk.
        bsObj = spider_util.open_url_return_bsobj(url, 5, 20, from_encoding='gbk')
        tags = bsObj.find_all('font', {'size': 4})
        if not tags:  # bug fix: find_all returns a list, never None — the old `is None` check was dead
            tags = bsObj.find_all('font', {'color': '#008080'})
        # Bug fix: the original started from None and did `text + ' ' + ...`,
        # which raises TypeError on the first tag; join the texts instead.
        parts = [tag.get_text().strip() for tag in tags]
        text = ' '.join(parts) if parts else None
        if format_tel.startswith('0769'):
            text = '广东 东莞'  # 0769 area code is always Dongguan, Guangdong
        df.at[i, '号码归属地'] = text
        spider_util.log_progress(i, length)
def get_list():
    """Scrape 51job listing pages and return the jobs as a DataFrame.

    Deduplicates by job-detail URL through a Redis set (redis_job_set) and
    parses the salary string ("X-Y万/月" style) into amount range, amount
    unit and time unit.
    """
    data = []
    r = RedisUtil().get_redis_instance()
    for i in range(1, 2000):
        # Bug fix: the query string contained '99°reefrom=99' — the '&deg'
        # of '&degreefrom' had been mangled into the '°' character.
        url = ('https://search.51job.com/list/040000,000000,0000,00,9,99,%2520,2,{page}.html'
               '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
               '&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0'
               '&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00'
               '&from=&welfare=').format(page=i)
        bsobj = spider_util.open_url_return_bsobj(url)
        div_tags = bsobj.select('#resultList .el')[1:]  # first .el is the header row
        for div in div_tags:
            job = div.select_one('a').get_text().strip()
            job_url = div.select_one('a').get('href')
            # sadd returns 0 when the URL is already in the set: seen before, skip.
            if r.sadd(redis_job_set, job_url) == 0:
                continue
            company = div.select_one('.t2 a').get_text().strip()
            address = div.select_one('.t3').get_text().strip()
            salary = div.select_one('.t4').get_text().strip()
            money_toplimit = None
            money_lowerlimit = None
            money_unit = None
            time_unit = None
            if '/' in salary:
                money_range = salary.split('/')[0]
                money_unit = money_range[-1]   # trailing character, e.g. 万 / 千
                money_range = money_range[:-1]
                money_toplimit = money_range
                money_lowerlimit = money_range
                if '-' in money_range:  # split the range into lower/upper bounds
                    money_lowerlimit = money_range.split('-')[0]
                    money_toplimit = money_range.split('-')[1]
                time_unit = salary.split('/')[1]
            push_time = div.select_one('.t5').get_text().strip()
            data.append({'职位名': job, '公司名': company, '工作地点': address,
                         '薪资': salary, '发布时间': push_time, '职位详细URL': job_url,
                         '金额上限': money_toplimit, '金额下限': money_lowerlimit,
                         '时间单位': time_unit, '金额单位': money_unit})
        spider_util.log_progress(i, 2000, start_from_zero=False)
    return DataFrame(data)
def change_zb(filename):
    """Convert each row's local LOG/LAT coordinates to Baidu (bd_x/bd_y) and
    GCJ-02 (gd_x/gd_y) coordinates via the local2wgs service and write the
    frame back over *filename*.
    """
    header = {'Cookie': 'BCE54B84-5407-41FD-9D16-C8A09E5DA2A0=YWRtaW4%3D; YWRtaW4==a2RpZiNzaWM4RGpbY216; JSESSIONID=1BA5932F6535DFDEAA2E63C9AAD3040C'}
    url = 'http://10.169.11.195:7020/tjfxpt/gis/local2wgs.xhtml'
    with open(filename, "r", encoding='utf-8', newline='') as file:
        df = pd.read_csv(file, dtype=str)
    length = len(df.index)
    df['bd_x'] = None
    df['bd_y'] = None
    df['gd_x'] = None
    df['gd_y'] = None
    for x in range(length):
        try:
            ABSX = df['LOG'].iloc[x]
            ABSY = df['LAT'].iloc[x]
            # Bug fix: the original tested `ABSY is None or ABSY is None`,
            # never checking ABSX.
            if ABSX is None or ABSY is None:
                continue
            ABSX = float(ABSX)
            ABSY = float(ABSY)
            if math.isnan(ABSX) or math.isnan(ABSY):  # skip non-numeric rows
                continue
            html = spider_util.open_url(url, data={'lng': ABSX, 'lat': ABSY}, header=header)
            bsObj = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
            zb_arr = bsObj.get_text().strip().split(',')
            lon = float(zb_arr[0])  # Baidu longitude
            lat = float(zb_arr[1])  # Baidu latitude
            # DataFrame.set_value was removed in pandas 1.0; use .at.
            df.at[x, 'bd_x'] = lon
            df.at[x, 'bd_y'] = lat
            gcj02 = coordinate_util.bd09togcj02(lon, lat)  # Baidu -> GCJ-02 ("Mars") coords
            df.at[x, 'gd_x'] = gcj02[0]
            df.at[x, 'gd_y'] = gcj02[1]
            spider_util.log_progress(x, length, start_from_zero=True, detailedLog=True)
            # time.sleep(0.04)
        except Exception as e:
            print('跳过该条数据', repr(e))  # log the reason instead of swallowing it silently
    df.to_csv(filename, index=False, sep=',')
    print(df)