import os
import time
import random
import pandas as pd

def get_citycode():
    # Load the cached city-code table if it exists; otherwise scrape and cache it.
    if os.path.exists("./去哪儿网城市编码.xlsx"):
        citycode = pd.read_excel("./去哪儿网城市编码.xlsx")
    else:
        # Fetch the page that lists all destinations
        url = "https://travel.qunar.com/place/"
        content = get_html(url, d_c=cookies)
        # Parse the city codes into a DataFrame
        citycode = pd.DataFrame(get_citydata(content))
        # Cache the table locally
        citycode.to_excel("./去哪儿网城市编码.xlsx", index=False)
        print(">> City codes saved locally.")
    return citycode
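# The helpers get_html and get_citydata are called above but not defined in this
# section. A minimal sketch of get_html, assuming it wraps requests and passes the
# raw cookie string via d_c (an illustration only, not the original implementation):
import requests

def get_html(url, d_c=None):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    if d_c:
        headers["Cookie"] = d_c  # d_c carries the raw cookie string copied from the browser
    r = requests.get(url, headers=headers, timeout=10)
    r.encoding = r.apparent_encoding  # let requests guess the page encoding
    return r.text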
if __name__ == "__main__":
    cityname = "东莞"
    cookies = "QN1=ezu0qF02oOtLLIGnE+84Ag==; QN205=organic; QN277=organic; _i=VInJOW3UnofqCZzxZx5MgIrJ_LJq; QN269=3D52BA51AD0E11E9B3C2FA163E72396C; Hm_lvt_c56a2b5278263aa647778d304009eafc=1563861232; fid=5d9fa9b8-8e57-4997-8bde-b56d832bbe8e; QN25=891ada9c-95c6-4e0e-9610-985e70868ba2-9f992f90; QN42=blyh4457; _q=U.pljqewu7871; _t=26173796; csrfToken=0kDhwxTekjtQp7cgwHUcODFphnnricSO; _s=s_YBKAEJ3QQXNZVIAO6EVPLXZHYQ; _v=AE723kHu7Q-xLQuOqnsHYRQIOX1Hsi2ROUduJTZkfwDbLMeRs-WN75Smkc_ExAXGZOXxHxGj7oUCuIuhswFtnPh-REznJt2Aw0HuKmSSk_IyRaiAYb9sCcnZlaRsA9frKIl3z0-oMPW5FiFhn8FpcTzY8-_3_NLfkM5QcQD4GfSo; QN44=pljqewu7871; QN48=tc_47403cdd02d5b2e6_16c1d66b463_48d5; QN99=1870; QN300=organic; QunarGlobal=10.86.213.150_47afbcc5_16c1d64a5f9_-41a7|1563861389127; QN601=828f3bfeea4088d0e6f16dafc7dd5202; QN163=0; QN667=B; quinn=529dec6368206af22e224d559f84eff115602568b4b4f17168530d3d23856ba418f7699858027462d408925c769fe5c1; activityClose=1; QN100=WyLkuIrmtbd8Il0%3D; QN243=12; QN57=15638615782980.36575468791369037; QN58=1563861578295%7C1563861578295%7C1; QN5=qunar_djmp_gnmdd_%E4%B8%8A%E6%B5%B7; viewdist=299878-8|299914-1|300195-3; uld=1-300195-3-1563862578|1-299914-1-1563862215|1-299878-8-1563862200; QN267=05269388050307847e; _vi=CAT-Ndpyg17SE1skcsEfF15jNZNIabnBugntln1LuvV-sfJK-XLGMRgN4RiehpxJaQdWd0RvVmVQUeFxm-xuFquqfg5QhMR3qxJh56P1Xn38ig3xG-5RawJm8tnsw-oQwptbAGQn7B3ASjzwIvMMf4_pstJ0Nj5Cq4yxSnHMyiBm; Hm_lpvt_c56a2b5278263aa647778d304009eafc=1563862582; QN271=e159d5b6-a2bb-479c-828b-d1e335604d08"
    # Fetch the city list
    citycode = get_citycode()
    # Build the listing-page URLs to crawl (here: 50 pages for the chosen city)
    beginurls = get_urls(citycode, cityname=cityname, n=50)
    # Crawl each listing page
    infos = []
    errorurl = []  # URLs of pages that failed to fetch or parse
    for page, ui in enumerate(beginurls):
        time1 = time.time()
        try:
            content = get_html(ui, d_c=cookies)
            infos.extend(get_data(content))
            time2 = time.time()
            print(">> Scrape succeeded, %i records collected in total, took %.2fs" % (len(infos), time2 - time1))
        except Exception:
            print(">> Scrape failed, URL:", ui)
            # print("Page content:\n", content)
            errorurl.append(ui)
            time.sleep(random.random() * 10)  # back off longer after a failure
        time.sleep(random.random())
    # Assemble the records
    data = pd.DataFrame(infos)
    # Clean the data: fill missing numeric fields with 0
    for i in ["评分", "排名", "攻略提到数量", "点评数量"]:
        data[i] = data[i].fillna(0)
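    # Illustrative follow-up (an assumption, not in the original): coerce the cleaned
    # columns to numeric and cache the result locally, mirroring how get_citycode
    # caches the city codes above. The output filename is hypothetical.
    for i in ["评分", "排名", "攻略提到数量", "点评数量"]:
        data[i] = pd.to_numeric(data[i], errors="coerce").fillna(0)
    data.to_excel("./去哪儿网景点数据_%s.xlsx" % cityname, index=False)
    print(">> Attraction data saved locally.")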
# --- second script: image crawler for 哪吒之魔童降世; this is the tail of
# get_data(title, page, content), which collects one record per <li> image ---
            picdic = {}
            picdic["picname"] = "%s%d-%d" % (title, page, i + 1)
            picdic["picsrc"] = li.find("img")["src"]
            lst.append(picdic)
    return lst

if __name__ == "__main__":
    beginurls = get_urls()
    # Crawl each listing page
    infos = []
    errorurl = []  # URLs of pages that failed to fetch or parse
    for page, ui in enumerate(beginurls):
        time1 = time.time()
        try:
            content = get_html(ui)
            infos.extend(get_data("哪吒之魔童降世", page + 1, content))
            time2 = time.time()
            print(">> Scrape succeeded, %i records collected in total, took %.2fs" % (len(infos), time2 - time1))
        except Exception:
            print(">> Scrape failed, URL:", ui)
            # print("Page content:\n", content)
            errorurl.append(ui)
            time.sleep(random.random() * 10)  # back off longer after a failure
        time.sleep(random.random())
    # Save the images
    n = 1
    for picdic in infos:
        time1 = time.time()
        try:
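            # Sketch completion (an assumption; the original body is cut off here):
            # download the image bytes and write them to disk under the picname
            # built in get_data. The .jpg extension is hypothetical.
            r = requests.get(picdic["picsrc"], timeout=10)
            with open("./%s.jpg" % picdic["picname"], "wb") as f:
                f.write(r.content)
            time2 = time.time()
            print(">> Image %d saved: %s, took %.2fs" % (n, picdic["picname"], time2 - time1))
            n += 1
        except Exception:
            print(">> Image download failed:", picdic["picsrc"])
            errorurl.append(picdic["picsrc"])
        time.sleep(random.random())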