def url():
    """Return the HTML text of the music.taihe.com home page.

    The request headers are obtained from ``sj_User_Agent()``.  Nothing is
    caught here, so any exception raised by the request propagates to the
    caller.
    """
    home_page = "http://music.taihe.com/"
    return requests.get(home_page, headers=sj_User_Agent()).text
def wai():
    """Fetch the 58.com rental listing page and return its HTML text.

    The request is retried in an endless loop: whenever anything in an
    attempt raises, the error is printed and a new attempt is made with a
    new call to ``sj_User_Agent()``.  On success the page is also saved to
    ``租房.html`` in the current working directory.

    Returns:
        The response body as text.
    """
    url = "https://erds.58.com/zufang/"
    while True:
        try:
            response = requests.get(url, headers=sj_User_Agent())
            print(response)
            # Actually write the page: opening the file in "w" mode and
            # returning straight away would leave an empty file behind.
            with open("租房.html", "w", encoding="utf-8") as file:
                file.write(response.text)
            return response.text
        except Exception as e:
            # Best-effort crawler: report the failure and try again.
            print("这个请求头不可用!!!%s" % (e))
def w():
    """Download the sc.chinaz.com home page and return it decoded as UTF-8.

    Attempts are repeated without limit; each attempt calls
    ``sj_User_Agent()`` again for its headers, and a failed attempt only
    prints a fixed message before the next one starts.
    """
    target = "http://sc.chinaz.com/"
    while True:
        try:
            page = requests.get(target, headers=sj_User_Agent())
            # Force UTF-8 so that .text is decoded consistently.
            page.encoding = "utf-8"
            return page.text
        except Exception:
            print("这个请求头不可用!!!")
# Crawl the rental listing: for each pass, parse the listing page returned by
# wai(), collect the detail-page links, download every detail page and hand
# its HTML to z().
cursor = db.cursor()
# NOTE(review): this name shadows the builtin `list` for the rest of the
# module; we() appends to a module-level `list`, presumably this one - confirm.
list = []
t = 0  # running count of detail pages downloaded so far
for j in range(1,30):
    print("第%s页开始"%(j))
    # NOTE(review): wai() takes no page argument, so every pass appears to
    # re-fetch the same listing URL; j is only used in the message above.
    yifile = etree.HTML(wai())
    # href of every listing thumbnail link on the page
    yifile = yifile.xpath('//li[@class="house-cell"]/div[@class="img-list"]/a/@href')
    print(yifile,len(yifile))
    for i in yifile:
        # Keep retrying this detail page until one request succeeds.
        while True:
            # Random 1-5 second pause before every attempt.
            sj = random.randint(1,5)
            time.sleep(sj)
            try:
                murl = i
                erresponse = requests.get(murl, headers=sj_User_Agent())
                erresponse.encoding = "utf-8"
                print(erresponse)
                # From here on erresponse holds the page text, not the response.
                erresponse = erresponse.text
                break
            except Exception as e:
                print("这个<二级>请求头不可用!!!%s"%(e))
        t += 1
        # NOTE(review): `filename` is not assigned anywhere above this point in
        # the visible code - confirm it is defined before this loop runs.
        z(filename, erresponse, t)
def we(j):
    """Crawl the category pages linked from the sc.chinaz.com home page.

    The category URLs are taken from ``url_l(w())[0]``.  For each of the
    first nine categories the page is downloaded, the thumbnail attributes
    are extracted with that category's XPath, and the result is

    * appended to the module-level ``list`` as a dict,
    * written to ``素材.csv`` through ``bccsv``,
    * inserted into table ``sc`` through the module-level ``cursor``.

    After the loop everything collected in ``list`` is dumped to
    ``素材.json`` and the number of crawled pages is printed.

    Args:
        j: Multiplier applied to the page count for the ``"total"`` field
            of the JSON file.
    """
    # (column name, XPath) for each category, in the order the categories
    # are expected to appear in url_l(w())[0].
    category_xpaths = (
        ("矢量",
         '//div//div[@class="text_left"]//div[@class="box picblock col3"]/div/a/img/@src2'),
        ("高清图片",
         '//div[@id="container"]/div[@class="box picblock col3"]/div/a/img/@src2'),
        ("图标",
         '//ul[@class="pngblock imgload"]/li/p/a/img/@src2'),
        ("PSD素材",
         '//div[@class="box col3 ws_block"]/a/img/@src'),
        ("字体",
         '//div//div[@class="index_font_list clearfix"]//li[@class="font"]/div/a/img/@src'),
        ("英文字体",
         '//li[@class="font"]/div/a/img/@src'),
        ("音效",
         '//div[@class="music_block"]/p[@class="n1"]/@thumb'),
        ("PPT模板",
         '//div[@class="sc_warp mt20"]/div[@id="main"]/div/div/a/img/@src'),
        ("简历模板",
         '//div[@class="sc_warp mt20"]/div[@id="main"]/div/div/a/img/@src'),
    )
    filename = "素材.csv"

    # Fetch and parse the home page once instead of on every use.
    category_urls = url_l(w())[0]

    t = 0
    # zip() stops after the nine known categories, so an extra link without
    # a selector can no longer reach ",".join() with None.
    for urll, (_, selector) in zip(category_urls, category_xpaths):
        t += 1
        time.sleep(random.randint(1, 3))
        mresponse = requests.get(urll, headers=sj_User_Agent())
        mresponse.encoding = "utf-8"
        mfile = etree.HTML(mresponse.text)
        found = mfile.xpath(selector)

        # NOTE(review): every column receives the current page's results, as
        # before - confirm that one row per category with identical columns
        # is really the intended layout.
        dic = {column: found for column, _ in category_xpaths}
        # `list` here is the module-level results list, not the builtin.
        list.append(dic)
        bccsv(filename, dic)

        joined = ",".join(found)
        print(joined, type(joined))
        # NOTE(review): scraped text is interpolated straight into the SQL, so
        # a quote in a value breaks the statement.  Switch to the driver's
        # parameter binding once the DB-API paramstyle of `db` is confirmed.
        sql = "insert into sc (矢量,高清图片,图标,PSD素材,字体,英文字体,音效,PPT模板,简历模板) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
            (joined,) * 9)
        cursor.execute(sql)

    with open("素材.json", "w", encoding="utf-8") as json_file:
        json.dump({
            "total": t * j,
            "data": list
        }, json_file, ensure_ascii=False, indent=4)
    print("爬取%s条" % (t))
def url():
    """Fetch the music.taihe.com home page and return the response text.

    Headers are taken from ``sj_User_Agent()``; request errors are not
    handled here and propagate to the caller.
    """
    request_headers = sj_User_Agent()
    page = requests.get("http://music.taihe.com/", headers=request_headers)
    return page.text
list = [] filename = "音乐.csv" header_csv = [ '最热图片地址', '最热热度', '最热歌曲名', '最热来自', '最热类型', '最新图片地址', '最新热度', '最新歌曲名', '最新来自', '最新类型' ] with open(filename, "w", encoding="utf-8-sig", newline="") as csv_file: csv_file = csv.DictWriter(csv_file, header_csv) csv_file.writeheader() for i in range(1, 180): print("第%s页开始" % (i)) if i == 1: sj = random.randint(1, 10) yiurl = "http://music.taihe.com%s" % (r) try: yiheader = sj_User_Agent() time.sleep(sj) yiresponse = requests.get(yiurl, headers=yiheader) yiresponse.encoding = "utf-8" print(yiresponse) cs(yiresponse.text, i) except Exception as e: print("这个请求头不可用!!!%s" % (e)) else: sj = random.randint(1, 5) s = 20 * (i - 1) yiurl = "http://music.taihe.com/songlist/tag/%E5%85%A8%E9%83%A8?orderType=1&third_type=" page = {"offset": s} try: yiheader = sj_User_Agent() time.sleep(sj)