import re
import time

import requests
from lxml import etree

from tools.sqlconn import pool


def get_res(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
    }
    res = requests.get(url, headers=headers).json()
    items = res["data"]["items"]
    for i in items:
        e = etree.HTML(i["context"])
        proName2 = i["proName"]
        parentName1 = i["parentName"]
        title = i["title"]
        id = i["id"]
        lujing = e.xpath("//img[@alt]/@src")  # image src paths inside the article body
        programaId = i["programaId"]
        proParentID = i["proParentID"]
        # convert the "created" timestamp to epoch milliseconds
        created = int(time.mktime(time.strptime(i["created"], '%Y-%m-%d %H:%M:%S'))) * 1000
        for j in lujing:
            print(j)
            if "documents" in j:
                conn = pool.connection()  # take a connection from the pool whenever one is needed
                cur = conn.cursor()
                SQL = ('insert into huanjingdatadeal2_copy1 '
                       '(title,url,id_id,programaId,created,proParentID,documentsurl,parentName1,proName2) '
                       'values (%s,%s,%s,%s,%s,%s,%s,%s,%s)')
                cur.execute(SQL, (title, url, id, programaId, created, proParentID, j, parentName1, proName2))
                conn.commit()
                print("insert succeeded")
                cur.close()
                conn.close()
            elif "http://www.encollege.cn/imageFile/hjxx/" in j:
                # only record image URLs that are no longer reachable
                res1 = requests.get(j, headers=headers)
                if res1.status_code != 200:
                    conn = pool.connection()
                    cur = conn.cursor()
                    SQL = ('insert into huanjingdatadeal2_copy1 '
                           '(title,url,id_id,programaId,created,proParentID,documentsurl,parentName1,proName2) '
                           'values (%s,%s,%s,%s,%s,%s,%s,%s,%s)')
                    cur.execute(SQL, (title, url, id, programaId, created, proParentID, j, parentName1, proName2))
                    conn.commit()
                    print("insert succeeded")
                    cur.close()
                    conn.close()
            else:
                continue
def get_road(dataiter):
    # Extract the road name ("...路" / "...道" / "...街") that follows the district ("区") in the address.
    for jk in range(len(dataiter)):  # fix: the original range(0, len + 1) overruns the last index
        i = dataiter[jk]
        addresses = i[6]
        id = i[0]
        for suffix in ("路", "道", "街"):
            if suffix in addresses and "区" in addresses:
                road = re.findall("区(.*?)" + suffix, addresses)
                print(addresses)
                print(road)
                if not road:
                    break  # guard: "区" may appear after the suffix, leaving no match
                conn2 = pool.connection()
                cur2 = conn2.cursor()
                print(id, i[3], i[4])
                sql1 = 'UPDATE tb_tenxun_copy1 SET road="%s" WHERE id="%s"'
                cur2.execute(sql1 % ("{}{}".format(road[0], suffix), id))
                conn2.commit()
                print("update committed")
                cur2.close()
                conn2.close()
                break
def save_sql(data_processing_list, poi_id):
    isupdataid = 0
    for i in data_processing_list:
        try:
            print("===============i===========")
            print(i)
            conn = pool.connection()  # take a connection from the pool whenever one is needed
            cur = conn.cursor()
            SQL = ('insert into tb_tenxun (province,area,town,addresses,unit,longitude,latitude,cType) '
                   'values ("%s","%s","%s","%s","%s","%s","%s","%s")')
            cur.execute(SQL % (i["ad_info"]["province"], i["ad_info"]["district"], i["ad_info"]["city"],
                               i["address"], i["title"], i["location"]["lng"], i["location"]["lat"],
                               i["category"]))
            conn.commit()
            print("insert succeeded")
            cur.close()
            conn.close()
        except Exception as e:
            isupdataid = 1  # remember that at least one row failed so the poi is not marked as crawled
            print(e)
            print("insert failed")
    if isupdataid == 0:
        conn = pool.connection()
        cur = conn.cursor()
        SQL = 'UPDATE tenxun_poi SET isspider=1 WHERE id="%s"'
        cur.execute(SQL % (poi_id,))
        conn.commit()
        print("isspider flag updated")
        cur.close()
        conn.close()
    else:
        print("poi left unmarked because some rows failed")
def get_hao_l(dataiter):
    # Extract the house number ("...号") from the address.
    for jk in range(len(dataiter)):  # fix: iterate dataiter, not the module-level data, and drop the +1 overrun
        i = dataiter[jk]
        id = i[0]
        addresses = i[6]
        if ("号" in addresses and "-" not in addresses and "一" not in addresses
                and "、" not in addresses and "~" not in addresses):
            doorplate = re.findall(r"上海市.*?(\d+)号", addresses)
            if doorplate:
                conn2 = pool.connection()
                cur2 = conn2.cursor()
                sql1 = 'UPDATE tb_tenxun_copy1 SET doorplate="%s" WHERE id="%s"'
                cur2.execute(sql1 % ("{}号".format(doorplate[0]), id))
                conn2.commit()
                cur2.close()
                conn2.close()
        elif "-" in addresses:
            # ranges such as "12-18号"; skip addresses containing building/floor markers
            doorplate = re.findall(r"上海市.*?(\d+)-(\d+)号", addresses)
            if (doorplate and "N" not in addresses and "L" not in addresses
                    and "G" not in addresses and "F" not in addresses):
                if int(doorplate[0][0]) < int(doorplate[0][1]):
                    print(int(doorplate[0][0]))
                    print(int(doorplate[0][1]))
                    conn2 = pool.connection()
                    cur2 = conn2.cursor()
                    sql1 = 'UPDATE tb_tenxun_copy1 SET doorplate="%s" WHERE id="%s"'
                    cur2.execute(sql1 % ("{}-{}号".format(int(doorplate[0][0]), int(doorplate[0][1])), id))
                    conn2.commit()
                    cur2.close()
                    conn2.close()
            print(addresses)
            print(doorplate)
        else:
            continue
def select_poi_sql():
    # Fetch the POI categories that have not been crawled yet and return them as an iterator.
    conn = cur = None
    try:
        conn = pool.connection()  # take a connection from the pool whenever one is needed
        cur = conn.cursor()
        SQL = "select id,poi from tenxun_poi where isspider=0"
        cur.execute(SQL)
        addres = cur.fetchall()
        iter_address = iter(addres)
        return iter_address
    except Exception as e:
        print(e)
    finally:
        if cur:
            cur.close()
        if conn:
            conn.close()
def get_lane(dataiter):
    # Extract the lane number ("...弄") from the address.
    for jk in range(len(dataiter)):
        i = dataiter[jk]
        addresses = i[6]
        id = i[0]
        if "弄" in addresses:
            lane = re.findall(r".*?(\d+)弄", addresses)
            print(addresses)
            print(lane)
            if lane:
                conn2 = pool.connection()
                cur2 = conn2.cursor()
                print(id, i[3], i[4])
                sql1 = 'UPDATE tb_tenxun_copy1 SET lane="%s" WHERE id="%s"'
                cur2.execute(sql1 % ("{}弄".format(lane[0]), id))
                conn2.commit()
                print("update committed")
                cur2.close()
                conn2.close()
def get_address(dataiter):
    # Rebuild the address column by concatenating several existing columns when road and doorplate are present.
    for jk in range(len(dataiter)):
        i = dataiter[jk]
        id = i[0]
        road = i[5]
        doorplate = i[8]
        print(i)
        if road and doorplate:
            conn2 = pool.connection()
            cur2 = conn2.cursor()
            sql1 = 'UPDATE tb_tenxun_copy1 SET address="%s" WHERE id="%s"'
            cur2.execute(sql1 % ("{}{}{}{}".format(i[3], road, i[6], doorplate), id))
            conn2.commit()
            print("update committed")
            cur2.close()
            conn2.close()
def get_village(dataiter):
    # Extract the town name ("...镇") from column 16 and store it in the village column.
    for jk in range(len(dataiter)):
        i = dataiter[jk]
        id = i[0]
        road = i[16]
        if "镇" in road:
            village = re.findall("(.*?)镇", road)
            print(road)
            print(village)
            if village:
                conn2 = pool.connection()
                cur2 = conn2.cursor()
                sql1 = 'UPDATE tb_tenxun_copy1 SET village="%s" WHERE id="%s"'
                cur2.execute(sql1 % ("{}镇".format(village[0]), id))
                conn2.commit()
                print("update committed")
                cur2.close()
                conn2.close()
def get_res_and_save(parms_list):
    for parms in parms_list:  # renamed from i to avoid shadowing the inner loop variable
        time.sleep(3)  # throttle requests
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
        }
        url = "https://www.encollege.cn/gwapi/article/find"
        res = requests.get(url, headers=headers, params=parms).json()
        items = res["data"]["items"]
        for i in items:
            context = i["context"]
            e = etree.HTML(str(context))
            proName2 = i["proName"]
            parentName1 = i["parentName"]
            title = i["title"]
            id = i["id"]
            lujing = e.xpath("//img[@alt]/@src")
            programaId = i["programaId"]
            proParentID = i["proParentID"]
            createdlocal = i["created"]
            created = int(time.mktime(time.strptime(i["created"], '%Y-%m-%d %H:%M:%S'))) * 1000
            for j in lujing:
                print(j)
                if "documents" in j:
                    # image paths look like /documents/<firstnum>/<secondnum>...&t=<unix ms>
                    firstnum = re.findall(r"/documents/(\d+)/\d+", j)[0]
                    secondnum = re.findall(r"/documents/\d+/(\d+)", j)[0]
                    if "t=" in j:
                        tunix = j.split("t=")[1]
                        tunixlocal = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime(int(int(tunix) / 1000)))
                    else:
                        tunix = None
                        tunixlocal = None
                    conn = pool.connection()  # take a connection from the pool whenever one is needed
                    cur = conn.cursor()
                    SQL = ('insert into hjxx_home_copy1 '
                           '(title,url,id_id,programaId,created,proParentID,documentsurl,parentName1,proName2,'
                           'firstnum,secondnum,timeunix,timeunixlocal,createdlocal) '
                           'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
                    cur.execute(SQL, (title, url, id, programaId, created, proParentID, j,
                                      parentName1, proName2, firstnum, secondnum, tunix,
                                      tunixlocal, createdlocal))
                    conn.commit()
                    print("insert succeeded")
                    cur.close()
                    conn.close()
                else:
                    continue
# Load every processed row once and pass it to the helpers above.
conn = pool.connection()
cur1 = conn.cursor()
SQL = 'select * from tb_tenxun_copy1'
cur1.execute(SQL)
data = cur1.fetchall()
dataiter = data
cur1.close()
conn.close()

# split out the house number
# get_hao_l(dataiter)
# split out the road
# get_road(dataiter)
# split out the lane
# get_lane(dataiter)
# split out the town
# get_village(dataiter)
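# The get_* helpers above all repeat the same connect / string-format / execute / commit / close
# pattern with values interpolated directly into the SQL text. Below is a minimal sketch of a
# shared, parameterized update helper; the name update_column and its reuse of the same pool are
# assumptions for illustration, not part of the original scripts.
def update_column(table, column, value, row_id):
    # Let the driver bind %s placeholders so quotes in `value` cannot break or inject into the SQL.
    conn = pool.connection()
    cur = conn.cursor()
    # Table and column names cannot be bound as parameters, so they are formatted in; both are trusted here.
    sql = "UPDATE {} SET {}=%s WHERE id=%s".format(table, column)
    cur.execute(sql, (value, row_id))
    conn.commit()
    cur.close()
    conn.close()

# Hypothetical usage: update_column("tb_tenxun_copy1", "road", "某某路", 1)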
import requests
from lxml import etree

from tools.sqlconn import pool

url = "https://lbs.qq.com/service/webService/webServiceGuide/webServiceAppendix"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
}
res = requests.get(url, headers=headers)
etreepoi = etree.HTML(res.text)
# grab the POI category codes from the third column of the appendix table
poilist = etreepoi.xpath(
    '//*[@id="__layout"]/div/div[2]/div/div[2]/div/div/article/div/div/table/tbody/tr/td[3]/text()'
)
print(poilist)
for i in poilist:
    conn2 = pool.connection()
    cur2 = conn2.cursor()
    sql1 = 'insert into tenxun_poi (poi) values ("%s")'
    cur2.execute(sql1 % (i,))
    conn2.commit()
    print("insert committed")
    cur2.close()
    conn2.close()
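# All of these scripts import `pool` from tools.sqlconn, which is not shown here. Below is a
# minimal sketch of what tools/sqlconn.py might look like, assuming DBUtils' PooledDB on top of
# pymysql; the host, credentials, and database name are placeholders, not values from the source.
import pymysql
from dbutils.pooled_db import PooledDB  # DBUtils >= 2.0; older releases expose DBUtils.PooledDB

pool = PooledDB(
    creator=pymysql,      # driver used to create the underlying connections
    maxconnections=10,    # cap on simultaneously open connections
    host="127.0.0.1",     # placeholder host
    port=3306,
    user="root",          # placeholder credentials
    password="password",
    database="spider",    # placeholder database name
    charset="utf8mb4",
)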