def insert_cate(url):
    # Insert book categories (major/min pairs per gender) returned by the API.
    r = api(url)
    i = 1
    for gene in r.keys():
        if gene != 'ok':
            for cate in r[gene]:
                if cate['mins']:
                    for item in cate['mins']:
                        with session_scope() as sqlsession:
                            bc = BookCategory(id=None, category_major=cate['major'],
                                              category_min=item, male_female=gene,
                                              time_created=round(time.time()),
                                              status=1, cover='', sort=i)
                            sqlsession.add(bc)
                            i = i + 1
                else:
                    with session_scope() as sqlsession:
                        bc = BookCategory(id=None, category_major=cate['major'],
                                          category_min="", male_female=gene,
                                          time_created=round(time.time()),
                                          status=1, cover='', sort=i)
                        sqlsession.add(bc)
                        i = i + 1
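# Hedged sketch (not part of the original source): every function in this
# collection assumes a session_scope() context manager defined elsewhere.
# A typical SQLAlchemy implementation is shown below; the engine URL and the
# Session factory name are assumptions for illustration only.
from contextlib import contextmanager

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("mysql+pymysql://user:password@localhost/spiderdb")  # assumed connection string
Session = sessionmaker(bind=engine)


@contextmanager
def session_scope():
    """Provide a transactional scope: commit on success, roll back on error."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()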
def zengliang_back():
    # Incremental backup: copy rows newer than the last remote id to the remote DB.
    atts = globals()
    for item1 in query_map.keys():
        table = atts.get(item1)
        session_remote = session_sql_remote()
        ms = session_remote.query(table).order_by(table.id.desc()).first()
        with session_scope() as sess1:
            if not ms:
                dd = [0]
            else:
                res = sess1.execute(query_map.get(item1).format(**ms.__dict__))
                dd = []
                for row in res.fetchall():
                    dd.append(row[0])
            id_new = max(dd)
            if len(dd) >= 2:
                print("duplicates found {} {}".format(item1, dd))
            ms = sess1.query(table).filter(table.id > id_new).all()
            count = 0
            for item in ms:
                print(item.__dict__)
                temp = item.__dict__
                temp["id"] = None
                temp.pop("_sa_instance_state")
                ta = table(**temp)
                count = count + 1
                session_remote.add(ta)
                if count % 1000 == 0:
                    session_remote.commit()
            session_remote.commit()
def get_phone(self):
    # Fill in phone/district for TuNiu hotels that have no phone yet.
    url = "http://hotel.tuniu.com/ajax/getHotelStaticInfo?id={}&checkindate={tomorrow}&checkoutdate={aftert}"
    count = 1
    # res = []
    today = datetime.date.today()
    with session_scope() as sess2:
        tn = sess2.query(TuNiu).filter(TuNiu.phone == None).all()
        for item in tn:
            hotel_id = item.url.split("/")[-1].strip()
            count = count + 1
            # if count < 39:
            #     continue
            # res.append(hotel_id)
            # try:
            #     with futures.ProcessPoolExecutor(max_workers=10) as executor:
            #         for item in executor.map(self.sub_get_phone, res):
            #             print(item)
            # except KeyboardInterrupt:
            #     exit(0)
            r = self.session.get(url.format(
                hotel_id,
                tomorrow=today + datetime.timedelta(days=1),
                aftert=today + datetime.timedelta(days=2)))
            # count = count + 1
            try:
                temp = r.json()
                item.phone = temp.get("data").get("hotel").get("tel")
                item.district = temp.get("data").get("hotel").get("districtName")
                sess2.commit()
            except AttributeError as e:
                print(hotel_id, e)
                if "list" in str(e):
                    continue
                else:
                    raise e
            print(temp.get("data").get("hotel").get("tel"))
def _p_list(self, url, category):
    # Parse one huangye88 list page and store new HuangYe records.
    time.sleep(0.3)
    if "pn1/" in url:
        url = url[:url.find("pn1/")]
    print("list url {}".format(url))
    self.session.headers["Host"] = "b2b.huangye88.com"
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    form = soup.find("form", {"id": "jubao"})
    mas = form.find_all("dl")
    for dl in mas:
        res = {}
        res["category"] = category
        a1 = dl.find("a", {"rel": "nofollow"})
        if a1:
            res["phone"] = a1.text
        dds = dl.find_all("dd")
        for dd in dds:
            if not dd.has_attr("class"):
                res["products"] = dd.text[:-4]
        a = dl.find("a", {"itemprop": "name"})
        if a:
            d_u = a.get("href") + "company_detail.html"
            res["enterpriseName"] = a.text
            res["url"] = d_u
            with session_scope() as sess:
                hy = sess.query(HuangYe).filter(HuangYe.url == d_u).first()
                if not hy:
                    result = self._detail(d_u)
                    res.update(result)
                    HY = HuangYe(**res)
                    sess.add(HY)
def _plist(self, url, category):
    # Parse one taojindi list page and store new TaoJin records.
    print("list {}".format(url))
    time.sleep(0.2)
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    div = soup.find("div", class_="company-info")
    lis = div.find_all("li")
    for li in lis:
        res = {}
        res["category"] = category
        tel = li.find("div", class_="tel")
        res["phone"] = tel.text
        a = li.find("a")
        res["enterpriseName"] = a.text
        d_u = "http://hy.taojindi.com" + a.get("href")
        res["url"] = d_u
        div = li.find("div", class_="info")
        res["about"] = div.text
        div = li.find("div", class_="address")
        temp = div.text.split()
        for item in temp:
            if "地址:" in item:  # "address:"
                res["address"] = item[len("地址:"):]
            elif "主营产品:" in item:  # "main products:"
                res["products"] = item[len("主营产品:"):]
        with session_scope() as sess:
            cns = sess.query(TaoJin).filter(TaoJin.url == res["url"]).first()
            if not cns:
                resu = self._detail(res["url"])
                res.update(resu)
                cn = TaoJin(**res)
                sess.add(cn)
def _plist(self, url):
    time.sleep(0.2)
    print("list {}".format(url))
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    divt = soup.find("div", class_="left_box")
    divs = divt.find_all("div", class_="list")
    for div in divs:
        res = {}
        lis = div.find_all("li")
        for li in lis:
            a = li.find("a")
            if a:
                res["url"] = a.get("href")
                tem = li.text.split()
                if not tem:
                    continue
                res["enterpriseName"] = tem[0]
            else:
                if "主营:" in li.text:
                    res["products"] = li.text[li.text.find("主营:") + 3:]
                else:
                    res["businessModel"] = li.text.strip()
        td = div.find("td", {"class": "f_orange", "width": "100"})
        if td:
            res["location"] = td.text
        with session_scope() as sess:
            cns = sess.query(MetalInc).filter(MetalInc.url == res["url"]).first()
            if not cns:
                resu = self._detail(res["url"])
                res.update(resu)
                cn = MetalInc(**res)
                sess.add(cn)
def parse_jiehun_item(session, url):
    # Parse a Meituan wedding ("jiehun") search page and record crawl status in ES.
    ess = es_search("meituan", url)
    if not (ess[0] and ess[1]):
        time.sleep(random.uniform(1, 3))
        print("parse jiehun url {}".format(url))
        resu = {}
        jiehun_url = "https://www.meituan.com/jiehun/{}/"
        # session.headers[
        #     "Cookie"] = "__mta=146208011.1562725971505.1562821920182.1562822162903.6; _lxsdk_cuid=16bd9b99c9ec8-035ab9a6954478-36664c08-1fa400-16bd9b99ca0c8; client-id=047f9384-30b4-4cce-aedb-773f7a31fd8a; mtcdn=K; uuid=3b49df191ddb4094bc3c.1562729907.1.0.0; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; ci=45; rvct=45%2C114; IJSESSIONID=vguqtn4rvu70q8m6y7wyaoli; iuuid=8369B0074906E31235D094B1D10CB5398B04DC92AAFDBADB7477CB96EEFF986E; cityname=%E9%87%8D%E5%BA%86; _lxsdk=8369B0074906E31235D094B1D10CB5398B04DC92AAFDBADB7477CB96EEFF986E; _hc.v=10962146-cd2f-a7a9-c15a-d942a6e12989.1562744821; __mta=146208011.1562725971505.1562742466866.1562812609604.4; lat=29.535538; lng=106.512486; _lxsdk_s=16bdf56dfcd-a48-5e0-95b%7C%7C18"
        r = session.get(url, timeout=5)
        rule = r'window.AppData = (.+?);</script>'
        slotList = re.findall(rule, r.text)
        if slotList:
            res = json.loads(slotList[0])
            # print(res)
            # if res.get("poiParam").get("uuid"):
            #     session.headers["Cookie"] = cookies.format(res.get("poiParam").get("uuid"))
            shoplist = res.get("searchResult").get("searchResult")
            for item in shoplist:
                resu["score"] = item.get("avgscore")
                resu["shop"] = item.get("title")
                resu["address"] = item.get("address")
                shop_id = item.get("id")
                target = jiehun_url.format(shop_id)
                resu["url"] = target
                res = parse_jiehun_phone(session, target)
                resu.update(res)
                mt = MeiTuanShop(**resu)
                with session_scope() as session1:
                    session1.add(mt)
            if not ess[1] and ess[0]:
                EsBackends("meituan").update_data(
                    id=ess[2], body={"link": url, "status": 1, "date": time.time()})
            if not ess[0]:
                EsBackends("meituan").index_data(
                    {"link": url, "status": 1, "date": time.time()})
        else:
            print("no data extracted {}".format(url))
            if not ess[0]:
                EsBackends("meituan").index_data(
                    {"link": url, "status": 0, "date": time.time()})
            else:
                EsBackends("meituan").update_data(
                    id=ess[2], body={"link": url, "status": 0, "date": time.time()})
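# Hedged sketch (assumption, inferred from how ess is used above): es_search()
# appears to return a 3-tuple (exists, already_done, doc_id) describing whether
# a link was already crawled successfully. A minimal elasticsearch-py version
# could look like this; the client address and field names are assumptions.
from elasticsearch import Elasticsearch

es_client = Elasticsearch(["http://localhost:9200"])  # assumed address


def es_search(index, link):
    """Return (exists, already_done, doc_id) for the given crawl link."""
    body = {"query": {"match_phrase": {"link": link}}}
    hits = es_client.search(index=index, body=body)["hits"]["hits"]
    if not hits:
        return False, False, None
    hit = hits[0]
    return True, hit["_source"].get("status") == 1, hit["_id"]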
def p_list(self, url, area):
    print("page url {}".format(url))
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    ul = soup.find("ul", class_="list")
    mas = ul.find_all("a")
    for a in mas:
        res = {}
        res["area"] = area
        name = a.get("title")
        res["enterpriseName"] = name
        if not a.get("href").startswith("http"):
            d_u = "https:" + a.get("href")
        else:
            d_u = a.get("href")
        res["url"] = d_u
        temp = a.next_sibling.next_sibling
        font = temp.find("font")
        if font:
            res["updateTime"] = font.text
        with session_scope() as sess:
            wgs = sess.query(BFZY).filter(BFZY.url == d_u).first()
            if not wgs:
                resu = self.detail(d_u)
                res.update(resu)
                wg = BFZY(**res)
                sess.add(wg)
                print(res)
def get_data(self):
    # Page through the TuNiu hotel list API and store hotels not yet in the DB.
    page = 2
    url = "http://hotel.tuniu.com/ajax/list?search%5BcityCode%5D=300&search%5BcheckInDate%5D=2019-7-30&search%5BcheckOutDate%5D=2019-7-31&search%5Bkeyword%5D=&suggest=&sort%5Bfirst%5D%5Bid%5D=recommend&sort%5Bfirst%5D%5Btype%5D=&sort%5Bsecond%5D=&sort%5Bthird%5D=cash-back-after&page={}&returnFilter=0"
    while True:
        print("the page is {}".format(page))
        r = self.session.get(url.format(page))
        r.encoding = "utf-8"
        # res = json.loads(r.text)
        # print(r.json())
        if not r.json():
            print("now page is {}".format(page))
            break
        data_list = r.json().get("data").get("list")
        for item in data_list:
            res = {}
            res["address"] = item.get("address")
            # print(item.get("name"))
            d_url = "http://hotel.tuniu.com" + item.get("url")
            res["url"] = d_url
            res["shop"] = item.get("name")
            res["score"] = item.get("remarkScore")
            res["price"] = item.get("startPrice")
            res["decorateYear"] = item.get("decorateYear")
            area = json.dumps(item.get("pos"))
            res["area"] = area
            tn = TuNiu(**res)
            with session_scope() as sess:
                qxc = sess.query(TuNiu).filter(TuNiu.url == res["url"]).first()
                if not qxc:
                    sess.add(tn)
                    print(res)
        page = page + 1
        time.sleep(0.5)
def parse_qianyann_item(url):
    sess.headers["Cookie"] = "UM_distinctid=16bf50a8450476-00b7d0ed2a109b-e343166-1fa400-16bf50a8451895; _gscu_1516296093=631843228bhl3k13; _gscbrs_1516296093=1; Hm_lvt_062d51b4dcc0576135b683257033659a=1563184338; Hm_lpvt_062d51b4dcc0576135b683257033659a=1563242618; _gscs_1516296093=t6324261780strk14|pv:1"
    r = sess.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    table = soup.find_all("table", {"cellpadding": "0", "cellspacing": "0", "width": "98%"})
    trs = table[0].find_all("tr")
    for item in trs:
        res = {}
        tds = item.find_all("td")
        title = tds[1].a.text
        publish_date = tds[2].span.text.strip()
        res["title"] = title
        res["publishDate"] = publish_date
        de_url = qianyan_home + tds[1].a.get("href")
        res["url"] = de_url
        ess = es_search("govnews", de_url)
        if not (ess[0] and ess[1]):
            resu = parse_qianyan_detail(de_url)
            res.update(resu)
            gw = GoverNews(**res)
            with session_scope() as sess1:
                sess1.add(gw)
            EsBackends("govnews").index_data(
                {"link": de_url, "status": 1, "date": time.time()})
def gov_news(self, url):
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    res = {}
    ul = soup.find("ul", class_="pList01")
    lis = ul.find_all("li")
    for item in lis:
        date = item.span.text
        date = date[1:-1]
        res["publishDate"] = date
        if "http" not in item.a.get("href"):
            new_url = self.home + item.a.get("href")
        else:
            new_url = item.a.get("href")
        ess = es_search("govnews", new_url)
        if not (ess[0] and ess[1]):
            try:
                resu = self.parse_detail(new_url)
            except Exception as e:
                print(e)
                continue
            res.update(resu)
            gw = GoverNews(**res)
            with session_scope() as sess:
                sess.add(gw)
            EsBackends("govnews").index_data(
                {"link": new_url, "status": 1, "date": time.time()})
def final_parse(self, url, count, area):
    r = self.session.get(url.format(count))
    print("start parsing")
    soup = BeautifulSoup(r.text, "lxml")
    div = soup.find("div", class_="left_box")
    uls = div.find_all("ul")
    with session_scope() as sess:
        for item in uls:
            res = {}
            res["location"] = area
            a = item.find("a")
            res["enterpriseName"] = a.get("title")
            res["url"] = a.get("href")
            lis = item.find_all("li")
            mb = lis[1].text
            res["primaryBusiness"] = mb.strip()
            if len(lis) > 2:
                phone = lis[2].text.strip()
                res["phone"] = phone
            wgs = sess.query(WGQY).filter(WGQY.url == res["url"]).first()
            if not wgs:
                result = self.parse_detail(res["url"])
                res.update(result)
                wg = WGQY(**res)
                sess.add(wg)
    print("finished")
def get_hotel_detail(url):
    # Parse a Meituan hotel detail page and record crawl status in ES.
    ess = es_search("meituan", url)
    if not (ess[0] and ess[1]):
        result = {}
        time.sleep(random.uniform(1, 3))
        r = session.get(url)
        print("parse hotel {}".format(url))
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, 'lxml')
        naspan = soup.find("div", {"class": "breadcrumb-nav"})
        result["shop"] = naspan.text.strip()
        result["url"] = url
        result["openTime"] = "全天"  # "all day"
        div = soup.find("div", {"class": "mb10"})
        span = div.find("span")
        result["address"] = span.text.strip()
        li = soup.find("li", {"class": "fs14"})
        divs = li.find_all("div", {"class": "mb10"})
        item = divs[-1]
        if "电话" in item.text:  # "phone"
            phone = item.text[item.text.find(":") + 1:]
            result["phone"] = phone
        score = soup.find("div", {"class": "other-detail-line1-score"})
        result["score"] = score.text.strip()
        mt = MeiTuanShop(**result)
        if result:
            result["url"] = url
            with session_scope() as session1:
                session1.add(mt)
            if not ess[1] and ess[0]:
                EsBackends("meituan").update_data(
                    id=ess[2], body={"link": url, "status": 1, "date": time.time()})
            if not ess[0]:
                EsBackends("meituan").index_data(
                    {"link": url, "status": 1, "date": time.time()})
        else:
            if not ess[0]:
                EsBackends("meituan").index_data(
                    {"link": url, "status": 0, "date": time.time()})
            else:
                EsBackends("meituan").update_data(
                    id=ess[2], body={"link": url, "status": 0, "date": time.time()})
            print("empty result for {}".format(url))
def parse_jiehun(self, url, area, locate):
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    div = soup.find('div', {"id": "J_boxList"})
    ul = div.find("ul", class_="shop-list")
    lis = ul.find_all("li")
    for item in lis:
        res = {}
        res["cateUrl"] = url
        res["locate"] = locate
        res["area"] = area.strip()
        a = item.a
        if a.get("title"):
            res["shop"] = a.get("title")
        if a.get("href"):
            jiehun_url = self.url_home + a.get("href")
            index = jiehun_url.find("?")
            res["url"] = jiehun_url[:index]
        p = item.find_all("p", class_="area-list")
        if p:
            p = p[0]
            res["address"] = p.text.strip()
        score = item.find("p", class_="remark")
        if score:
            res["score"] = score.span.get("title")
        # r2 = self.session.get(jiehun_url)
        # print(jiehun_url)
        # soup = BeautifulSoup(r2.text, "lxml")
        # div = soup.find("div", class_="offers-box")
        # if not div:
        #     div = soup.find("div", class_="shop-wrap")
        #     if not div:
        #         div = soup.find_all("div", {"id": "J_boxYouhui", "class": "textshow"})[0]
        #         span = div.find_all("span", class_="fl")[0]
        #         if span.get("title"):
        #             res["address"] = span.get("title")
        #         sp = div.find_all("span", class_="icon-phone")[0]
        #         res["phone"] = sp.text
        #     else:
        #         h1 = div.find("h1", class_="shop-title")
        #         span = div.find("span", class_="fl road-addr")
        #         address = span.text.strip()
        #         res["address"] = address
        #         phone = div.find("span", class_="icon-phone")
        #         res["phone"] = " ".join(phone.text.split()).strip()
        # else:
        #     span = div.find("span", class_="info-name")
        #     address = span["title"]
        #     res["address"] = address.strip()
        #     p = div.find("p", class_="expand-info tel")
        #     sp = p.find("span", class_="item")
        #     res["phone"] = " ".join(sp.text.split()).strip()
        dz = DZDianPing(**res)
        with session_scope() as sess:
            qxc = sess.query(DZDianPing).filter(DZDianPing.url == res["url"]).first()
            if not qxc:
                sess.add(dz)
                print(res)
def parse_detail(self, url):
    ess = es_search("qiyeminglu", url)
    if not ess[1] or not ess[0]:
        time.sleep(random.uniform(1.5, 2))
        print("parse url {}".format(url))
        r = self.session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        fs = soup.find("fieldset", {"class": "ad_biger"})
        lis = fs.div.find_all("li")
        res = {}
        for li in lis:
            name = li.find("span", {"class": "field-label"}).text.strip()
            value = li.find("span", {"class": "field-item"}).text.strip()
            if "点击" in value:
                index = value.find("点击")
                value = value[:index - 1]
            if "统一社会信用" in name:
                value = re.findall(patern, value)[0]
                res["socialCreditCode"] = value
            if "名称" in name:
                res["enterpriseName"] = value
            if "地址" in name:
                res["address"] = value
            if "地区" in name:
                res["area"] = value.strip()
            if "日期" in name:
                res["registerDate"] = value
            if "范围" in name:
                res["businessScope"] = value
            if "代表人" in name:
                res["legalRepresentative"] = value
            if "资金" in name:
                res["registeredFunds"] = value
            if "类型" in name:
                if value:
                    res["enterpriseType"] = value
                else:
                    value = lis[-1].find("span", {"class": "field-item"}).span
                    if value:
                        res["enterpriseType"] = value.text.strip()
                        print(value.text)
        ecq = EnterpriseCq(**res)
        with session_scope() as session1:
            session1.add(ecq)
        if not ess[0]:
            EsBackends("qiyeminglu").index_data(
                {"link": url, "status": 1, "date": time.time()})
        else:
            EsBackends("qiyeminglu").update_data(
                {"link": url, "status": 1, "date": time.time()})
def index_es():
    with session_scope() as session:
        books = session.query(Book).all()
        for book in books:
            data = {}
            data["title"] = book.title
            data["author"] = book.author_name
            EsBackends("crawled_books", "bookinfo").index_data(data)
            print('the {} is inserted'.format(book.title))
def fix_funds():
    with session_scope() as sess:
        na = sess.query(BFZY).filter(BFZY.registeredFunds == None).all()
        for item1 in na:
            if item1.about:
                tem = item1.about.split(";")
                for item in tem:
                    t = item.split(":")
                    if "注册资金" == t[0]:
                        item1.registeredFunds = t[1]
def jietu():
    with session_scope() as sess:
        dz = sess.query(DZDianPing).filter(DZDianPing.phone == None).all()
        for item in dz:
            url = item.url
            file_name = url.split("/")[-1]
            phone_file = 'E:\\images\\{}.png'.format(file_name + "phone")
            if not os.path.exists(phone_file):
                sa = cut_imge(item.url)
                time.sleep(2)
def tuniutongbu():
    with session_scope() as sess:
        with session_scope_remote() as sess_remote:
            na = sess.query(TuNiuAll).filter(
                or_(TuNiuAll.phone != None, TuNiuAll.district != None)).all()
            for item in na:
                ture = sess_remote.query(TuNiuAll).filter(
                    TuNiuAll.url == item.url).first()
                if not ture.phone and not ture.district:
                    ture.phone = item.phone
                    ture.district = item.district
                    sess_remote.commit()
def get_all_data(self, cid, city):
    # Page through the TuNiu list API for one city, resuming from the saved position.
    page = 1
    today = datetime.date.today()
    while True:
        if city == self.current_city and page == self.current_page:
            self.status = True
        if not self.status:
            page = page + 1
            print("skip {} {}".format(city, page))
            continue
        print("the page is {}".format(page))
        r = self.session.get(self.url.format(
            cid=cid, page=page,
            tomorrow=today + datetime.timedelta(days=1),
            aftert=today + datetime.timedelta(days=2)))
        r.encoding = "utf-8"
        temp = r.json().get("data")
        try:
            total = temp.get("total")
        except Exception as e:
            end_time = time.time()
            if end_time - self.start_time > 100:
                # Persist the current position so the crawl can resume later.
                with open("config.json", "r+", encoding="utf-8") as f:
                    lj = json.load(f)
                with open("config.json", "w", encoding="utf-8") as f:
                    lj["current_page"] = page
                    lj["current_city"] = city
                    json.dump(lj, f)
            raise e
        if not total:
            print("now page is {}".format(page))
            break
        data_list = temp.get("list")
        for item in data_list:
            res = {}
            res["city"] = city
            res["address"] = item.get("address")
            # print(item.get("name"))
            d_url = "http://hotel.tuniu.com" + item.get("url")
            res["url"] = d_url
            res["shop"] = item.get("name")
            res["score"] = item.get("remarkScore")
            res["price"] = item.get("startPrice")
            res["decorateYear"] = item.get("decorateYear")
            area = json.dumps(item.get("pos"))
            res["area"] = area
            tn = TuNiuAll(**res)
            with session_scope() as sess:
                qxc = sess.query(TuNiuAll).filter(TuNiuAll.url == res["url"]).first()
                if not qxc:
                    sess.add(tn)
                    print(res)
        page = page + 1
def get_products(driver, url):
    # Parse a JD search/list page rendered by Selenium and store new JingDong records.
    driver.get(url)
    r = driver.page_source
    soup = BeautifulSoup(r, "lxml")
    all_li = soup.find_all('li', {"class": "gl-item"})
    for item in all_li:
        try:
            name = item.find("div", {"class": "p-name"})
            name = name.text.strip()
            image = item.find("img")
            if image.get("data-lazy-img"):
                image = "https:" + image.get("data-lazy-img")
            else:
                image = "https:" + image.get("src")
            shop = item.find("div", {"class": "p-shop"})
            shop = shop.find("a")
            if shop:
                shop = shop.get("title")
                shop = shop.strip()
            else:
                shop = ""
            price = item.find("div", {"class": "p-price"})
            price = price.find("strong", {"class": "J_price"})
            price = price.text.strip()
            about = item.find("div", {"class": "p-commit p-commit-n"})
            if not about:
                about = item.find("div", {"class": "p-commit"})
            about = about.text.strip()
            detail_url = item.find("a", {"target": "_blank"})
            detail_url = "https:" + detail_url.get("href")
            res = {
                "id": None,
                "productName": name,
                "image": image,
                "shop": shop,
                "price": price,
                "popular": about,
            }
            try:
                result = get_comment(detail_url)
            except Exception:
                continue
            res.update(result)
            jd = JingDong(**res)
            with session_scope() as session:
                session.add(jd)
        except Exception:
            print("product item error")
            traceback.print_exc()
            continue
def incre_table():
    res = cate_url()
    cc = 1
    rr = 15
    for item in res.keys():
        for dd in res[item].keys():
            with session_scope() as session:
                d = BookCategory(id=cc, category_name=dd, site_id=9,
                                 site_category_id=cc, category_id=rr)
                session.add(d)
            cc = cc + 1
            rr = rr + 1
def delete_book_id():
    session1 = session_sql()
    books = session1.query(BookSource).filter_by(site_id=9).all()
    # session1.execute()
    session1.close()
    for book in books:
        session2 = session_sql()
        # session2.query(Bookchapter).filter(Bookchapter.book_id == book.book_id).delete(synchronize_session=False)
        bs = session2.query(Bookchapter).filter_by(id=book.book_id).all()
        session2.commit()
        books1 = session2.query(Book).filter_by(id=book.book_id).all()
        session2.close()
        for b_chapter in bs:
            if b_chapter:
                with session_scope() as session3:
                    session3.delete(b_chapter)
                    print('delete ', b_chapter.title)
        for book1 in books1:
            if book1:
                with session_scope() as session4:
                    session4.delete(book1)
                    print('delete book', book1.id)
        with session_scope() as session5:
            session5.delete(book)
def get_phone(self):
    url = "http://hotel.tuniu.com/ajax/getHotelStaticInfo?id={}&checkindate=2019-07-31&checkoutdate=2019-08-01"
    # count = 1
    with session_scope() as sess1:
        tn = sess1.query(TuNiu).filter(TuNiu.phone == None).all()
        for item in tn:
            hotel_id = item.url.split("/")[-1].strip()
            r = self.session.get(url.format(hotel_id))
            # count = count + 1
            temp = r.json()
            item.phone = temp.get("data").get("hotel").get("tel")
            item.district = temp.get("data").get("hotel").get("districtName")
            sess1.commit()
            print(temp.get("data").get("hotel").get("tel"))
            time.sleep(0.5)
def dump_table():
    session1 = session_sql1()
    ss = session1.query(BookCategory).all()
    session1.close()
    for s1 in ss:
        d = BookCategory(id=s1.id, category_major=s1.category_major,
                         category_min=s1.category_min, male_female=s1.male_female,
                         sort=s1.sort, time_created=s1.time_created,
                         status=s1.status, cover=s1.cover, cate_id=s1.cate_id)
        with session_scope() as session2:
            session2.add(d)
def cate_table():
    with session_scope() as session:
        i = 1
        for items in category(_cate):
            if items[1]:
                for item in items[1]:
                    b = BookCategory(id=None, category_major=item[0],
                                     category_min=item[1], male_female=items[0],
                                     sort=i, time_created=round(time.time()),
                                     status=1, cover='')
                    i = i + 1
                    session.add(b)
def _p_list(self, url, category):
    # Parse one qiyelu list page, collecting (name, products, type) per detail URL.
    time.sleep(0.2)
    print("list url {}".format(url))
    result = {}
    result["category"] = category
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    tables = soup.find_all("table", {"width": "980", "border": "0",
                                     "cellspacing": "0", "cellpadding": "0"})
    res = OrderedDict()
    for table in tables:
        if "企业录推荐企业信息" in table.text:  # "recommended enterprise info" section
            table = table.next_sibling.next_sibling
            trs = table.find_all("tr")
            for tr in trs:
                tds = tr.find_all("td")
                for td in tds:
                    ma = td.find_all("a", {"target": "_blank"})
                    for a in ma:
                        if not a.get("title"):
                            detail = (a.text, a.get("href"))
                        else:
                            products = a.text
                    font = td.find("font", {"color": "#666666"})
                    entype = font.text
                    res[detail[1]] = (detail[0], products, entype)
    for key, value in res.items():
        result["enterpriseName"] = value[0]
        result["products"] = value[1]
        result["businessModel"] = value[2]
        result["url"] = key
        if self.jump in key:
            self.status = True
        if not self.status:
            continue
        with session_scope() as sess:
            qy = sess.query(QYLu).filter(QYLu.url == key).first()
            if not qy:
                resu = self._detail(key)
                result.update(resu)
                sou = QYLu(**result)
                sess.add(sou)
def _list_item(self, url, category, location):
    time.sleep(0.6)
    print("list url {}".format(url))
    self.session.headers["Referer"] = url[:url.find("/p")] + "/p{}/".format(self.index_page(url) - 1)
    r = self.session.get(url)
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "lxml")
    div = soup.find("div", class_="hy_lbox fl mt2")
    div = div.find("div", class_="hy_companylist")
    lis = div.find_all("li")
    for li in lis:
        res = {}
        a = li.find("a")
        if a.get("href").startswith("http"):
            d_u = a.get("href")
        else:
            d_u = "http:" + a.get("href")
        res["url"] = d_u
        res["category"] = category
        res["location"] = location
        res["enterpriseName"] = a.text
        span = li.find("span", class_="tel")
        if hasattr(span, "text"):
            res["phone"] = span.text
        dds = li.find_all("dd")
        temp = []
        for dd in dds:
            temp.append(dd.text)
        ss = "地址:"
        ss1 = "主营产品:"
        for item in temp:
            if ss in item:
                res["address"] = item[len(ss):]
            elif ss1 in item:
                res["products"] = item[len(ss1):]
        with session_scope() as sess:
            soule = sess.query(SouLeWang).filter(SouLeWang.url == d_u).first()
            if not soule:
                resu = self._detail(d_u)
                res.update(resu)
                sou = SouLeWang(**res)
                sess.add(sou)
def table_back():
    atts = globals()
    for item1 in datatable:
        table = atts.get(item1)
        count = 0
        session_remote = session_sql_remote()
        with session_scope() as sess1:
            ms = sess1.query(table).filter().all()
            for item in ms:
                print(item.__dict__)
                temp = item.__dict__
                temp["id"] = None
                temp.pop("_sa_instance_state")
                ta = table(**temp)
                count = count + 1
                session_remote.add(ta)
                if count % 5000 == 0:
                    session_remote.commit()
            session_remote.commit()
def _p_list(self, url, category):
    # Parse one cntrades list page and store new CnTrade records.
    time.sleep(1)
    print("list {}".format(url))
    res = {}
    res["category"] = category
    self.session.headers["Host"] = "www.cntrades.com"
    self.session.headers["Cookie"] = self.cookies.format(
        int(time.time()) - 1, int(time.time()) - 2)
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    divv = soup.find("div", class_="left_box")
    if not divv:
        if "404 Not Found" in r.text:
            raise Exception("404")
        if "是不是太快了?" in r.text:  # anti-crawl message: "requesting too fast?"
            raise Exception("too fast")
    divs = divv.find_all("div", class_="list")
    for div in divs:
        lis = div.find_all("li")
        for li in lis:
            a = li.find("a")
            if a:
                res["url"] = a.get("href")
                tem = li.text.split()
                res["enterpriseName"] = tem[0]
                res["businessModel"] = tem[-1]
            else:
                if "主营:" in li.text:
                    res["products"] = li.text[li.text.find("主营:") + 3:]
                else:
                    res["address"] = li.text
        td = div.find("td", {"class": "f_orange", "width": "100"})
        if td:
            res["location"] = td.text
        with session_scope() as sess:
            cns = sess.query(CnTrade).filter(CnTrade.url == res["url"]).first()
            if not cns:
                resu = self._detail(res["url"])
                res.update(resu)
                cn = CnTrade(**res)
                sess.add(cn)