Example #1
def insert_cate(url):
    r = api(url)
    i = 1
    for gene in r.keys():
        if gene != 'ok':
            for cate in r[gene]:
                # a major category with no sub-categories still gets one row, with an empty category_min
                mins = cate['mins'] if cate['mins'] else [""]
                for item in mins:
                    with session_scope() as sqlsession:
                        bc = BookCategory(id=None,
                                          category_major=cate['major'],
                                          category_min=item,
                                          male_female=gene,
                                          time_created=round(time.time()),
                                          status=1,
                                          cover='',
                                          sort=i)
                        sqlsession.add(bc)
                        i = i + 1
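Note: every example on this page funnels writes through the same session_scope() context manager. Its implementation is not part of this listing; a minimal sketch, assuming the standard SQLAlchemy transactional-scope recipe and an assumed sessionmaker factory (the engine URL below is a placeholder, not the project's):

from contextlib import contextmanager

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

Session = sessionmaker(bind=create_engine("sqlite://"))  # placeholder engine, not the project's

@contextmanager
def session_scope():
    """Transactional scope: commit on success, roll back on error, always close."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

This shape would also explain why several examples call sess.commit() mid-loop themselves: the scope commits only once, on exit.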
Example #2
def zengliang_back():  # "zengliang" = incremental: copy only new rows to the remote DB
    atts = globals()
    for item1 in query_map.keys():
        table = atts.get(item1)
        session_remote = session_sql_remote()
        ms = session_remote.query(table).order_by(table.id.desc()).first()
        with session_scope() as sess1:
            if not ms:
                dd = [0]
            else:
                res = sess1.execute(query_map.get(item1).format(**ms.__dict__))
                dd = []
                for row in res.fetchall():  # "row", to avoid shadowing the id() builtin
                    dd.append(row[0])
            id_new = max(dd)
            if len(dd) >= 2:
                print("youchongfu {} {}".format(item1, dd))  # "youchongfu" = duplicates found
            ms = sess1.query(table).filter(table.id > id_new).all()
            count = 0
            for item in ms:
                print(item.__dict__)
                temp = item.__dict__
                temp["id"] = None
                temp.pop("_sa_instance_state")
                ta = table(**temp)
                count = count + 1
                session_remote.add(ta)
                if count % 1000 == 0:
                    session_remote.commit()
            session_remote.commit()
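Note: the copy step above (clear id, drop _sa_instance_state, rebuild the mapped object) reappears in Example #29. Factored out as a small helper for clarity; table is any SQLAlchemy mapped class, exactly as in these examples:

def clone_row(table, row):
    data = dict(row.__dict__)             # shallow copy of the loaded column values
    data.pop("_sa_instance_state", None)  # drop SQLAlchemy's internal bookkeeping
    data["id"] = None                     # clear the PK so the target DB assigns a new one
    return table(**data)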
Example #3
 def get_phone(self):
     url = "http://hotel.tuniu.com/ajax/getHotelStaticInfo?id={}&checkindate={tomorrow}&checkoutdate={aftert}"
     count = 1
     # res = []
     today = datetime.date.today()
     with session_scope() as sess2:
         tn = sess2.query(TuNiu).filter(TuNiu.phone == None).all()
         for item in tn:
             hotel_id = item.url.split("/")[-1].strip()
             count = count + 1
             # if count < 39:
             #     continue
             # res.append(hotel_id)
             # try:
             #     with futures.ProcessPoolExecutor(max_workers=10) as executor:
             #         for item in executor.map(self.sub_get_phone, res):
             #             print(item)
             # except KeyboardInterrupt:
             #     exit(0)
             r = self.session.get(url.format(hotel_id, tomorrow=today + datetime.timedelta(days=1),
                                             aftert=today + datetime.timedelta(days=2)))
             # count = count + 1
             try:
                 temp = r.json()
                 item.phone = temp.get("data").get("hotel").get("tel")
                 item.district = temp.get("data").get("hotel").get("districtName")
                 sess2.commit()
             except AttributeError as e:
                 print(hotel_id, e)
                 if "list" in str(e):
                     continue
                 else:
                     raise e
             print(temp.get("data").get("hotel").get("tel"))
Example #4
 def _p_list(self, url, category):
     time.sleep(0.3)
     if "pn1/" in url:
         url = url[:url.find("pn1/")]
     print("list url {}".format(url))
     self.session.headers["Host"] = "b2b.huangye88.com"
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     form = soup.find("form", {"id": "jubao"})
     mas = form.find_all("dl")
     for dl in mas:
         res = {}
         res["category"] = category
         a1 = dl.find("a", {"rel": "nofollow"})
         if a1:
             res["phone"] = a1.text
         dds = dl.find_all("dd")
         for dd in dds:
             if not dd.has_attr("class"):
                 res["products"] = dd.text[:-4]
         a = dl.find("a", {"itemprop": "name"})
         if a:
             d_u = a.get("href") + "company_detail.html"
             res["enterpriseName"] = a.text
             res["url"] = d_u
             with session_scope() as sess:
                 hy = sess.query(HuangYe).filter(HuangYe.url == d_u).first()
                 if not hy:
                     result = self._detail(d_u)
                     res.update(result)
                     HY = HuangYe(**res)
                     sess.add(HY)
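Note: the tail of _p_list (look the detail URL up, crawl and insert only when absent) is the dedup idiom repeated by most spiders below. A hedged helper capturing it; it assumes the model has a unique url column, which all the models in this listing appear to have:

def add_if_new(sess, model, record, fetch_detail):
    """Insert a row for record["url"] unless one already exists."""
    if sess.query(model).filter(model.url == record["url"]).first():
        return False
    record.update(fetch_detail(record["url"]))  # enrich only when we will actually insert
    sess.add(model(**record))
    return True

With it, the block above reduces to: with session_scope() as sess: add_if_new(sess, HuangYe, res, self._detail).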
Example #5
 def _plist(self, url, category):
     print("list {}".format(url))
     time.sleep(0.2)
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     div = soup.find("div", class_="company-info")
     lis = div.find_all("li")
     for li in lis:
         res = {}
         res["category"] = category
         tel = li.find("div", class_="tel")
         res["phone"] = tel.text
         a = li.find("a")
         res["enterpriseName"] = a.text
         d_u = "http://hy.taojindi.com" + a.get("href")
         res["url"] = d_u
         div = li.find("div", class_="info")
         res["about"] = div.text
         div = li.find("div", class_="address")
         temp = div.text.split()
         for item in temp:
             if "地址:" in item:  # "地址:" = "address:" label
                 res["address"] = item[len("地址:"):]
             elif "主营产品:" in item:  # "主营产品:" = "main products:" label
                 res["products"] = item[len("主营产品:"):]
         with session_scope() as sess:
             cns = sess.query(TaoJin).filter(
                 TaoJin.url == res["url"]).first()
             if not cns:
                 resu = self._detail(res["url"])
                 res.update(resu)
                 cn = TaoJin(**res)
                 sess.add(cn)
Example #6
 def _plist(self, url):
     time.sleep(0.2)
     print("list {}".format(url))
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     divt = soup.find("div", class_="left_box")
     divs = divt.find_all("div", class_="list")
     for div in divs:
         res = {}
         lis = div.find_all("li")
         for li in lis:
             a = li.find("a")
             if a:
                 res["url"] = a.get("href")
                 tem = li.text.split()
                 if not tem:
                     continue
                 res["enterpriseName"] = tem[0]
             else:
                 if "主营:" in li.text:
                     res["products"] = li.text[li.text.find("主营:") + 3:]
                 else:
                     res["businessModel"] = li.text.strip()
         td = div.find("td", {"class": "f_orange", "width": "100"})
         if td:
             res["location"] = td.text
         with session_scope() as sess:
             cns = sess.query(MetalInc).filter(
                 MetalInc.url == res["url"]).first()
             if not cns:
                 resu = self._detail(res["url"])
                 res.update(resu)
                 cn = MetalInc(**res)
                 sess.add(cn)
Example #7
def parse_jiehun_item(session, url):
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        pass
    else:
        time.sleep(random.uniform(1, 3))
        print("pase jiehun url {}".format(url))
        resu = {}
        jiehun_url = "https://www.meituan.com/jiehun/{}/"
        # session.headers[
        #     "Cookie"] = "__mta=146208011.1562725971505.1562821920182.1562822162903.6; _lxsdk_cuid=16bd9b99c9ec8-035ab9a6954478-36664c08-1fa400-16bd9b99ca0c8; client-id=047f9384-30b4-4cce-aedb-773f7a31fd8a; mtcdn=K; uuid=3b49df191ddb4094bc3c.1562729907.1.0.0; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; ci=45; rvct=45%2C114; IJSESSIONID=vguqtn4rvu70q8m6y7wyaoli; iuuid=8369B0074906E31235D094B1D10CB5398B04DC92AAFDBADB7477CB96EEFF986E; cityname=%E9%87%8D%E5%BA%86; _lxsdk=8369B0074906E31235D094B1D10CB5398B04DC92AAFDBADB7477CB96EEFF986E; _hc.v=10962146-cd2f-a7a9-c15a-d942a6e12989.1562744821; __mta=146208011.1562725971505.1562742466866.1562812609604.4; lat=29.535538; lng=106.512486; _lxsdk_s=16bdf56dfcd-a48-5e0-95b%7C%7C18"
        r = session.get(url, timeout=5)
        rule = r'window.AppData = (.+?);</script>'
        slotList = re.findall(rule, r.text)
        if slotList:
            res = json.loads(slotList[0])
            # print(res)
            # if res.get("poiParam").get("uuid"):
            #     session.headers["Cookie"] = cookies.format(res.get("poiParam").get("uuid"))
            shoplist = res.get("searchResult").get("searchResult")
            for item in shoplist:
                resu["score"] = item.get("avgscore")
                resu["shop"] = item.get("title")
                resu["address"] = item.get("address")
                shop_id = item.get("id")
                target = jiehun_url.format(shop_id)
                resu["url"] = target
                res = parse_jiehun_phone(session, target)
                resu.update(res)
                mt = MeiTuanShop(**resu)
                with session_scope() as session1:
                    session1.add(mt)
                if not ess[1] and ess[0]:
                    EsBackends("meituan").update_data(id=ess[2],
                                                      body={
                                                          "link": url,
                                                          "status": 1,
                                                          "date": time.time()
                                                      })
                if not ess[0]:
                    EsBackends("meituan").index_data({
                        "link": url,
                        "status": 1,
                        "date": time.time()
                    })
        else:
            print("获取不到值 {}".format(url))
            if not ess[0]:
                EsBackends("meituan").index_data({
                    "link": url,
                    "status": 0,
                    "date": time.time()
                })
            else:
                EsBackends("meituan").update_data(id=ess[2],
                                                  body={
                                                      "link": url,
                                                      "status": 0,
                                                      "date": time.time()
                                                  })
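Note: es_search is not shown in this listing, but its result is used consistently: ess[0] says whether the URL is already indexed, ess[1] is its crawl status, and ess[2] is the Elasticsearch document id. The index-or-update branching above then collapses to one hedged helper (the helper name is illustrative, not from the project; es_search and EsBackends are the project's own):

import time

def mark_crawled(index, url, ess, ok):
    """Record a crawl outcome in ES: create the doc if unseen, else update it."""
    body = {"link": url, "status": 1 if ok else 0, "date": time.time()}
    if not ess[0]:          # URL never indexed: create the document
        EsBackends(index).index_data(body)
    elif not ess[1]:        # indexed but not marked done: update in place
        EsBackends(index).update_data(id=ess[2], body=body)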
Example #8
 def p_list(self, url, area):
     print("page url {}".format(url))
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     ul = soup.find("ul", class_="list")
     mas = ul.find_all("a")
     for a in mas:
         res = {}
         res["area"] = area
         name = a.get("title")
         res["enterpriseName"] = name
         if not a.get("href").startswith("http"):
             d_u = "https:" + a.get("href")
         else:
             d_u = a.get("href")
         res["url"] = d_u
         temp = a.next_sibling.next_sibling
         font = temp.find("font")
         if font:
             res["updateTime"] = font.text
         with session_scope() as sess:
             wgs = sess.query(BFZY).filter(BFZY.url == d_u).first()
             if not wgs:
                 resu = self.detail(d_u)
                 res.update(resu)
                 wg = BFZY(**res)
                 sess.add(wg)
                 print(res)
Example #9
File: tuniu.py Project: 838957471/crawlers
 def get_data(self):
     page = 2
     url = "http://hotel.tuniu.com/ajax/list?search%5BcityCode%5D=300&search%5BcheckInDate%5D=2019-7-30&search%5BcheckOutDate%5D=2019-7-31&search%5Bkeyword%5D=&suggest=&sort%5Bfirst%5D%5Bid%5D=recommend&sort%5Bfirst%5D%5Btype%5D=&sort%5Bsecond%5D=&sort%5Bthird%5D=cash-back-after&page={}&returnFilter=0"
     while True:
         print("the page is {}".format(page))
         r = self.session.get(url.format(page))
         r.encoding = "utf-8"
         # res = json.loads(r.text)
         # print(r.json())
         if not r.json():
             print("now page is {}".format(page))
             break
         data_list = r.json().get("data").get("list")
         for item in data_list:
             res = {}
             res["address"] = item.get("address")
             # print(item.get("name"))
             d_url = "http://hotel.tuniu.com" + item.get("url")
             res["url"] = d_url
             res["shop"] = item.get("name")
             res["score"] = item.get("remarkScore")
             res["price"] = item.get("startPrice")
             res["decorateYear"] = item.get("decorateYear")
             area = json.dumps(item.get("pos"))
             res["area"] = area
             tn = TuNiu(**res)
             with session_scope() as sess:
                 qxc = sess.query(TuNiu).filter(
                     TuNiu.url == res["url"]).first()
                 if not qxc:
                     sess.add(tn)
                     print(res)
         page = page + 1
         time.sleep(0.5)
Example #10
def parse_qianyann_item(url):
    sess.headers[
        "Cookie"] = "UM_distinctid=16bf50a8450476-00b7d0ed2a109b-e343166-1fa400-16bf50a8451895; _gscu_1516296093=631843228bhl3k13; _gscbrs_1516296093=1; Hm_lvt_062d51b4dcc0576135b683257033659a=1563184338; Hm_lpvt_062d51b4dcc0576135b683257033659a=1563242618; _gscs_1516296093=t6324261780strk14|pv:1"
    r = sess.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    table = soup.find_all("table", {
        "cellpadding": "0",
        "cellspacing": "0",
        "width": "98%"
    })
    trs = table[0].find_all("tr")
    for item in trs:
        res = {}
        tds = item.find_all("td")
        title = tds[1].a.text
        publish_date = tds[2].span.text.strip()
        res["title"] = title
        res["publishDate"] = publish_date
        de_url = qianyan_home + tds[1].a.get("href")
        res["url"] = de_url
        ess = es_search("govnews", de_url)
        if ess[0] and ess[1]:
            pass
        else:
            resu = parse_qianyan_detail(de_url)
            res.update(resu)
            gw = GoverNews(**res)
            with session_scope() as sess1:
                sess1.add(gw)
            EsBackends("govnews").index_data({
                "link": de_url,
                "status": 1,
                "date": time.time()
            })
Example #11
 def gov_news(self, url):
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     ul = soup.find("ul", class_="pList01")
     lis = ul.find_all("li")
     for item in lis:
         res = {}  # fresh record per news item so fields don't carry over between items
         date = item.span.text
         date = date[1:-1]
         res["publishDate"] = date
         if "http" not in item.a.get("href"):
             new_url = self.home + item.a.get("href")
         else:
             new_url = item.a.get("href")
         ess = es_search("govnews", new_url)
         if ess[0] and ess[1]:
             pass
         else:
             try:
                 resu = self.parse_detail(new_url)
             except Exception as e:
                 print(e)
                 continue
             res.update(resu)
             gw = GoverNews(**res)
             with session_scope() as sess:
                 sess.add(gw)
             EsBackends("govnews").index_data({
                 "link": new_url,
                 "status": 1,
                 "date": time.time()
             })
Example #12
 def final_parse(self, url, count, area):
     r = self.session.get(url.format(count))
     print("kaishi jiexi ")
     soup = BeautifulSoup(r.text, "lxml")
     div = soup.find("div", class_="left_box")
     uls = div.find_all("ul")
     with session_scope() as sess:
         for item in uls:
             res = {}
             res["location"] = area
             a = item.find("a")
             res["enterpriseName"] = a.get("title")
             res["url"] = a.get("href")
             lis = item.find_all("li")
             mb = lis[1].text
             res["primaryBusiness"] = mb.strip()
             if len(lis) > 2:
                 phone = lis[2].text.strip()
                 res["phone"] = phone
             wgs = sess.query(WGQY).filter(WGQY.url == res["url"]).first()
             if not wgs:
                 result = self.parse_detail(res["url"])
                 res.update(result)
                 wg = WGQY(**res)
                 sess.add(wg)
         print("wanbi")
Example #13
def get_hotel_detail(url):
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        pass
    else:
        result = {}
        time.sleep(random.uniform(1, 3))
        r = session.get(url)
        print("parse hotel {}".format(url))
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, 'lxml')
        naspan = soup.find("div", {"class": "breadcrumb-nav"})
        result["shop"] = naspan.text.strip()
        result["url"] = url
        result["openTime"] = "全天"
        div = soup.find("div", {"class": "mb10"})
        span = div.find("span")
        result["address"] = span.text.strip()
        li = soup.find("li", {"class": "fs14"})
        divs = li.find_all("div", {"class": "mb10"})
        item = divs[-1]
        if "电话" in item.text:
            phone = item.text[item.text.find(":") + 1:]
            result["phone"] = phone
        score = soup.find("div", {"class": "other-detail-line1-score"})
        result["score"] = score.text.strip()
        if result:
            mt = MeiTuanShop(**result)  # build the row only when there is data; result["url"] was set above
            with session_scope() as session1:
                session1.add(mt)
            if not ess[1] and ess[0]:
                EsBackends("meituan").update_data(id=ess[2],
                                                  body={
                                                      "link": url,
                                                      "status": 1,
                                                      "date": time.time()
                                                  })
            if not ess[0]:
                EsBackends("meituan").index_data({
                    "link": url,
                    "status": 1,
                    "date": time.time()
                })
        else:
            if not ess[0]:
                EsBackends("meituan").index_data({
                    "link": url,
                    "status": 0,
                    "date": time.time()
                })
            else:
                EsBackends("meituan").update_data(id=ess[2],
                                                  body={
                                                      "link": url,
                                                      "status": 0,
                                                      "date": time.time()
                                                  })
            print("获取值为空 {}".format(url))
Example #14
 def parse_jiehun(self, url, area, locate):
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     div = soup.find('div', {"id": "J_boxList"})
     ul = div.find("ul", class_="shop-list")
     lis = ul.find_all("li")
     for item in lis:
         res = {}
         res["cateUrl"] = url
         res["locate"] = locate
         res["area"] = area.strip()
         a = item.a
         if a.get("title"):
             res["shop"] = a.get("title")
         if a.get("href"):
             jiehun_url = self.url_home + a.get("href")
             index = jiehun_url.find("?")
             res["url"] = jiehun_url[:index]
         p = item.find_all("p", class_="area-list")
         if p:
             p = p[0]
             res["address"] = p.text.strip()
         score = item.find("p", class_="remark")
         if score:
             res["score"] = score.span.get("title")
         # r2 = self.session.get(jiehun_url)
         # print(jiehun_url)
         # soup = BeautifulSoup(r2.text, "lxml")
         # div = soup.find("div", class_="offers-box")
         # if not div:
         #     div = soup.find("div", class_="shop-wrap")
         #     if not div:
         #         div = soup.find_all("div", {"id": "J_boxYouhui", "class": "textshow"})[0]
         #         span = div.find_all("span", class_="fl")[0]
         #         if span.get("title"):
         #             res["address"] = span.get("title")
         #         sp = div.find_all("span", class_="icon-phone")[0]
         #         res["phone"] = sp.text
         #     else:
         #         h1 = div.find("h1", class_="shop-title")
         #         span = div.find("span", class_="fl road-addr")
         #         address = span.text.strip()
         #         res["address"] = address
         #         phone = div.find("span", class_="icon-phone")
         #         res["phone"] = " ".join(phone.text.split()).strip()
         # else:
         #     span = div.find("span", class_="info-name")
         #     address = span["title"]
         #     res["address"] = address.strip()
         #     p = div.find("p", class_="expand-info tel")
         #     sp = p.find("span", class_="item")
         #     res["phone"] = " ".join(sp.text.split()).strip()
         dz = DZDianPing(**res)
         with session_scope() as sess:
             qxc = sess.query(DZDianPing).filter(
                 DZDianPing.url == res["url"]).first()
             if not qxc:
                 sess.add(dz)
                 print(res)
Example #15
 def parse_detail(self, url):
     ess = es_search("qiyeminglu", url)
     if not ess[1] or not ess[0]:
         time.sleep(random.uniform(1.5, 2))
         print("parse url {}".format(url))
         r = self.session.get(url)
         soup = BeautifulSoup(r.text, "lxml")
         fs = soup.find("fieldset", {"class": "ad_biger"})
         lis = fs.div.find_all("li")
         res = {}
         for li in lis:
             name = li.find("span", {"class": "field-label"}).text.strip()
             value = li.find("span", {"class": "field-item"}).text.strip()
             if "点击" in value:
                 index = value.find("点击")
                 value = value[:index - 1]
             if "统一社会信用" in name:
                 value = re.findall(patern, value)[0]
                 res["socialCreditCode"] = value
             if "名称" in name:
                 res["enterpriseName"] = value
             if "地址" in name:
                 res["address"] = value
             if "地区" in name:
                 res["area"] = value.strip()
             if "日期" in name:
                 res["registerDate"] = value
             if "范围" in name:
                 res["businessScope"] = value
             if "代表人" in name:
                 res["legalRepresentative"] = value
             if "资金" in name:
                 res["registeredFunds"] = value
             if "类型" in name:
                 if value:
                     res["enterpriseType"] = value
                 else:
                     value = lis[-1].find("span", {
                         "class": "field-item"
                     }).span
                     if value:
                         res["enterpriseType"] = value.text.strip()
                         print(value.text)
         ecq = EnterpriseCq(**res)
         with session_scope() as session1:
             session1.add(ecq)
         if not ess[0]:
             EsBackends("qiyeminglu").index_data({
                 "link": url,
                 "status": 1,
                 "date": time.time()
             })
          else:
              EsBackends("qiyeminglu").update_data(id=ess[2],
                                                   body={
                                                       "link": url,
                                                       "status": 1,
                                                       "date": time.time()
                                                   })
Example #16
def index_es():
    with session_scope() as session:
        books = session.query(Book).all()
        for book in books:
            data = {}
            data["title"] = book.title
            data["author"] = book.author_name
            EsBackends("crawled_books", "bookinfo").index_data(data)
            print('the {} is indexed'.format(book.title))
Example #17
def fix_funds():
    with session_scope() as sess:
        na = sess.query(BFZY).filter(BFZY.registeredFunds == None).all()
        for item1 in na:
            if item1.about:
                tem = item1.about.split(";")
                for item in tem:
                    t = item.split(":")
                    if "注册资金" == t[0]:
                        item1.registeredFunds = t[1]
Example #18
def jietu():  # "jietu" = screenshot
    with session_scope() as sess:
        dz = sess.query(DZDianPing).filter(DZDianPing.phone == None).all()
        for item in dz:
            url = item.url
            file_name = url.split("/")[-1]
            phone_file = 'E:\\images\\{}.png'.format(file_name + "phone")
            if not os.path.exists(phone_file):
                cut_imge(item.url)  # take the screenshot; the return value was unused
                time.sleep(2)
Example #19
def tuniutongbu():  # "tongbu" = sync: push phone/district back to the remote DB
    with session_scope() as sess:
        with session_scope_remote() as sess_remote:
            na = sess.query(TuNiuAll).filter(
                or_(TuNiuAll.phone != None, TuNiuAll.district != None)).all()
            for item in na:
                ture = sess_remote.query(TuNiuAll).filter(
                    TuNiuAll.url == item.url).first()
                if ture and not ture.phone and not ture.district:  # guard: the row may be missing remotely
                    ture.phone = item.phone
                    ture.district = item.district
                    sess_remote.commit()
Example #20
 def get_all_data(self, cid, city):
     page = 1
     today = datetime.date.today()
     while True:
         # resume: skip ahead until the checkpoint (current_city, current_page) saved in config.json is reached
         if city == self.current_city and page == self.current_page:
             self.status = True
         if not self.status:
             page = page + 1
             print("tiaoguo {} {}".format(city, page))  # "tiaoguo" = skipping
             continue
         print("the page is {}".format(page))
         r = self.session.get(self.url.format(cid=cid, page=page, tomorrow=today + datetime.timedelta(days=1),
                                              aftert=today + datetime.timedelta(days=2)))
         r.encoding = "utf-8"
         temp = r.json().get("data")
         try:
             total = temp.get("total")
         except Exception as e:
             end_time = time.time()
             if end_time - self.start_time > 100:
                 with open("config.json", "r+", encoding="utf-8") as f:
                     lj = json.load(f)
                 with open("config.json", "w", encoding="utf-8") as f:
                     lj["current_page"] = page
                     lj["current_city"] = city
                     json.dump(lj, f)
             raise e
         if not total:
             print("now page is {}".format(page))
             break
         data_list = temp.get("list")
         for item in data_list:
             res = {}
             res["city"] = city
             res["address"] = item.get("address")
             # print(item.get("name"))
             d_url = "http://hotel.tuniu.com" + item.get("url")
             res["url"] = d_url
             res["shop"] = item.get("name")
             res["score"] = item.get("remarkScore")
             res["price"] = item.get("startPrice")
             res["decorateYear"] = item.get("decorateYear")
             area = json.dumps(item.get("pos"))
             res["area"] = area
             tn = TuNiuAll(**res)
             with session_scope() as sess:
                 qxc = sess.query(TuNiuAll).filter(TuNiuAll.url == res["url"]).first()
                 if not qxc:
                     sess.add(tn)
                     print(res)
         page = page + 1
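Note: get_all_data writes current_page and current_city into config.json when it has been running for over 100 seconds and then re-raises, and self.current_city / self.current_page drive the skip-until-checkpoint logic at the top of the loop. A plausible loader for that checkpoint (the key names come from the save code above; the rest is an assumption):

import json

def load_checkpoint(path="config.json"):
    """Read back the resume point written by get_all_data's error handler."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            cfg = json.load(f)
    except FileNotFoundError:
        return None, 1                      # no checkpoint yet: start fresh
    return cfg.get("current_city"), cfg.get("current_page", 1)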
Example #21
def get_products(driver, url):
    driver.get(url)
    r = driver.page_source
    soup = BeautifulSoup(r, "lxml")
    all_li = soup.find_all('li', {"class": "gl-item"})
    for item in all_li:
        try:
            name = item.find("div", {"class": "p-name"})
            name = name.text.strip()
            image = item.find("img")
            if image.get("data-lazy-img"):
                image = "https:" + image.get("data-lazy-img")
            else:
                image = "https:" + image.get("src")
            shop = item.find("div", {"class": "p-shop"})
            shop = shop.find("a")
            if shop:
                shop = shop.get("title")
                shop = shop.strip()
            else:
                shop = ""
            price = item.find("div", {"class": "p-price"})
            price = price.find("strong", {"class": "J_price"})
            price = price.text.strip()
            about = item.find("div", {"class": "p-commit p-commit-n"})
            if not about:
                about = item.find("div", {"class": "p-commit"})
            about = about.text.strip()
            detail_url = item.find("a", {"target": "_blank"})
            detail_url = "https:" + detail_url.get("href")
            res = {
                "id": None,
                "productName": name,
                "image": image,
                "shop": shop,
                "price": price,
                "popular": about,
            }
            try:
                result = get_comment(detail_url)
            except Exception:  # comment page failed; skip this item
                continue
            res.update(result)
            jd = JingDong(**res)
            with session_scope() as session:
                session.add(jd)
        except Exception:
            print("货品条目错误")  # "货品条目错误" = bad product entry
            traceback.print_exc()  # print_exc() prints the traceback itself and returns None
            continue
Example #22
def incre_table():
    res = cate_url()
    cc = 1
    rr = 15
    for item in res.keys():
        for dd in res[item].keys():
            with session_scope() as session:
                d = BookCategory(id=cc,
                                 category_name=dd,
                                 site_id=9,
                                 site_category_id=cc,
                                 category_id=rr)
                session.add(d)
                cc = cc + 1
                rr = rr + 1
Example #23
def delete_book_id():
    session1 = session_sql()
    books = session1.query(BookSource).filter_by(site_id=9).all()
    # session1.execute()
    session1.close()
    for book in books:
        session2 = session_sql()
        # session2.query(Bookchapter).filter(Bookchapter.book_id == book.book_id).delete(synchronize_session=False)
        bs = session2.query(Bookchapter).filter_by(book_id=book.book_id).all()  # chapters are linked via book_id, not their own primary key
        session2.commit()
        books1 = session2.query(Book).filter_by(id=book.book_id).all()
        session2.close()
        for b_chapter in bs:
            if b_chapter:
                with session_scope() as session3:
                    session3.delete(b_chapter)
                    print('delete ', b_chapter.title)
        for book1 in books1:
            if book1:
                with session_scope() as session4:
                    session4.delete(book1)
                    print('delete book', book1.id)
        with session_scope() as session5:
            session5.delete(book)
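Note: delete_book_id opens a fresh session per deleted row, and the commented-out line hints at a bulk delete. A hedged single-transaction rewrite built from the same filters (whether the schema relies on ORM cascades is not verifiable from this listing):

def delete_book_id_bulk():
    with session_scope() as session:
        for book in session.query(BookSource).filter_by(site_id=9).all():
            # bulk-delete the chapters and the book row, then the source record
            session.query(Bookchapter).filter(
                Bookchapter.book_id == book.book_id).delete(synchronize_session=False)
            session.query(Book).filter(
                Book.id == book.book_id).delete(synchronize_session=False)
            session.delete(book)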
Example #24
File: tuniu.py Project: 838957471/crawlers
 def get_phone(self):
     url = "http://hotel.tuniu.com/ajax/getHotelStaticInfo?id={}&checkindate=2019-07-31&checkoutdate=2019-08-01"
     # count = 1
     with session_scope() as sess1:
         tn = sess1.query(TuNiu).filter(TuNiu.phone == None).all()
         for item in tn:
             hotel_id = item.url.split("/")[-1].strip()
             r = self.session.get(url.format(hotel_id))
             # count = count + 1
             temp = r.json()
             item.phone = temp.get("data").get("hotel").get("tel")
             item.district = temp.get("data").get("hotel").get(
                 "districtName")
             sess1.commit()
             print(temp.get("data").get("hotel").get("tel"))
             time.sleep(0.5)
Example #25
def dump_table():
    session1 = session_sql1()
    ss = session1.query(BookCategory).all()
    session1.close()
    for s1 in ss:
        d = BookCategory(id=s1.id,
                         category_major=s1.category_major,
                         category_min=s1.category_min,
                         male_female=s1.male_female,
                         sort=s1.sort,
                         time_created=s1.time_created,
                         status=s1.status,
                         cover=s1.cover,  # was s1.status: an apparent copy-paste slip
                         cate_id=s1.cate_id)
        with session_scope() as session2:
            session2.add(d)
Example #26
def cate_table():
    with session_scope() as session:
        i = 1
        for items in category(_cate):
            if items[1]:
                for item in items[1]:
                    b = BookCategory(id=None,
                                     category_major=item[0],
                                     category_min=item[1],
                                     male_female=items[0],
                                     sort=i,
                                     time_created=round(time.time()),
                                     status=1,
                                     cover='')
                    i = i + 1
                    session.add(b)
Example #27
 def _p_list(self, url, category):
     time.sleep(0.2)
     print("list url {}".format(url))
     result = {}
     result["category"] = category
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     tables = soup.find_all("table", {
         "width": "980",
         "border": "0",
         "cellspacing": "0",
         "cellpadding": "0"
     })
     res = OrderedDict()
     for table in tables:
         if "企业录推荐企业信息" in table.text:
             table = table.next_sibling.next_sibling
             trs = table.find_all("tr")
             for tr in trs:
                 tds = tr.find_all("td")
                 for td in tds:
                     ma = td.find_all("a", {"target": "_blank"})
                     for a in ma:
                         if not a.get("title"):
                             detail = (a.text, a.get("href"))
                         else:
                             products = a.text
                     font = td.find("font", {"color": "#666666"})
                     entype = font.text
                     res[detail[1]] = (detail[0], products, entype)
     for key, value in res.items():
         result["enterpriseName"] = value[0]
         result["products"] = value[1]
         result["businessModel"] = value[2]
         result["url"] = key
         if self.jump in key:  # resume point: start saving once the remembered url is seen
             self.status = True
         if not self.status:
             continue
         with session_scope() as sess:
             qy = sess.query(QYLu).filter(QYLu.url == key).first()
             if not qy:
                 resu = self._detail(key)
                 result.update(resu)
                 sou = QYLu(**result)
                 sess.add(sou)
Example #28
 def _list_item(self, url, category, location):
     time.sleep(0.6)
     print("list url {}".format(url))
     self.session.headers["Referer"] = url[:url.find(
         "/p")] + "/p{}/".format(self.index_page(url) - 1)
     r = self.session.get(url)
     r.encoding = "utf-8"
     soup = BeautifulSoup(r.text, "lxml")
     div = soup.find("div", class_="hy_lbox fl mt2")
     div = div.find("div", class_="hy_companylist")
     lis = div.find_all("li")
     for li in lis:
         res = {}
         a = li.find("a")
         if a.get("href").startswith("http"):
             d_u = a.get("href")
         else:
             d_u = "http:" + a.get("href")
         res["url"] = d_u
         res["category"] = category
         res["location"] = location
         res["enterpriseName"] = a.text
         span = li.find("span", class_="tel")
         if hasattr(span, "text"):
             res["phone"] = span.text
         dds = li.find_all("dd")
         temp = []
         for dd in dds:
             temp.append(dd.text)
         ss = "地址:"
         ss1 = "主营产品:"
         for item in temp:
             if ss in item:
                 res["address"] = item[len(ss):]
             elif ss1 in item:
                 res["products"] = item[len(ss1):]
         with session_scope() as sess:
             soule = sess.query(SouLeWang).filter(
                 SouLeWang.url == d_u).first()
             if not soule:
                 resu = self._detail(d_u)
                 res.update(resu)
                 sou = SouLeWang(**res)
                 sess.add(sou)
Example #29
def table_back():
    atts = globals()
    for item1 in datatable:
        table = atts.get(item1)
        count = 0
        session_remote = session_sql_remote()
        with session_scope() as sess1:
            ms = sess1.query(table).filter().all()
            for item in ms:
                print(item.__dict__)
                temp = item.__dict__
                temp["id"] = None
                temp.pop("_sa_instance_state")
                ta = table(**temp)
                count = count + 1
                session_remote.add(ta)
                if count % 5000 == 0:
                    session_remote.commit()
            session_remote.commit()
Example #30
 def _p_list(self, url, category):
     time.sleep(1)
     print("list {}".format(url))
     self.session.headers["Host"] = "www.cntrades.com"
     self.session.headers["Cookie"] = self.cookies.format(
         int(time.time()) - 1,
         int(time.time()) - 2)
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     divv = soup.find("div", class_="left_box")
     if not divv:
         if "404 Not Found" in r.text:
             raise Exception("404")
         if "是不是太快了?" in r.text:  # anti-crawl interstitial: "aren't you going too fast?"
             raise Exception("too fast")
         raise Exception("unexpected page layout: {}".format(url))
     divs = divv.find_all("div", class_="list")
     for div in divs:
         res = {"category": category}  # fresh record per listing block so fields don't leak between rows
         lis = div.find_all("li")
         for li in lis:
             a = li.find("a")
             if a:
                 res["url"] = a.get("href")
                 tem = li.text.split()
                 res["enterpriseName"] = tem[0]
                 res["businessModel"] = tem[-1]
             else:
                 if "主营:" in li.text:
                     res["products"] = li.text[li.text.find("主营:") + 3:]
                 else:
                     res["address"] = li.text
         td = div.find("td", {"class": "f_orange", "width": "100"})
         if td:
             res["location"] = td.text
         with session_scope() as sess:
             cns = sess.query(CnTrade).filter(
                 CnTrade.url == res["url"]).first()
             if not cns:
                 resu = self._detail(res["url"])
                 res.update(resu)
                 cn = CnTrade(**res)
                 sess.add(cn)
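Note: _p_list raises Exception("404") and Exception("too fast") when cntrades.com blocks the crawler, but no caller appears in this listing. A hedged retry wrapper one could put around it (the function name and backoff policy are illustrative, not from the project):

import time

def with_backoff(fn, *args, retries=3, base_delay=5):
    """Retry fn on the anti-crawl 'too fast' error with a growing pause."""
    for attempt in range(retries):
        try:
            return fn(*args)
        except Exception as e:
            if "too fast" in str(e) and attempt < retries - 1:
                time.sleep(base_delay * (attempt + 1))  # linear backoff
                continue
            raise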