Example #1
    def parse_list_page(self, source):
        # Link for each job posting
        tree = etree.HTML(source)
        list_li = tree.xpath('//*[@id="main"]/div/div[2]/ul/li')
        print("type: %s, size: %d" % (type(list_li), len(list_li)))
        for li in list_li:
            # Job title
            post = li.xpath("./div/div/div/div/div/span/a/text()")
            # Salary
            salary = li.xpath("./div/div/div/div/div[2]/span/text()")
            # Requirements (work experience, education)
            condition = li.xpath("./div/div/div/div/div[2]/p/text()")
            # Work experience
            working_age = condition[0]
            # Education
            education = condition[1]
            # Company
            company = li.xpath("./div/div/div[2]/div/h3/a/text()")
            print(
                "post: %s, salary: %s, working_age: %s, education: %s, company:%s "
                % (post[0], salary[0], working_age, education, company[0]))

            boss_item = {
                "post": post,
                "salary": salary,
                "condition": condition,
                "working_age": working_age,
                "education": education,
                "company": company
            }
            save_item(boss_item)

Example #2
File: bing_spider.py  Project: xywc-s/cl
def parse_list(response):
    soup = bs(response.content.decode('utf-8'))
    for el in list(soup.select('#b_results > li')):
        try:
            save_item({
                'title': el.select_one('h2').text,
                'url': el.select_one('h2 a').attrs.get('href'),
                'abstract': el.select_one('.b_caption p').text,
            })
        except AttributeError:
            # skip result blocks missing a title, link, or abstract (e.g. ads)
            pass
Example #3
 def parse_data(self):
     while True:
         try:
             url = self.r2.get_page_url("上海")
             print(url)
         except:
             # stop once get_page_url fails (no more page URLs)
             break
         headers = self.headers_fordata
         headers["Referer"] = url
         html = requests.get(url=url, headers=headers)
         res = etree.HTML(html.text)
         try:
             outName = res.xpath(
                 '/html/body/div[3]/div[1]/div[2]/div[4]/text()')[0]
             phone = res.xpath(
                 '/html/body/div[3]/div[1]/div[2]/div[6]/span/text()')[0]
             companyName = res.xpath(
                 '/html/body/div[3]/div[1]/div[1]/h2/text()')[0]
         except:
             # skip pages where the expected fields are missing
             continue
         if is_phone(phone):
             if "企业管理" not in str(companyName):
                 print(companyName)
                 item = {}
                 item['companyCity'] = "成都"
                 item['companyProvince'] = "四川省"
                 item['code'] = 'BUS_YT_ZZ'
                 item['name'] = '资质'
                 item['busCode'] = ''
                 item['webUrl'] = '无'
                 item['orgId'] = ''
                 item['deptId'] = ''
                 item['centreId'] = ''
                 item["companyName"] = companyName
                 item["outName"] = outName
                 item["resourceRemark"] = ''
                 item["companyTel"] = str(phone)
                 item["ibossNum"] = None
                 item['isDir'] = 0
                 item['isShare'] = 0
                 item["_id"] = md5encryption(item["companyTel"])
                 # item["flag"] = 0
                 print(item)
                 self.m.mongo_add(item)
                 save_item(item)
         else:
             continue
Example #4
 def test_save_item(self):
     for i in range(10):
         save_item({'url': url, 'title': str(i)})
     dedup_field = get_dedup_field()
     table_name = get_collection()
     conn = get_conn()
     cursor = conn.cursor()
     cursor.execute(
         f'SELECT count(*) FROM {table_name} WHERE {dedup_field} = \'{url}\''
     )
     conn.commit()
     res = cursor.fetchone()
     # the ten saves above share the same url, so dedup must leave exactly one row
     assert res[0] == 1
     cursor.execute(
         f'SELECT url,title FROM {table_name} WHERE {dedup_field} = \'{url}\''
     )
     conn.commit()
     res = cursor.fetchone()
     # the surviving row keeps the title from the last save
     assert res[1] == '9'
     cursor.execute(
         f'DELETE FROM {table_name} WHERE {dedup_field} = \'{url}\'')
     conn.commit()
     cursor.close()
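
The assertions above imply that save_item deduplicates on a configurable field and keeps the values from the most recent call. As a rough, hypothetical sketch only (not the library's actual implementation), a comparable upsert helper over SQLite could look like this:

import sqlite3

# Hypothetical stand-ins for the get_conn/get_collection/get_dedup_field
# helpers used in the test above.
TABLE_NAME = "items"
DEDUP_FIELD = "url"


def get_conn():
    conn = sqlite3.connect("items.db")
    conn.execute(
        f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} "
        f"({DEDUP_FIELD} TEXT PRIMARY KEY, title TEXT)")
    return conn


def save_item(item):
    # Upsert keyed on the dedup field (needs SQLite >= 3.24): repeated saves
    # of the same url leave a single row holding the latest values.
    conn = get_conn()
    with conn:
        conn.execute(
            f"INSERT INTO {TABLE_NAME} (url, title) VALUES (:url, :title) "
            f"ON CONFLICT({DEDUP_FIELD}) DO UPDATE SET title = excluded.title",
            item)
    conn.close()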
Example #5
 def save_data(self, data_list):
     """数据入库"""
     for data in data_list:
         save_item(data)
         print("save data", data)
Example #6
    def process_item(self, item, spider):
        result = Result(item)
        result.set_task_id(get_task_id())
        save_item(result)

        return item
Example #7
 def save_data(self, data_list):
     for data in data_list:
         print(data)
         save_item(data)
Example #8
    def parse_sub(self, response):
        city = response.meta['city']
        le = LinkExtractor(allow=r'http://www.gckzw.com/detail-\d+.html')
        links = le.extract_links(response)
        link_next = response.selector.xpath(
            '//div[@class="pager_divider travel_celarfix"]/a['
            '@class="prev_btn"]/@href').get()
        time.sleep(10)
        if link_next:
            yield scrapy.Request(response.urljoin(link_next),
                                 callback=self.parse_sub,
                                 meta={'city': city})
        if links:
            item = NationalSimJdItem()
            prices = response.selector.xpath(
                '//span[@class="price_num"]/text()').extract()
            comments = response.selector.xpath(
                '//span[@class="evaluate_num"]/text()').extract()
            introductions = response.selector.xpath(
                '//div[@class="travel_hotel_intro_intro"]/text()').extract()
            addresss = response.selector.xpath(
                '//p[@class="travel_hotel_intro_address"]/text()').extract()
            province = None
            seq = None
            for index, pcs in enumerate(province_city.values()):
                for pc in pcs:
                    if city == pc.split(',', 1)[0]:
                        province = list(province_city.keys())[index]
                        seq = pc

            for link, price, comment, introduction, address in zip(
                    links, prices, comments, introductions, addresss):
                item['name'] = link.text
                hotel_detail = {}
                hotel_detail['seq'] = seq
                hotel_detail['国家'] = '中国'
                hotel_detail['省份'] = province
                hotel_detail['城市'] = city
                hotel_detail['商圈'] = re.sub(string=address,
                                            pattern="地址:",
                                            repl='')
                hotel_detail['是否为客栈'] = random.choice(
                    ['0', '1', '1', '0', '0', '1'])
                hotel_detail['酒店星级'] = random.randint(1, 6)
                hotel_detail['业务部门'] = random.randint(0, 5)
                hotel_detail['剩余房间'] = random.randint(0, 30)
                hotel_detail['图片数'] = random.randint(0, 15)
                hotel_detail['酒店评分'] = random.randint(1, 10)
                hotel_detail['用户点评数'] = comment
                hotel_detail['城市平均实住间夜'] = random.uniform(45, 60)
                hotel_detail['酒店总订单'] = random.randrange(100, 400)
                hotel_detail['酒店总间夜'] = random.randrange(
                    hotel_detail['酒店总订单'] - 50, hotel_detail['酒店总订单'])
                hotel_detail['酒店实住订单'] = random.randrange(
                    hotel_detail['酒店总订单'] - 50, hotel_detail['酒店总订单'])
                hotel_detail['酒店实住间夜'] = random.randrange(
                    hotel_detail['酒店总间夜'] - 40, hotel_detail['酒店总间夜'])
                hotel_detail['酒店直销订单'] = random.randrange(
                    int(hotel_detail['酒店总订单'] / 2), hotel_detail['酒店总订单'])
                hotel_detail['酒店直销间夜'] = random.randrange(
                    int(hotel_detail['酒店总间夜'] / 2), hotel_detail['酒店总间夜'])
                hotel_detail['酒店直销实住订单'] = random.randrange(
                    hotel_detail['酒店实住间夜'] - 10, hotel_detail['酒店实住间夜'])
                hotel_detail['酒店直销实住间夜'] = random.randrange(
                    hotel_detail['酒店直销间夜'] - 10, hotel_detail['酒店直销间夜'])
                hotel_detail['酒店直销拒单'] = hotel_detail['酒店直销订单'] % 10
                hotel_detail['酒店直销拒单率'] = random.uniform(
                    0, hotel_detail['酒店直销拒单'] / 100)
                hotel_detail['城市直销拒单率'] = random.uniform(
                    0, hotel_detail['酒店直销拒单'] / 100)
                hotel_detail['拒单率是否小于等于直销城市均值'] = hotel_detail[
                    '酒店直销拒单率'] <= hotel_detail['城市直销拒单率']
                hotel_detail['最低房间价格'] = price
                hotel_detail['简介'] = re.sub(string=introduction,
                                            pattern="\\s+|简介:",
                                            repl='')
                hotel_detail['酒店链接'] = link.url
                item['detail'] = hotel_detail
                print('获取成功')
                save_item(item)
                yield item
Example #9
    def get_list(self):

        url = "https://www.amazon.com/s?k=%E6%8A%95%E5%BD%B1%E4%BB%AA&i=deals-intl-ship&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2"

        headers = {
            "accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding":
            "gzip, deflate, br",
            "accept-language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control":
            "max-age=0",
            "cookie":
            'session-id=132-0147845-7026244; session-id-time=2082787201l; sp-cdn="L5Z9:HK"; skin=noskin; ubid-main=130-2268314-5653533; x-wl-uid=1BZcM1Xxkjc3RhKfgiIa11jfjof88AhAJYz1T18X8GghTikXD6B3Ls+3gnOduE9zfhaerZwFLcUA=; session-token=1mjBahMcQH2PKdNRs3+kVNDR9Mo3m+o1kAk5lDb3mNDv0UsyIlJswEyo0AQnq6D7krqktVu6td1QY92cLGC0tM0RTk1aHAUUoVsTjJYGTbuioBjFFvnd7HT//xEhZCwHSfhv73QamKe3jljcZ7q3XBefqcBcTkd0bX+/qFGokpaQngCwOFkWptTdKh+3W9xNEAgvK4hrBxX98cq+FVTmfumsAbsRAhLo26gzQZ6lVMmZUE7lIMlZ80adcQQj+wkNkNsE+tWKQpQ=; lc-main=zh_CN; i18n-prefs=USD; csm-hit=tb:3W2R1T4Y9TQCQ8A5R033+s-RFDGE3KCD60HNQ6PXG28|1594636706586&t:1594636706586&adb:adblk_no',
            "downlink":
            "10",
            "ect":
            "4g",
            "referer":
            "https://www.amazon.com/international-sales-offers/b/ref=gbps_fcr_m-9_475e_wht_1064954?currency=USD&language=zh_CN&node=15529609011&gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL%252CEXPIRED%252CSOLDOUT%252CUPCOMING,sortOrder:BY_SCORE,MARKETING_ID:ship_export,enforcedCategories:679255011&pf_rd_p=5d86def2-ec10-4364-9008-8fbccf30475e&pf_rd_s=merchandised-search-9&pf_rd_t=101&pf_rd_i=15529609011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=0MGJEHJDMNK7AZKN01CR&ie=UTF8",
            "rtt":
            "150",
            "sec-fetch-dest":
            "document",
            "sec-fetch-mode":
            "navigate",
            "sec-fetch-site":
            "same-origin",
            "sec-fetch-user":
            "******",
            "upgrade-insecure-requests":
            "1",
            "user-agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
        }

        try:
            res = requests.get(url=url, headers=headers)
            print("res.status_code: %d" % res.status_code)
            if res.status_code == 200:
                tree = etree.HTML(res.text)
                list_li = tree.xpath(
                    '//div[@data-component-type="s-search-result"]')
                print("type: %s, size: %d" % (type(list_li), len(list_li)))
                for item in list_li:
                    # print("item: %s" % item)
                    index = item.xpath("@data-index")[0]
                    asin = item.xpath("@data-asin")[0]
                    uuid = item.xpath("@data-uuid")[0]
                    print("index: %s, asin: %s, uuid: %s" %
                          (index, asin, uuid))

                    # Description
                    description = item.xpath(
                        './div/span/div/div/div[2]/div[2]/div/div/div/div/div/h2/a/span/text()'
                    )[0]
                    print("description: %s" % description)

                    # Star rating
                    star = item.xpath(
                        './div/span/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/span/span/a/i/span/text()'
                    )[0]
                    print("star: %s" % star)

                    # Total ratings
                    sum_star = item.xpath(
                        './div/span/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/span[2]/a/span/text()'
                    )[0]
                    print("sum_star: %s" % sum_star)

                    # Current price
                    now_price = ""
                    try:
                        now_price = item.xpath(
                            './div/span/div/div/div[2]/div[2]/div/div[2]/div/div/div/div[2]/div/a/span/span/text()'
                        )[0]
                    except Exception as e:
                        now_price = "未报价"

                    print("now_price: %s" % now_price)

                    # Original price
                    org_price = ""
                    try:
                        org_price = item.xpath(
                            './div/span/div/div/div[2]/div[2]/div/div[2]/div/div/div/div[2]/div/a/span[2]/span/text()'
                        )[0]
                    except Exception as e:
                        org_price = "没有原价"

                    print("org_price: %s" % org_price)

                    print()
                    save_params = {
                        "index": index,
                        "asin": asin,
                        "uuid": uuid,
                        "desc": description,
                        "star": star,
                        "sum_star": sum_star,
                        "now_price": now_price,
                        "org_price": org_price
                    }
                    # Save the data
                    save_item(save_params)
            else:
                print("请求失败! status_code: %d" % res.status_code)
        except Exception as e:
            print("error: %s" % e)

Example #10
 def save_data(self, data):
     save_item(data)
Example #11
 def test_save_item(self):
     save_item(self.basic_item)
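
Across all of the examples, the calling contract is the same: build a flat dict of fields and hand it to save_item, which takes care of persistence (and, per the test in Example #4, deduplication). A minimal caller in that style, with an invented page layout and XPath, might look like the following; save_item is assumed to be the same storage helper imported by the projects above:

import requests
from lxml import etree


def crawl_list(url):
    # Fetch one listing page and pass each row to save_item as a flat dict.
    # The selector and field names here are illustrative only.
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    tree = etree.HTML(res.text)
    for row in tree.xpath('//ul[@class="list"]/li'):
        title = row.xpath('./a/text()')
        link = row.xpath('./a/@href')
        if not title or not link:
            continue  # skip rows missing either field
        save_item({"title": title[0].strip(), "url": link[0]})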