def parse_list_page(self, source):
    # One <li> per job posting on the listing page.
    tree = etree.HTML(source)
    list_li = tree.xpath('//*[@id="main"]/div/div[2]/ul/li')
    print("type: %s, size: %d" % (type(list_li), len(list_li)))
    for li in list_li:
        # Job title (xpath() returns a list; index 0 is the text node)
        post = li.xpath("./div/div/div/div/div/span/a/text()")
        # Salary
        salary = li.xpath("./div/div/div/div/div[2]/span/text()")
        # Requirements: [years of experience, education]
        condition = li.xpath("./div/div/div/div/div[2]/p/text()")
        working_age = condition[0]
        education = condition[1]
        # Company name
        company = li.xpath("./div/div/div[2]/div/h3/a/text()")
        print("post: %s, salary: %s, working_age: %s, education: %s, company: %s"
              % (post[0], salary[0], working_age, education, company[0]))
        boss_item = {
            "post": post,
            "salary": salary,
            "condition": condition,
            "working_age": working_age,
            "education": education,
            "company": company,
        }
        save_item(boss_item)
def parse_list(response):
    # Parse a Bing-style results page; each <li> under #b_results is one hit.
    soup = bs(response.content.decode('utf-8'), 'html.parser')
    for el in soup.select('#b_results > li'):
        try:
            save_item({
                'title': el.select_one('h2').text,
                'url': el.select_one('h2 a').attrs.get('href'),
                'abstract': el.select_one('.b_caption p').text,
            })
        except AttributeError:
            # Skip list items (ads, pagination, widgets) missing the expected markup.
            continue
def parse_data(self):
    while True:
        # Pull the next detail-page URL from the URL queue; stop when it is exhausted.
        try:
            url = self.r2.get_page_url("上海")
            print(url)
        except Exception:
            break
        headers = self.headers_fordata
        headers["Referer"] = url
        html = requests.get(url=url, headers=headers)
        res = etree.HTML(html.text)
        # Absolute XPaths are brittle; skip pages that do not match the layout.
        try:
            outName = res.xpath(
                '/html/body/div[3]/div[1]/div[2]/div[4]/text()')[0]
            phone = res.xpath(
                '/html/body/div[3]/div[1]/div[2]/div[6]/span/text()')[0]
            companyName = res.xpath(
                '/html/body/div[3]/div[1]/div[1]/h2/text()')[0]
        except IndexError:
            continue
        if not is_phone(phone):
            continue
        # Skip generic "企业管理" (business-management) shell companies.
        if "企业管理" in str(companyName):
            continue
        print(companyName)
        item = {
            'companyCity': "成都",
            'companyProvince': "四川省",
            'code': 'BUS_YT_ZZ',
            'name': '资质',
            'busCode': '',
            'webUrl': '无',
            'orgId': '',
            'deptId': '',
            'centreId': '',
            'companyName': companyName,
            'outName': outName,
            'resourceRemark': '',
            'companyTel': str(phone),
            'ibossNum': None,
            'isDir': 0,
            'isShare': 0,
        }
        # Deduplicate on an MD5 digest of the phone number.
        item["_id"] = md5encryption(item["companyTel"])
        print(item)
        self.m.mongo_add(item)
        save_item(item)
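# is_phone() and md5encryption() are used above but not defined in this file.
# A minimal sketch of plausible implementations, assuming is_phone() validates a
# mainland-China mobile number and md5encryption() returns a hex MD5 digest used
# as a stable _id; both behaviors are assumptions, not the project's actual code.
import hashlib
import re


def is_phone(value):
    # Assumed rule: 11 digits starting with 1 (mainland mobile format).
    return bool(re.fullmatch(r'1\d{10}', str(value).strip()))


def md5encryption(value):
    # Hex MD5 of the string, suitable as a deduplication key.
    return hashlib.md5(str(value).encode('utf-8')).hexdigest()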
def test_save_item(self):
    # Saving ten items that share the dedup field should leave exactly one row,
    # holding the last title written.
    for i in range(10):
        save_item({'url': url, 'title': str(i)})
    dedup_field = get_dedup_field()
    table_name = get_collection()
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute(
        f'SELECT count(*) FROM {table_name} WHERE {dedup_field} = \'{url}\'')
    res = cursor.fetchone()
    assert res[0] == 1
    cursor.execute(
        f'SELECT url, title FROM {table_name} WHERE {dedup_field} = \'{url}\'')
    res = cursor.fetchone()
    assert res[1] == '9'
    # Clean up the test row.
    cursor.execute(f'DELETE FROM {table_name} WHERE {dedup_field} = \'{url}\'')
    conn.commit()
    cursor.close()
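# The test above pins down save_item()'s contract: writing ten items with the
# same value in the dedup field leaves exactly one row, holding the last title.
# A minimal upsert sketch of that contract against sqlite3, for illustration
# only; the real save_item, get_conn, get_collection and get_dedup_field
# implementations are not shown in this file.
import sqlite3


def save_item_sketch(conn, table, dedup_field, item):
    # Delete-then-insert keyed on the dedup field, so the last write wins.
    cursor = conn.cursor()
    cursor.execute(f'DELETE FROM {table} WHERE {dedup_field} = ?',
                   (item[dedup_field],))
    columns = ', '.join(item)
    placeholders = ', '.join('?' for _ in item)
    cursor.execute(f'INSERT INTO {table} ({columns}) VALUES ({placeholders})',
                   tuple(item.values()))
    conn.commit()
    cursor.close()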
def save_data(self, data_list):
    """Persist each record in the batch."""
    for data in data_list:
        save_item(data)
        print("save data", data)
def process_item(self, item, spider):
    result = Result(item)
    result.set_task_id(get_task_id())
    save_item(result)
    return item
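# Result and get_task_id() come from the surrounding project and are not defined
# here. A guess at their shape, for illustration only: Result wraps the scraped
# item and set_task_id() stamps it with the current crawl-task identifier.
class Result(dict):
    def __init__(self, item):
        super().__init__(item)

    def set_task_id(self, task_id):
        # Attach the task id so stored rows trace back to their crawl run.
        self['task_id'] = task_id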
def save_data(self, data_list):
    for data in data_list:
        print(data)
        save_item(data)
def parse_sub(self, response):
    city = response.meta['city']
    le = LinkExtractor(allow=r'http://www.gckzw.com/detail-\d+.html')
    links = le.extract_links(response)
    link_next = response.selector.xpath(
        '//div[@class="pager_divider travel_celarfix"]/a['
        '@class="prev_btn"]/@href').get()
    time.sleep(10)
    if link_next:
        yield scrapy.Request(response.urljoin(link_next),
                             callback=self.parse_sub,
                             meta={'city': city})
    if links:
        item = NationalSimJdItem()
        prices = response.selector.xpath(
            '//span[@class="price_num"]/text()').extract()
        comments = response.selector.xpath(
            '//span[@class="evaluate_num"]/text()').extract()
        introductions = response.selector.xpath(
            '//div[@class="travel_hotel_intro_intro"]/text()').extract()
        addresss = response.selector.xpath(
            '//p[@class="travel_hotel_intro_address"]/text()').extract()
        # Resolve the province and the "city,seq" entry matching this city.
        province = None
        seq = None
        for index, pcs in enumerate(province_city.values()):
            for pc in pcs:
                if city == pc.split(',', 1)[0]:
                    province = list(province_city.keys())[index]
                    seq = pc
        for link, price, comment, introduction, address in zip(
                links, prices, comments, introductions, addresss):
            item['name'] = link.text
            # Scraped fields plus randomly simulated business metrics.
            hotel_detail = {}
            hotel_detail['seq'] = seq
            hotel_detail['国家'] = '中国'
            hotel_detail['省份'] = province
            hotel_detail['城市'] = city
            hotel_detail['商圈'] = re.sub(string=address, pattern="地址:", repl='')
            hotel_detail['是否为客栈'] = random.choice(
                ['0', '1', '1', '0', '0', '1'])
            hotel_detail['酒店星级'] = random.randint(1, 6)
            hotel_detail['业务部门'] = random.randint(0, 5)
            hotel_detail['剩余房间'] = random.randint(0, 30)
            hotel_detail['图片数'] = random.randint(0, 15)
            hotel_detail['酒店评分'] = random.randint(1, 10)
            hotel_detail['用户点评数'] = comment
            hotel_detail['城市平均实住间夜'] = random.uniform(45, 60)
            hotel_detail['酒店总订单'] = random.randrange(100, 400)
            hotel_detail['酒店总间夜'] = random.randrange(
                hotel_detail['酒店总订单'] - 50, hotel_detail['酒店总订单'])
            hotel_detail['酒店实住订单'] = random.randrange(
                hotel_detail['酒店总订单'] - 50, hotel_detail['酒店总订单'])
            hotel_detail['酒店实住间夜'] = random.randrange(
                hotel_detail['酒店总间夜'] - 40, hotel_detail['酒店总间夜'])
            hotel_detail['酒店直销订单'] = random.randrange(
                int(hotel_detail['酒店总订单'] / 2), hotel_detail['酒店总订单'])
            hotel_detail['酒店直销间夜'] = random.randrange(
                int(hotel_detail['酒店总间夜'] / 2), hotel_detail['酒店总间夜'])
            hotel_detail['酒店直销实住订单'] = random.randrange(
                hotel_detail['酒店实住间夜'] - 10, hotel_detail['酒店实住间夜'])
            hotel_detail['酒店直销实住间夜'] = random.randrange(
                hotel_detail['酒店直销间夜'] - 10, hotel_detail['酒店直销间夜'])
            hotel_detail['酒店直销拒单'] = hotel_detail['酒店直销订单'] % 10
            hotel_detail['酒店直销拒单率'] = random.uniform(
                0, hotel_detail['酒店直销拒单'] / 100)
            hotel_detail['城市直销拒单率'] = random.uniform(
                0, hotel_detail['酒店直销拒单'] / 100)
            hotel_detail['拒单率是否小于等于直销城市均值'] = (
                hotel_detail['酒店直销拒单率'] <= hotel_detail['城市直销拒单率'])
            hotel_detail['最低房间价格'] = price
            hotel_detail['简介'] = re.sub(string=introduction,
                                        pattern="\\s+|简介:", repl='')
            hotel_detail['酒店链接'] = link.url
            item['detail'] = hotel_detail
            print('Fetched successfully')
            save_item(item)
            yield item
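# province_city is defined elsewhere; the lookup above implies it maps a province
# name to a list of "city,seq" strings, where split(',', 1)[0] is the city name
# and the whole string is stored as seq. Illustrative values only:
#
# province_city = {
#     '四川省': ['成都,SC-CD', '绵阳,SC-MY'],
#     '云南省': ['昆明,YN-KM'],
# }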
def get_list(self):
    url = "https://www.amazon.com/s?k=%E6%8A%95%E5%BD%B1%E4%BB%AA&i=deals-intl-ship&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2"
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "cache-control": "max-age=0",
        "cookie": 'session-id=132-0147845-7026244; session-id-time=2082787201l; sp-cdn="L5Z9:HK"; skin=noskin; ubid-main=130-2268314-5653533; x-wl-uid=1BZcM1Xxkjc3RhKfgiIa11jfjof88AhAJYz1T18X8GghTikXD6B3Ls+3gnOduE9zfhaerZwFLcUA=; session-token=1mjBahMcQH2PKdNRs3+kVNDR9Mo3m+o1kAk5lDb3mNDv0UsyIlJswEyo0AQnq6D7krqktVu6td1QY92cLGC0tM0RTk1aHAUUoVsTjJYGTbuioBjFFvnd7HT//xEhZCwHSfhv73QamKe3jljcZ7q3XBefqcBcTkd0bX+/qFGokpaQngCwOFkWptTdKh+3W9xNEAgvK4hrBxX98cq+FVTmfumsAbsRAhLo26gzQZ6lVMmZUE7lIMlZ80adcQQj+wkNkNsE+tWKQpQ=; lc-main=zh_CN; i18n-prefs=USD; csm-hit=tb:3W2R1T4Y9TQCQ8A5R033+s-RFDGE3KCD60HNQ6PXG28|1594636706586&t:1594636706586&adb:adblk_no',
        "downlink": "10",
        "ect": "4g",
        "referer": "https://www.amazon.com/international-sales-offers/b/ref=gbps_fcr_m-9_475e_wht_1064954?currency=USD&language=zh_CN&node=15529609011&gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL%252CEXPIRED%252CSOLDOUT%252CUPCOMING,sortOrder:BY_SCORE,MARKETING_ID:ship_export,enforcedCategories:679255011&pf_rd_p=5d86def2-ec10-4364-9008-8fbccf30475e&pf_rd_s=merchandised-search-9&pf_rd_t=101&pf_rd_i=15529609011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=0MGJEHJDMNK7AZKN01CR&ie=UTF8",
        "rtt": "150",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "******",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    try:
        res = requests.get(url=url, headers=headers)
        print("res.status_code: %d" % res.status_code)
        if res.status_code == 200:
            tree = etree.HTML(res.text)
            list_li = tree.xpath('//div[@data-component-type="s-search-result"]')
            print("type: %s, size: %d" % (type(list_li), len(list_li)))
            for item in list_li:
                index = item.xpath("@data-index")[0]
                asin = item.xpath("@data-asin")[0]
                uuid = item.xpath("@data-uuid")[0]
                print("index: %s, asin: %s, uuid: %s" % (index, asin, uuid))
                # Product description
                description = item.xpath(
                    './div/span/div/div/div[2]/div[2]/div/div/div/div/div/h2/a/span/text()')[0]
                print("description: %s" % description)
                # Star rating
                star = item.xpath(
                    './div/span/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/span/span/a/i/span/text()')[0]
                print("star: %s" % star)
                # Total number of ratings
                sum_star = item.xpath(
                    './div/span/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/span[2]/a/span/text()')[0]
                print("sum_star: %s" % sum_star)
                # Current price; "未报价" marks items without a quoted price
                try:
                    now_price = item.xpath(
                        './div/span/div/div/div[2]/div[2]/div/div[2]/div/div/div/div[2]/div/a/span/span/text()')[0]
                except IndexError:
                    now_price = "未报价"
                print("now_price: %s" % now_price)
                # Original price; "没有原价" marks items without one
                try:
                    org_price = item.xpath(
                        './div/span/div/div/div[2]/div[2]/div/div[2]/div/div/div/div[2]/div/a/span[2]/span/text()')[0]
                except IndexError:
                    org_price = "没有原价"
                print("org_price: %s" % org_price)
                print()
                save_params = {
                    "index": index,
                    "asin": asin,
                    "uuid": uuid,
                    "desc": description,
                    "star": star,
                    "sum_star": sum_star,
                    "now_price": now_price,
                    "org_price": org_price,
                }
                # Persist the record
                save_item(save_params)
        else:
            print("Request failed! status_code: %d" % res.status_code)
    except Exception as e:
        print("error: %s" % e)
def save_data(self, data):
    save_item(data)
def test_save_item(self):
    save_item(self.basic_item)