def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    if not response:
        return []
    selector = etree.HTML(response)
    lis = selector.xpath("//div[@class='main']/div[@id='left']/ul[@class='app-list']//li")
    return lis
def loop_request(self, lis, first_page=True, **kwargs):
    """Loop over the search-result elements and request each detail page."""
    for li in lis:
        if not li[1].xpath("div/p[@class='info']"):
            continue
        enter_url = self.get_enter_url(li)  # detail page url
        inner_response = RqCompoent.get(enter_url, **self.add_headers)
        if inner_response:
            self.inner_response = inner_response
            self.li = li
            app_name = self.get_app_name(li)
            app_name = self.judge_null(app_name)
            app_name = self.field_strip(app_name)
            img_address = self.get_img_address(li)
            app_intro = self.get_app_intro(inner_response)
            fields = self.parse_app_info_page(inner_response)
            to_sink = [self.name, app_name, *fields, img_address, app_intro]
            print(*to_sink[:-1], to_sink[-1][:20])
            sleep(self.delay_time)
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.post(self.url, data={})
    big_dict = json.loads(response)
    html = big_dict.get("html").replace("=/", "=")
    selector = etree.HTML(html)
    lis = selector.xpath("//div[@class='cp-box clearfix']")
    return lis
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    self.outer_response = response
    selector = etree.HTML(response)
    lis = selector.xpath('//div[@class="m-solist"]//div')
    return lis
def loop_request(self, lis, first_page=True, **kwargs):
    """Loop over the search-result elements and request each detail page."""
    for li in lis:
        enter_url = self.get_enter_url(li)  # detail page url
        # skip links that do not end in .html: they are not detail pages
        if not enter_url.endswith(".html"):
            continue
        inner_response = RqCompoent.get(enter_url, **self.add_headers)
        if inner_response:
            self.inner_response = inner_response
            self.li = li
            app_name = self.get_app_name(li)
            app_name = self.judge_null(app_name)
            app_name = self.field_strip(app_name)
            img_address = self.get_img_address(li)
            app_intro = self.get_app_intro(inner_response)
            fields = self.parse_app_info_page(inner_response)
            to_sink = [self.name, app_name, *fields, img_address, app_intro]
            print(*to_sink[:-1], to_sink[-1][:20])
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(self.url)
    self.outer_response = response
    selector = etree.HTML(response)
    lis = selector.xpath("//div[@class='result_box']//dl")
    return lis
def get_app_list_elements(self, url="http://s.5577.com") -> list: response = RqCompoent.get(self.url) self.outer_response = response selector = etree.HTML(response) lis = selector.xpath("//div[@class='g-left f-fl']//div[@class='m-cont-list']") print(lis) return lis
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    self.outer_response = response
    selector = etree.HTML(response)
    lis1 = selector.xpath('//dl[@id="result"]//dt')
    lis2 = selector.xpath('//dl[@id="result"]//dd')
    return list(zip(lis1, lis2))
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    selector = etree.HTML(response)
    lis = selector.xpath("//div[@class='main']/div[@class='SeaCon']/ul//li")
    return lis
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    # strip the JSONP callback wrapper so the payload can be parsed as JSON
    response = re.findall(
        r"jQuery191018405249964393477_1594619120728\((.*?)\);", response, re.S)[0]
    big_dict = json.loads(response)
    items = big_dict.get("data")
    return items
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    selector = etree.HTML(response)
    lis = selector.xpath("//ul[@class='MlistC']//li")
    extra = selector.xpath("//ul[@class='MlistD']//li")
    if extra:
        lis.append(extra[0])
    return lis
def run():
    keyword = "支付"
    driver = webdriver.Chrome()
    driver.get("https://www.wandoujia.com/")
    driver.maximize_window()
    driver.implicitly_wait(6)
    search_input = driver.find_element_by_xpath('//input[@class="key-ipt"]')
    search_input.clear()
    search_input.send_keys(keyword)
    search_input.send_keys(Keys.ENTER)
    driver.implicitly_wait(6)
    # move to the "查看更多" (view more) link and click it
    move_to = driver.find_element_by_partial_link_text("查看更多")
    ActionChains(driver).move_to_element(move_to).perform()
    move_to.click()
    time.sleep(2)
    driver.implicitly_wait(2)
    page_source = driver.page_source
    selector = etree.HTML(page_source)
    enter_urls = selector.xpath('//h2[@class="app-title-h2"]/a/@href')
    print(enter_urls)
    driver.quit()
    for enter_url in enter_urls:
        response = RqCompoent.get(enter_url)
        selector = etree.HTML(response)
        app_name = judge_null(selector.xpath('//span[@class="title"]/text()'))
        download_url = judge_null(selector.xpath('//div[@class="download-wp"]/a[2]/@href'))
        update_time = judge_null(selector.xpath('//span[@class="update-time"]/text()'))
        if update_time:
            update_time = update_time.split(":")[1].strip()
        version = judge_null(re.findall("版本</dt><dd>(.*?)<", response, re.S))
        if version:
            version = version.split(";")[1].strip()
        author = judge_null(re.findall("开发者</dt><dd><.*?>([\u4E00-\u9FA5]+)<", response, re.S))
        img = judge_null(selector.xpath('//div[@class="app-icon"]/img/@src'))
        intro = judge_null(selector.xpath('string(//div[@class="desc-info"]/div/div)'))
        print(app_name, download_url, update_time, version, author, img, intro)
        # clean up: just reorder the fields before sinking
        row = ["豌豆荚", app_name, version, update_time, author, download_url, img, intro]
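# run() above (and the store classes elsewhere in this file) rely on a judge_null helper that
# is not defined here. A minimal sketch, assuming it only collapses an XPath/findall result to
# its first element and maps empty results to None:
def judge_null(result):
    """Hypothetical helper: first element of a non-empty result, otherwise None."""
    if not result:
        return None
    return result[0] if isinstance(result, (list, tuple)) else result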
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    self.outer_response = response
    selector = etree.HTML(response)
    lis = selector.xpath("//div[@id='center']/div[@id='results']//div[@class='result f s0']")
    return lis
def loop_request(self, items, first_page=True):
    """Loop over the search results and request each detail page."""
    if not items:
        return
    for item in items:
        enter_url = None
        try:
            enter_url = self.get_enter_url(item)  # detail page url
        except Exception:
            pass
        if not enter_url:
            continue
        app_name = None
        img_address = None
        app_intro = None
        try:
            inner_response = RqCompoent.get(enter_url, **self.add_headers)
        except Exception:
            inner_response = None
        if inner_response:
            self.item = item
            self.inner_response = inner_response
            try:
                # get the app name first to compare against the keyword
                app_name = self.get_app_name(item)
            except Exception:
                pass
            app_name = self.field_strip(app_name)
            try:
                img_address = self.get_img_address(item)
            except Exception:
                pass
            try:
                app_intro = self.get_app_intro(inner_response)
            except Exception:
                pass
            fields = self.parse_app_info_page(inner_response, item)
            to_sink = [self.name, app_name, *fields, img_address, app_intro]
            res = []
            for i in to_sink:
                if not i:
                    res.append(None)
                else:
                    res.append(pymysql.escape_string(i))
            try:
                print(*res[:-1], res[-1][:20])
            except Exception:
                pass
            sql = ("insert into spider_app(appStore, appName, version, updateTime, author, downloadUrl, "
                   "icon, introduction, inList, platform, insertTime, keyword, enter_url) "
                   "values(%s, %s, %s, %s, %s, %s, %s, %s, '否', '安卓', %s, %s, %s)")
            res = [*res,
                   pymysql.escape_string(time.strftime("%Y/%m/%d", time.localtime())),
                   pymysql.escape_string(self.keyword),
                   pymysql.escape_string(enter_url)]
            try:
                self.cursor.execute(sql, res)
            except Exception:
                pass
            sleep(self.delay_time)
    self.db.commit()
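# The INSERT above assumes a spider_app table that is created elsewhere. Its real schema is
# not part of this file; a plausible MySQL definition matching the column names used here
# (all column types and lengths are assumptions) might be:
CREATE_SPIDER_APP = """
CREATE TABLE IF NOT EXISTS spider_app (
    id           INT AUTO_INCREMENT PRIMARY KEY,
    appStore     VARCHAR(64),
    appName      VARCHAR(255),
    version      VARCHAR(64),
    updateTime   VARCHAR(64),
    author       VARCHAR(255),
    downloadUrl  VARCHAR(1024),
    icon         VARCHAR(1024),
    introduction TEXT,
    inList       VARCHAR(8),
    platform     VARCHAR(16),
    insertTime   VARCHAR(32),
    keyword      VARCHAR(64),
    enter_url    VARCHAR(1024)
) DEFAULT CHARSET=utf8mb4;
"""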
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    if not response:
        return []
    # demjson.decode replaces the stricter json.loads here; it tolerates non-strict JSON
    big_dict = demjson.decode(response)
    items = list(zip(big_dict.get("SoftUrl"), big_dict.get("ResName"),
                     big_dict.get("ResVer"), big_dict.get("UpdateTime"),
                     big_dict.get("SmallImg")))
    return items
def get_app_list_elements(self, url="http://zhiyingyong.com/search") -> list: response = RqCompoent.post(url, {"apptitle": self.keyword}, {"Referer": "http://zhiyingyong.com"}) self.outer_response = response # print(response) selector = etree.HTML(response) lis = selector.xpath( "//div[@class='content-categoryCtn']/div[@class='content-categoryCtn-content clearfix']//div[@class='app-max']" ) # print(lis) return lis
def get_app_list_elements(self, url) -> list:
    add_headers = {"referer": "https://m.pp.cn/search.html"}
    self.add_headers = add_headers
    response = RqCompoent.post(url, data={"q": self.keyword, "page": "1"}, **add_headers)
    big_dict = json.loads(response)
    items = big_dict.get("data").get("content")
    return items
def get_app_list_elements(self, url) -> list:
    """
    Get the elements of the search-result list.
    :return: list of search-result elements
    """
    response = RqCompoent.get(url)
    if not response:
        return []
    self.outer_response = response
    selector = etree.HTML(response)
    lis = selector.xpath()  # TODO: the XPath expression is missing here
    return lis
def get_app_list_elements(
        self, url="https://www.52z.com/search?keyword=&s=1001&page=1&ajax=1") -> list:
    print(self.url)
    response = RqCompoent.get(self.url)
    selector = etree.HTML(response)
    lis = selector.xpath("body//li")
    return lis
def get_app_list_elements(
        self,
        url="https://s.pc6.com/cse/search?s=12026392560237532321&entry=1&ie=gbk&q="
) -> list:
    print(self.url)
    response = RqCompoent.get(self.url)
    selector = etree.HTML(response)
    lis = selector.xpath(
        "body[@id='search']/div[@id='mbody']/div[@id='scont']/dl[@id='result']//dt"
    )
    return lis
def get_app_list_elements(self, url) -> list:
    """
    Get the elements of the search-result list.
    :return: list of search-result elements
    """
    response = RqCompoent.get(url)
    big_dict = json.loads(response)
    print(big_dict)
    items = []  # TODO: extraction of the items from big_dict is not implemented yet
    return items
def get_download_url(self, inner_response):
    """Get the download url."""
    another_response = RqCompoent.get("https://www.52z.com/soft/downview?id=" + self.app_id)
    download_pat = r'http://(.*?)\.apk'
    matches = re.findall(download_pat, another_response, re.S)
    if matches:
        download_url = "http://" + matches[0] + ".apk"
    else:
        download_url = None
    return download_url
def get_download_url(self, inner_response, li):
    """Get the download url."""
    res = None
    try:
        res = RqCompoent.get(
            "http://zhushou.sogou.com/apps/download.html?appid={}".format(self.app_id))
    except Exception:
        pass
    if not res:
        return None
    j = json.loads(res)
    download_url = j.get("data").get("file_url")
    return download_url
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    if response:
        big_dict = json.loads(response)
        items = big_dict.get("data")
        return items
    return []
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    selector = etree.HTML(response)
    lis = selector.xpath('//ul[@class="app-list"]//li')
    return lis
def loop_request(self, lis, first_page=True, **kwargs):
    """Loop over the search-result elements and request each detail page."""
    if not lis:
        return
    for li in lis:
        enter_url = None
        try:
            enter_url = self.get_enter_url(li)  # detail page url
        except Exception:
            pass
        if not enter_url:
            continue
        inner_response = RqCompoent.get(enter_url, **self.add_headers)
        if inner_response:
            self.inner_response = inner_response
            self.li = li
            app_name = None
            img_address = None
            app_intro = None
            try:
                # get the app name first to compare against the keyword
                app_name = self.get_app_name(li)
            except Exception:
                pass
            app_name = self.judge_null(app_name)
            app_name = self.field_strip(app_name)
            try:
                img_address = self.get_img_address(li)
            except Exception:
                pass
            try:
                app_intro = self.get_app_intro(inner_response)
            except Exception:
                pass
            fields = self.parse_app_info_page(inner_response)
            to_sink = [self.name, app_name, *fields, img_address, app_intro]
            res = []
            for i in to_sink:
                if not i:
                    res.append(None)
                else:
                    res.append(pymysql.escape_string(i))
            try:
                print(*res[:-1], res[-1][:20])
            except Exception:
                pass
            sql = ("insert into spider_app(appStore, appName, version, updateTime, author, downloadUrl, "
                   "icon, introduction, inList, platform, insertTime, keyword, enter_url) "
                   "values(%s, %s, %s, %s, %s, %s, %s, %s, '否', '安卓', %s, %s, %s)")
            res = [*res,
                   pymysql.escape_string(time.strftime("%Y/%m/%d", time.localtime())),
                   pymysql.escape_string(self.keyword),
                   pymysql.escape_string(enter_url)]
            try:
                self.cursor.execute(sql, res)
            except Exception:
                pass
            sleep(self.delay_time)
    self.db.commit()
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    self.outer_response = response
    selector = etree.HTML(response)
    lis = selector.xpath("//div[@class='listCont']/ul//li")
    return lis
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    selector = etree.HTML(response)
    lis = selector.xpath("//div[@class='app_list border_three']/ul//li")
    return lis
def get_app_list_elements(self, url) -> list:
    response = RqCompoent.get(url)
    selector = etree.HTML(response)
    lis = selector.xpath("//div[@class='main-con']/div[@class='applist-wrap']/ul[@class='applist']//li")
    return lis
def temp_request(self, enter_url, method="get", data=None):
    """Issue a one-off request to a detail page, carrying the store's extra headers."""
    if data is None:
        data = {}
    if method == "get":
        inner_response = RqCompoent.get(enter_url, **self.add_headers)
    else:
        inner_response = RqCompoent.post(enter_url, data, **self.add_headers)
    return inner_response
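# Every snippet above calls RqCompoent.get / RqCompoent.post, but the helper itself is not
# shown in this file. Below is a minimal sketch of what such a wrapper could look like,
# assuming it is a thin requests-based class that returns the decoded body on success and
# None on failure; the default User-Agent and timeout are illustrative assumptions, not the
# project's actual values.
import requests


class RqCompoent:
    """Hypothetical request wrapper matching the call sites above."""

    default_headers = {"User-Agent": "Mozilla/5.0"}

    @classmethod
    def get(cls, url, headers=None, **extra_headers):
        # Return the response text, or None when the request fails.
        merged = {**cls.default_headers, **(headers or {}), **extra_headers}
        try:
            resp = requests.get(url, headers=merged, timeout=10)
            resp.encoding = resp.apparent_encoding
            return resp.text if resp.ok else None
        except requests.RequestException:
            return None

    @classmethod
    def post(cls, url, data=None, headers=None, **extra_headers):
        # Same contract as get(), with a form-encoded POST body.
        merged = {**cls.default_headers, **(headers or {}), **extra_headers}
        try:
            resp = requests.post(url, data=data or {}, headers=merged, timeout=10)
            resp.encoding = resp.apparent_encoding
            return resp.text if resp.ok else None
        except requests.RequestException:
            return None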