Пример #1
0
 def get_app_list_elements(self, url) -> list:
     """Return the ``<li>`` result nodes found on the page at ``url``.

     The page is fetched with ``RqCompoent.get``. A falsy response yields
     ``None``; otherwise the response is parsed as HTML and every ``<li>``
     under the ``app-list`` ``<ul>`` is returned.
     """
     page = RqCompoent.get(url)
     if page:
         tree = etree.HTML(page)
         return tree.xpath("//div[@class='main']/div[@id='left']/ul[@class='app-list']//li")
     return None
Пример #2
0
    def loop_request(self, lis, first_page=True, **kwargs):
        """Request the detail page of each result entry and print its fields.

        Each entry is indexed as a pair; entries whose second element has no
        ``div/p[@class='info']`` node are skipped, as are entries whose
        detail request returns a falsy response. ``first_page`` and
        ``kwargs`` are accepted but not used here.
        """
        for entry in lis:
            info_nodes = entry[1].xpath("div/p[@class='info']")
            if not info_nodes:
                continue

            detail_url = self.get_enter_url(entry)  # detail-page URL
            page = RqCompoent.get(detail_url, **self.add_headers)
            if not page:
                continue

            # Keep the current page and entry on the instance.
            self.inner_response = page
            self.li = entry

            name = self.field_strip(self.judge_null(self.get_app_name(entry)))
            icon = self.get_img_address(entry)
            intro = self.get_app_intro(page)
            details = self.parse_app_info_page(page)

            row = [self.name, name, *details, icon, intro]
            *leading, last = row
            # Only the first 20 characters of the introduction are printed.
            print(*leading, last[:20])
            sleep(self.delay_time)
Пример #3
0
 def get_app_list_elements(self, url) -> list:
     """POST to ``self.url`` and return the result boxes from the JSON reply.

     The ``url`` argument is not used; the request always targets
     ``self.url``. The reply is decoded as JSON, every ``"=/"`` in its
     ``"html"`` entry is replaced by ``"="``, and the resulting markup is
     queried for ``cp-box clearfix`` ``<div>`` elements.
     """
     raw = RqCompoent.post(self.url, data={})
     payload = json.loads(raw)
     markup = payload.get("html")
     markup = markup.replace("=/", "=")
     return etree.HTML(markup).xpath("//div[@class='cp-box clearfix']")
Пример #4
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return every ``<div>`` inside the ``m-solist`` box.

     The raw response is stored on ``self.outer_response`` before parsing.
     """
     self.outer_response = page = RqCompoent.get(url)
     tree = etree.HTML(page)
     return tree.xpath('//div[@class="m-solist"]//div')
Пример #5
0
    def loop_request(self, lis, first_page=True, **kwargs):
        """Request the detail page of each result entry and print its fields.

        Entries whose detail URL does not end in ``.html`` are skipped, as
        are entries whose detail request returns a falsy response.
        ``first_page`` and ``kwargs`` are accepted but not used here.
        """
        for entry in lis:
            detail_url = self.get_enter_url(entry)  # detail-page URL
            if not detail_url.endswith(".html"):
                continue

            page = RqCompoent.get(detail_url, **self.add_headers)
            if not page:
                continue

            # Keep the current page and entry on the instance.
            self.inner_response = page
            self.li = entry

            name = self.field_strip(self.judge_null(self.get_app_name(entry)))
            icon = self.get_img_address(entry)
            intro = self.get_app_intro(page)
            details = self.parse_app_info_page(page)

            row = [self.name, name, *details, icon, intro]
            *leading, last = row
            # Only the first 20 characters of the introduction are printed.
            print(*leading, last[:20])
Пример #6
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``self.url`` and return the ``<dl>`` nodes of the result box.

     The ``url`` argument is not used; the request goes to ``self.url``.
     The raw response is kept on ``self.outer_response``.
     """
     self.outer_response = page = RqCompoent.get(self.url)
     return etree.HTML(page).xpath("//div[@class='result_box']//dl")
Пример #7
0
 def get_app_list_elements(self, url="http://s.5577.com") -> list:
     """Fetch ``self.url``, print and return the ``m-cont-list`` divs.

     The ``url`` argument is not used; the request goes to ``self.url``.
     The raw response is kept on ``self.outer_response``.
     """
     self.outer_response = page = RqCompoent.get(self.url)
     nodes = etree.HTML(page).xpath(
         "//div[@class='g-left f-fl']//div[@class='m-cont-list']")
     print(nodes)
     return nodes
Пример #8
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return ``(dt, dd)`` pairs from the result list.

     The raw response is kept on ``self.outer_response``. The ``<dt>`` and
     ``<dd>`` nodes under ``dl#result`` are collected separately and paired
     positionally.
     """
     self.outer_response = page = RqCompoent.get(url)
     tree = etree.HTML(page)
     dt_nodes = tree.xpath('//dl[@id="result"]//dt')
     dd_nodes = tree.xpath('//dl[@id="result"]//dd')
     return [*zip(dt_nodes, dd_nodes)]
Пример #9
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``<li>`` nodes of the ``SeaCon`` list."""
     tree = etree.HTML(RqCompoent.get(url))
     return tree.xpath("//div[@class='main']/div[@class='SeaCon']/ul//li")
Пример #10
0
 def get_app_list_elements(self, url) -> list:
     """Fetch a JSONP search response and return its ``"data"`` entry.

     The response body is expected to wrap a JSON document in a call to the
     fixed jQuery callback name below. The JSON is extracted, decoded, and
     its ``"data"`` value returned.

     :param url: address of the JSONP endpoint.
     :return: the decoded ``"data"`` value, or an empty list when the
         request returned nothing or the callback wrapper was not found.
     """
     response = RqCompoent.get(url)
     if not response:
         return []
     # Raw string: "\(" in a normal literal is an invalid escape sequence.
     wrapped = re.findall(
         r"jQuery191018405249964393477_1594619120728\((.*?)\);", response,
         re.S)
     if not wrapped:
         # Callback wrapper missing: nothing to decode.
         return []
     big_dict = json.loads(wrapped[0])
     return big_dict.get("data")
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``MlistC`` items plus one ``MlistD`` item.

     All ``<li>`` nodes under ``ul.MlistC`` are returned; when ``ul.MlistD``
     has any ``<li>`` nodes, the first of them is appended to the result.
     """
     tree = etree.HTML(RqCompoent.get(url))
     found = tree.xpath("//ul[@class='MlistC']//li")
     spare = tree.xpath("//ul[@class='MlistD']//li")
     # Adds spare[0] when present, nothing otherwise.
     found.extend(spare[:1])
     return found
Пример #12
0
    def run():
        """Search Wandoujia for a fixed keyword and print each app's details.

        A Chrome session opens the site, submits the keyword, clicks the
        "view more" link and captures the page source. The browser is always
        closed afterwards. The detail links found in the captured source are
        then fetched with ``RqCompoent.get`` and parsed one by one.
        """
        keyword = "支付"
        driver = webdriver.Chrome()
        try:
            driver.get("https://www.wandoujia.com/")
            driver.maximize_window()
            driver.implicitly_wait(6)

            search_input = driver.find_element_by_xpath(
                '//input[@class="key-ipt"]')
            search_input.clear()
            search_input.send_keys(keyword)
            search_input.send_keys(Keys.ENTER)
            driver.implicitly_wait(6)

            # Move to the "view more" link, then click it.
            move_to = driver.find_element_by_partial_link_text("查看更多")
            ActionChains(driver).move_to_element(move_to).perform()
            move_to.click()
            time.sleep(2)
            driver.implicitly_wait(2)

            page_source = driver.page_source
        finally:
            # Always release the browser, even when a step above raises.
            driver.quit()

        selector = etree.HTML(page_source)
        enter_urls = selector.xpath('//h2[@class="app-title-h2"]/a/@href')
        print(enter_urls)

        for enter_url in enter_urls:
            response = RqCompoent.get(enter_url)
            if not response:
                # Nothing to parse for this app; go on with the next link.
                continue
            selector = etree.HTML(response)
            app_name = judge_null(
                selector.xpath('//span[@class="title"]/text()'))
            download_url = judge_null(
                selector.xpath('//div[@class="download-wp"]/a[2]/@href'))
            update_time = judge_null(
                selector.xpath('//span[@class="update-time"]/text()'))
            # Keep the part after the first ":"; leave the text untouched
            # when the separator is missing instead of raising IndexError.
            if update_time and ":" in update_time:
                update_time = update_time.split(":")[1].strip()
            version = judge_null(
                re.findall("版本</dt><dd>(.*?)<", response, re.S))
            # Same guard for the ";" separator in the version text.
            if version and ";" in version:
                version = version.split(";")[1].strip()
            author = judge_null(
                re.findall("开发者</dt><dd><.*?>([\u4E00-\u9FA5]+)<", response,
                           re.S))
            img = judge_null(
                selector.xpath('//div[@class="app-icon"]/img/@src'))
            intro = judge_null(
                selector.xpath('string(//div[@class="desc-info"]/div/div)'))
            print(app_name, download_url, update_time, version, author, img,
                  intro)

            # Same values reordered into one row; it is built but not yet
            # stored or returned anywhere in this function.
            row = [
                "豌豆荚", app_name, version, update_time, author, download_url,
                img, intro
            ]
Пример #13
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``result f s0`` divs of the result pane.

     The raw response is kept on ``self.outer_response``.
     """
     self.outer_response = page = RqCompoent.get(url)
     tree = etree.HTML(page)
     return tree.xpath(
         "//div[@id='center']/div[@id='results']//div[@class='result f s0']")
Пример #14
0
 def loop_request(self, items, first_page=True):
     """Crawl each item's detail page and insert one row per app.

     For every item the detail URL is resolved and fetched; items without a
     URL or without a response are skipped. The remaining fields are
     collected best-effort (a failing getter leaves its field as ``None``),
     printed, and inserted into ``spider_app``. The transaction is committed
     once after the loop.

     :param items: result entries to process; a falsy value is a no-op.
     :param first_page: accepted but not used here.
     """
     if not items:
         return
     for item in items:
         # Reset per item: without this a failing lookup either hits an
         # unbound name (first item) or reuses the previous item's URL.
         enter_url = None
         try:
             enter_url = self.get_enter_url(item)  # detail-page URL
         except Exception:
             pass
         if not enter_url:
             continue

         try:
             inner_response = RqCompoent.get(enter_url, **self.add_headers)
         except Exception:
             inner_response = None
         if not inner_response:
             continue

         self.item = item
         self.inner_response = inner_response

         app_name = None
         img_address = None
         app_intro = None
         # "except Exception" (not a bare except) keeps Ctrl-C working.
         try:
             app_name = self.get_app_name(item)
         except Exception:
             pass
         app_name = self.field_strip(app_name)
         try:
             img_address = self.get_img_address(item)
         except Exception:
             pass
         try:
             app_intro = self.get_app_intro(inner_response)
         except Exception:
             pass

         fields = self.parse_app_info_page(inner_response, item)
         to_sink = [self.name, app_name, *fields, img_address, app_intro]
         # NOTE(review): values are escaped here and then also passed as
         # query parameters to execute(); confirm both steps are wanted.
         res = [pymysql.escape_string(i) if i else None for i in to_sink]
         try:
             print(*res[:-1], res[-1][:20])
         except Exception:
             # e.g. the introduction is None and cannot be sliced.
             pass

         sql = "insert into spider_app(appStore, appName, version, updateTime, author,downloadUrl,icon, introduction, inList, platform, insertTime, keyword, enter_url) values(%s, %s, %s, %s, %s, %s, %s, %s, '否', '安卓', %s, %s, %s)"
         res = [
             *res,
             pymysql.escape_string(time.strftime("%Y/%m/%d", time.localtime())),
             pymysql.escape_string(self.keyword),
             pymysql.escape_string(enter_url),
         ]
         try:
             self.cursor.execute(sql, res)
         except Exception:
             # Best effort: one failed insert must not abort the crawl.
             pass
         sleep(self.delay_time)
     self.db.commit()
Пример #15
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return per-app tuples built from parallel columns.

     A falsy response yields a list of four ``None`` values. Otherwise the
     response is decoded with ``demjson`` and the ``SoftUrl``, ``ResName``,
     ``ResVer``, ``UpdateTime`` and ``SmallImg`` entries are zipped together
     positionally.
     """
     response = RqCompoent.get(url)
     if not response:
         return [None] * 4
     decoded = demjson.decode(response)
     column_keys = ("SoftUrl", "ResName", "ResVer", "UpdateTime", "SmallImg")
     columns = [decoded.get(key) for key in column_keys]
     return list(zip(*columns))
Пример #16
0
 def get_app_list_elements(self, url="http://zhiyingyong.com/search") -> list:
     """POST the keyword to ``url`` and return the ``app-max`` result divs.

     ``{"apptitle": self.keyword}`` and a ``Referer`` mapping are passed to
     ``RqCompoent.post`` as its second and third positional arguments. The
     raw response is kept on ``self.outer_response``.
     """
     form = {"apptitle": self.keyword}
     referer = {"Referer": "http://zhiyingyong.com"}
     self.outer_response = page = RqCompoent.post(url, form, referer)
     tree = etree.HTML(page)
     return tree.xpath(
         "//div[@class='content-categoryCtn']/div[@class='content-categoryCtn-content clearfix']//div[@class='app-max']")
Пример #17
0
 def get_app_list_elements(self, url) -> list:
     """POST the keyword to ``url`` and return ``data.content`` of the reply.

     A ``referer`` header mapping is stored on ``self.add_headers`` and also
     expanded into the request as keyword arguments. The reply is decoded
     as JSON.
     """
     self.add_headers = headers = {"referer": "https://m.pp.cn/search.html"}
     form = {"q": self.keyword, "page": "1"}
     reply = RqCompoent.post(url, data=form, **headers)
     payload = json.loads(reply)
     return payload.get("data").get("content")
Пример #18
0
 def get_app_list_elements(self, url) -> list:
     """
     Fetch the search-result page and query it for the result elements.

     :return: ``"error"`` when the response is falsy, otherwise the result
         of calling ``xpath()`` on the parsed page.
     """
     page = RqCompoent.get(url)
     if not page:
         return "error"
     self.outer_response = page
     tree = etree.HTML(page)
     # NOTE(review): xpath() is called without an expression, exactly as in
     # the original; this looks like a template still to be filled in.
     return tree.xpath()
Пример #19
0
    def get_app_list_elements(
        self,
        url="https://www.52z.com/search?keyword=&s=1001&page=1&ajax=1"
    ) -> list:
        """Print and fetch ``self.url``, returning the ``<li>`` nodes found.

        The ``url`` argument is not used; the request goes to ``self.url``.
        """
        target = self.url
        print(target)
        tree = etree.HTML(RqCompoent.get(target))
        return tree.xpath("body//li")
Пример #20
0
    def get_app_list_elements(
        self,
        url="https://s.pc6.com/cse/search?s=12026392560237532321&entry=1&ie=gbk&q="
    ) -> list:
        """Print and fetch ``self.url``, returning the ``<dt>`` result nodes.

        The ``url`` argument is not used; the request goes to ``self.url``.
        """
        target = self.url
        print(target)
        tree = etree.HTML(RqCompoent.get(target))
        return tree.xpath(
            "body[@id='search']/div[@id='mbody']/div[@id='scont']/dl[@id='result']//dt")
Пример #21
0
    def get_app_list_elements(self, url) -> list:
        """
        Fetch ``url``, decode the response as JSON and print it.

        :return: always an empty list; no elements are extracted here.
        """
        decoded = json.loads(RqCompoent.get(url))
        print(decoded)
        return []
Пример #22
0
 def get_download_url(self, inner_response):
     """Return the first ``.apk`` link from the app's download view.

     The download view for ``self.app_id`` is fetched and searched for the
     first ``http://….apk`` address.

     :param inner_response: accepted but not used here.
     :return: the matched URL as a string, or an empty list when the
         request returned nothing or no link was found (the empty-list
         fallback is kept from the original).
     """
     another_response = RqCompoent.get(
         "https://www.52z.com/soft/downview?id=" + self.app_id)
     if not another_response:
         return []
     # Match the whole URL directly. The original rebuilt it from the
     # captured middle part but indexed that string with [0] a second time,
     # so only its first character ended up in the result.
     found = re.search(r'http://.*?\.apk', another_response, re.S)
     if found:
         return found.group(0)
     return []
Пример #23
0
 def get_download_url(self, inner_response, li):
     """Return the ``file_url`` reported by the download endpoint, if any.

     The endpoint is queried with ``self.app_id`` and its reply decoded as
     JSON; the URL is read from ``data.file_url``.

     :param inner_response: accepted but not used here.
     :param li: accepted but not used here.
     :return: the download URL, or ``None`` when the request fails, the
         reply is not valid JSON, or it carries no ``data`` object.
     """
     res = None
     try:
         res = RqCompoent.get("http://zhushou.sogou.com/apps/download.html?appid={}".format(self.app_id))
     except Exception:
         # Best effort; "except Exception" keeps Ctrl-C working.
         pass
     if not res:
         return None
     try:
         payload = json.loads(res)
     except ValueError:
         # Not JSON (e.g. an HTML error page): no URL available.
         return None
     data = payload.get("data") if isinstance(payload, dict) else None
     if not isinstance(data, dict):
         return None
     return data.get("file_url")
Пример #24
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``"data"`` entry of its JSON body.

     Returns ``None`` when the response is falsy.
     """
     response = RqCompoent.get(url)
     if not response:
         return None
     return json.loads(response).get("data")
Пример #25
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``<li>`` nodes of the ``app-list`` list."""
     tree = etree.HTML(RqCompoent.get(url))
     return tree.xpath('//ul[@class="app-list"]//li')
Пример #26
0
    def loop_request(self, lis, first_page=True, **kwargs):
        """Crawl each entry's detail page and insert one row per app.

        For every entry the detail URL is resolved and fetched; entries
        without a URL or without a response are skipped. The remaining
        fields are collected best-effort (a failing getter leaves its field
        as ``None``), printed, and inserted into ``spider_app``. The
        transaction is committed once after the loop.

        :param lis: result entries to process; a falsy value is a no-op.
        :param first_page: accepted but not used here.
        :param kwargs: accepted but not used here.
        """
        if not lis:
            return
        for li in lis:
            enter_url = None
            try:
                enter_url = self.get_enter_url(li)  # detail-page URL
            except Exception:
                pass
            if not enter_url:
                # No URL: do not issue a request (or later escape) with None.
                continue

            inner_response = RqCompoent.get(enter_url, **self.add_headers)
            if not inner_response:
                continue

            self.inner_response = inner_response
            self.li = li

            app_name = None
            img_address = None
            app_intro = None
            # "except Exception" (not a bare except) keeps Ctrl-C working.
            try:
                app_name = self.get_app_name(li)
            except Exception:
                pass
            app_name = self.judge_null(app_name)
            app_name = self.field_strip(app_name)
            try:
                img_address = self.get_img_address(li)
            except Exception:
                pass
            try:
                app_intro = self.get_app_intro(inner_response)
            except Exception:
                pass

            fields = self.parse_app_info_page(inner_response)
            to_sink = [
                self.name, app_name, *fields, img_address, app_intro
            ]
            # NOTE(review): values are escaped here and then also passed as
            # query parameters to execute(); confirm both steps are wanted.
            res = [pymysql.escape_string(i) if i else None for i in to_sink]
            try:
                print(*res[:-1], res[-1][:20])
            except Exception:
                # e.g. the introduction is None and cannot be sliced.
                pass

            sql = "insert into spider_app(appStore, appName, version, updateTime, author,downloadUrl,icon, introduction, inList, platform, insertTime, keyword, enter_url) values(%s, %s, %s, %s, %s, %s, %s, %s, '否', '安卓', %s, %s, %s)"
            res = [
                *res,
                pymysql.escape_string(
                    time.strftime("%Y/%m/%d", time.localtime())),
                pymysql.escape_string(self.keyword),
                pymysql.escape_string(enter_url)
            ]
            try:
                self.cursor.execute(sql, res)
            except Exception:
                # Best effort: one failed insert must not abort the crawl.
                pass
            sleep(self.delay_time)
        self.db.commit()
Пример #27
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``<li>`` nodes of the ``listCont`` list.

     The raw response is kept on ``self.outer_response``.
     """
     self.outer_response = page = RqCompoent.get(url)
     return etree.HTML(page).xpath("//div[@class='listCont']/ul//li")
Пример #28
0
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``<li>`` nodes of the bordered app list."""
     tree = etree.HTML(RqCompoent.get(url))
     return tree.xpath("//div[@class='app_list border_three']/ul//li")
 def get_app_list_elements(self, url) -> list:
     """Fetch ``url`` and return the ``<li>`` nodes of the ``applist`` list."""
     tree = etree.HTML(RqCompoent.get(url))
     return tree.xpath(
         "//div[@class='main-con']/div[@class='applist-wrap']/ul[@class='applist']//li")
Пример #30
0
 def temp_request(self, enter_url, method="get", data={}):
     if method == "get":
         inner_response = RqCompoent.get(enter_url, **self.add_headers)
     else:
         inner_response = RqCompoent.post(
             enter_url, data, **self.add_headers)