Exemplo n.º 1
0
 def get_detail(self, url, kid):
     """Download all template images for category *kid* starting at *url*.

     Creates ./templet_img/<kid>/ if needed, saves every listed image as
     <name>.png, then follows the "next page" link recursively until the
     current page reaches the total page count.
     """
     path = "./templet_img/%s" % kid
     # exist_ok replaces the original bare except (which also hid real
     # OS errors such as permission failures).
     os.makedirs(path, exist_ok=True)
     response = request_url(url=url,
                            headers=HEADERS,
                            method='get',
                            proxy=True)
     time.sleep(1)  # throttle between page fetches
     html = etree.HTML(response.text)
     cur_page = html.xpath(
         "//div[@class='pager']/span/label[@name='curpage']/text()"
     )[0].strip()
     tot_page = html.xpath(
         "//div[@class='pager']/span/label[@name='totalpage']/text()"
     )[0].strip()
     # All template entries under this category on the current page.
     div_list = html.xpath("//dd[@class='temp_dl_dd']/div/div")
     for div in div_list:
         text = div.xpath("./div/div/text()")[0].strip()
         # Temporary workaround: this particular image never loads.
         if text == "菲硕集团":
             continue
         code_img = div.xpath("./img/@src")[0].strip()
         code_img = parse.urljoin(self.state_url, code_img)
         img = request_url(url=code_img,
                           headers=HEADERS,
                           method='get',
                           proxy=True)
         if img.content:
             time.sleep(0.5)
             # Persist the image under the category directory.
             with open(path + "/" + text + ".png", 'wb') as f:
                 f.write(img.content)
     # Pagination: resolve the "next page" link relative to this page.
     next_page = html.xpath(
         "//div[@class='pager']/a[@class='frame-btn page-next']/@href"
     )[0].strip()
     next_page = parse.urljoin(url, next_page)
     # Compare page numbers numerically: the original string comparison
     # terminated early because e.g. "9" >= "10" is True lexicographically.
     if int(cur_page) < int(tot_page):
         self.get_detail(next_page, kid)
Exemplo n.º 2
0
 def get_info(self, path, url):
     """Fetch a case-listing page and save each case image into *path*.

     File names come from the case title (<h2> text); image URLs are
     resolved against self.parse_url.
     """
     time.sleep(1)  # throttle before fetching the listing page
     response = request_url(url=url,
                            headers=HEADERS,
                            method='get',
                            proxy=True)
     html = etree.HTML(response.content.decode("utf-8"))
     for item in html.xpath("//ul[@class='clearfix caselist']/li"):
         raw_src = item.xpath("./div[@class='casema']/img/@src")[0].strip()
         img_url = parse.urljoin(self.parse_url, raw_src)
         title = item.xpath("./div[@class='p15']/h2/text()")[0].strip()
         time.sleep(0.5)  # throttle between image downloads
         img_resp = request_url(url=img_url,
                                headers=HEADERS,
                                method='get',
                                proxy=True)
         with open(path + "/" + title + ".jpg", "wb") as f:
             f.write(img_resp.content)
Exemplo n.º 3
0
 def save_msg(self, td, si, tl, path):
     """Persist one module's detail text, detail image and slider images.

     td   -- detail-info dict, written as text to <path>/info.txt
     si   -- URL of the detail image, saved as <path>/info.png
     tl   -- list of slider-image URLs, each saved under its URL basename
     path -- target directory (assumed to exist already)
     """
     # Detail text.
     with open(path + "/" + "info.txt", "w", encoding="utf-8") as f:
         f.write(str(td))
     # Detail image.
     time.sleep(0.5)
     detail_resp = request_url(url=si,
                               headers=HEADERS,
                               method='get',
                               proxy=True)
     with open(path + "/" + "info.png", "wb") as f:
         f.write(detail_resp.content)
     # Slider images: the file name is the last path segment of the URL;
     # skip URLs that end in "/" (empty basename).
     for src_url in tl:
         time.sleep(0.5)
         name = re.split("/", src_url)[-1]
         if not name:
             continue
         slide_resp = request_url(url=src_url,
                                  headers=HEADERS,
                                  method='get',
                                  proxy=True)
         with open(path + "/" + name, "wb") as f:
             f.write(slide_resp.content)
Exemplo n.º 4
0
def index_top():
    """Scrape the site's top navigation bar.

    Returns a dict mapping each menu title found in TOP_MENU (stripped)
    to its absolute URL, resolved against state_url.
    """
    response = request_url(url=state_url,
                           headers=HEADERS,
                           method='get',
                           proxy=True)
    html = etree.HTML(response.text)
    menu = {}
    for text in html.xpath("//ul[@class='top_nav']/li/a/text()"):
        if text not in TOP_MENU:
            continue
        # Look the href up by the exact (unstripped) link text.
        href = html.xpath(
            "//ul[@class='top_nav']/li/a[text()='%s']/@href" % text)[0]
        menu[text.strip()] = parse.urljoin(state_url, href)
    return menu
Exemplo n.º 5
0
 def get_menu_href(self):
     """Yield {"url", "text"} dicts for each category-menu entry.

     Only entries whose stripped text appears in NAV_MENU are yielded;
     URLs are resolved against self.it.
     """
     response = request_url(url=self.it,
                            headers=HEADERS,
                            method="get",
                            proxy=True)
     html = etree.HTML(response.text)
     # Build the membership set once — the original rebuilt
     # list(set(NAV_MENU)) on every loop iteration.
     wanted = set(NAV_MENU)
     for text in html.xpath("//ul[@class='temp_dl_dt_ul']/a//text()"):
         if text.strip() not in wanted:
             continue
         nav_href = html.xpath(
             "//ul[@class='temp_dl_dt_ul']/a/li[text()='%s']/../@href" %
             text.strip())[0].strip()
         # NOTE(review): "text" is yielded unstripped while the lookup
         # strips it — presumably consumers strip again; confirm.
         yield {
             "url": parse.urljoin(self.it, nav_href),
             "text": text,
         }
Exemplo n.º 6
0
 def get_top_nav(self):
     """Walk the case-page top navigation.

     For each nav entry listed in MFCASENAV, create ./case/<name>/ and
     descend into the category via get_kind_nav().
     """
     time.sleep(random.randint(1, 2))  # polite random delay
     response = request_url(url=self.mfcase_index,
                            headers=HEADERS,
                            method='get',
                            proxy=True)
     html = etree.HTML(response.content.decode("utf-8"))
     for i in html.xpath("//div[@class='casebox']/div[1]/a/text()"):
         if i.strip() not in MFCASENAV:
             continue
         path = "./case/" + i.strip()
         # exist_ok replaces the original bare except around os.mkdir,
         # which also swallowed unrelated OS errors.
         os.makedirs(path, exist_ok=True)
         # The href lookup deliberately uses the unstripped text, since
         # xpath text()= must match the node's exact content.
         href = html.xpath(
             "//div[@class='casebox']/div[1]/a[text()='%s']/@href" %
             i)[0].strip()
         href = parse.urljoin(self.parse_url, href)
         self.get_kind_nav(path, href)
Exemplo n.º 7
0
 def get_kind_nav(self, path, href):
     """Scrape one category page.

     If the page has a sub-navigation block, create one directory per
     sub-category (tag) and scrape each tag's listing; otherwise scrape
     the page itself directly into *path*.
     """
     time.sleep(random.randint(1, 2))  # polite random delay
     response = request_url(url=href,
                            headers=HEADERS,
                            method='get',
                            proxy=True)
     html = etree.HTML(response.content.decode("utf-8"))
     if not html.xpath("//div[@class='caseSubNav']"):
         # No sub-navigation: the page itself is the listing.
         self.get_info(path, href)
         return
     # Skip the first link ("all") and strip the trailing "(count)"
     # suffix from each sub-category name. Raw string fixes the invalid
     # escape sequences in the original pattern.
     for a in html.xpath("//div[@class='caseSubNav']/div/a")[1:]:
         text = re.sub(r"\(\d{1,3}\)", "",
                       a.xpath("./text()")[0].strip())
         kpath = path + "/" + text
         # exist_ok replaces the original bare except around os.mkdir.
         os.makedirs(kpath, exist_ok=True)
         url = "http://www.duoguan.com/e/tags/?tagname=" + text
         self.get_info(kpath, url)
Exemplo n.º 8
0
 def get_menu(self):
     """Scrape the module menu.

     For each kind found in HOMENAVMIN, create ./template/<kind>/ and
     scrape its module entries via get_module_info(). The "商家助手"
     kind is deliberately skipped (directory is still created, matching
     the original behavior).
     """
     time.sleep(random.randint(1, 2))  # polite random delay
     response = request_url(url=self.module_menu,
                            headers=HEADERS,
                            method='get',
                            proxy=True)
     html = etree.HTML(response.content.decode("utf-8"))
     for div in html.xpath("//div[@class='ModuleMin']/div[position()>1]"):
         for kind in div.xpath("./h3/text()"):
             if kind not in HOMENAVMIN:
                 continue
             path = "./template/%s" % kind.strip()
             # exist_ok replaces the original bare except around os.mkdir.
             os.makedirs(path, exist_ok=True)
             if kind == "商家助手":
                 continue
             dd_list = div.xpath(
                 "//h3[text()='%s']/following-sibling::*[1]/dd" %
                 kind)
             self.get_module_info(dd_list, path)
Exemplo n.º 9
0
 def _first_text(self, html, xp):
     """Return the first match of xpath *xp* in *html*, stripped, or ""."""
     hits = html.xpath(xp)
     return hits[0].strip() if hits else ""

 def get_module_info(self, ddl, path):
     """Scrape each module entry and hand its data to save_msg().

     For every <dd> in *ddl*: create <path>/<module name>/, fetch the
     module's detail page, extract the three description fields (empty
     string when missing), locate the detail image (its position varies
     per page), collect the slider-image URLs from inline CSS, then call
     save_msg() with everything.

     ddl  -- list of <dd> elements from the module menu page
     path -- parent directory for the per-module sub-directories
     """
     # The detail image lives at a slightly different spot on each page;
     # try these locations in order (same order as the original nested
     # try/except pyramid).
     info_img_xpaths = (
         "//div[@class='Details-info']/p[2]/span/img/@src",
         "//div[@class='Details-info']/p[last()]/span/img/@src",
         "//div[@class='Details-info']/p[2]/img/@src",
         "//div[@class='Details-info']/p[last()]/img/@src",
         "//div[@class='Details-info']/div[last()]/img/@src",
     )
     for dd in ddl:
         temp_dict = {}
         text = dd.xpath("./a/text()")[0].strip()
         sub_path = path + "/" + text
         # exist_ok replaces the original bare except around os.mkdir.
         os.makedirs(sub_path, exist_ok=True)
         href = dd.xpath("./a/@href")[0].strip()
         href = parse.urljoin(self.module_menu, href)
         time.sleep(random.randint(1, 2))  # polite random delay
         response = request_url(url=href,
                                headers=HEADERS,
                                method='get',
                                proxy=True)
         html = etree.HTML(response.content.decode("utf-8"))
         # Description fields default to "" when the node is missing,
         # matching the original try/except defaults.
         temp_dict["详情描述"] = self._first_text(
             html, "//div[@class='right']/p/text()")
         temp_dict["应用场景"] = self._first_text(
             html, "//div[@class='right']/div[1]/div[1]/span/text()")
         temp_dict["模块搭配"] = self._first_text(
             html, "//div[@class='right']/div[1]/div[2]/span/text()")
         # First matching detail-image location wins; fail loudly if
         # none match, like the original's uncaught innermost lookup.
         for xp in info_img_xpaths:
             hits = html.xpath(xp)
             if hits:
                 src_info = hits[0].strip()
                 break
         else:
             raise IndexError("detail image not found on %s" % href)
         src_info = parse.urljoin(self.module_menu, src_info)
         # Slider image URLs are embedded in inline background-image CSS.
         temp_list = []
         for style in html.xpath(
                 "//div[@class='swiper-container']/div[1]/div/a/@style"):
             src = re.search(r"background-image: url\((.*)\)",
                             style).group(1)
             temp_list.append(parse.urljoin(self.module_menu, src))
         self.save_msg(temp_dict, src_info, temp_list, sub_path)