Exemplo n.º 1
0
 def url_parse(self,parsing_url):
     """Fetch *parsing_url* and return it parsed as a BeautifulSoup tree.

     The page body is decoded as 'gbk' (the target site's encoding —
     TODO confirm) and parsed with the 'lxml' backend.

     Bug fix: the original called ``request.urlopen(parsing_url)`` with the
     bare URL, discarding the prepared ``Request`` and therefore the
     user-agent header it had just set. Open ``req`` instead, consistent
     with the sibling ``url_parse_*`` methods.
     """
     req = request.Request(parsing_url)
     req.add_header('user-agent', get_ua())
     with request.urlopen(req) as f:
         data = f.read().decode('gbk')
         soup = BeautifulSoup(data, 'lxml')
         return soup
Exemplo n.º 2
0
 def url_parse_topic(self,url_topic):
     """Fetch *url_topic* and return the first regex capture from every
     anchor whose href matches ``self.pattern_topic``.
     """
     topic_req = request.Request(url_topic)
     topic_req.add_header('user-agent', get_ua())
     with request.urlopen(topic_req) as page:
         html = page.read().decode('gbk')
         dom = BeautifulSoup(html, 'lxml')
         anchors = dom.find_all('a', href=self.pattern_topic)
         # keep only the first match found in each anchor's markup
         return [re.findall(self.pattern_topic, str(anchor))[0]
                 for anchor in anchors]
Exemplo n.º 3
0
def url_img(url):
    """Scrape anchors matching ``pattern_img`` from *url* and dump the
    extracted ids to ``img_url.txt`` (overwriting any previous file).
    """
    page_req = request.Request(url)
    page_req.add_header('user-agent', get_ua())
    with request.urlopen(page_req) as resp:
        html = resp.read().decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        # first capture from each matching <a> tag's markup
        matches = [re.findall(pattern_img, str(anchor))[0]
                   for anchor in soup.find_all('a', href=pattern_img)]
        with open('img_url.txt', 'w') as out:
            out.write(str(matches))
Exemplo n.º 4
0
 def url_parse_img(self,url):
     """Fetch *url* and return the ids captured by ``self.pattern_img``
     from every matching anchor on the page.
     """
     img_req = request.Request(url)
     print(url)  # progress trace: show which page is being fetched
     img_req.add_header('user-agent', get_ua())
     with request.urlopen(img_req) as resp:
         page_html = resp.read().decode('gbk')
         dom = BeautifulSoup(page_html, 'lxml')
         return [re.findall(self.pattern_img, str(anchor))[0]
                 for anchor in dom.find_all('a', href=self.pattern_img)]
Exemplo n.º 5
0
def url_parse_topic(url):
    """Scrape topic ids (anchors matching ``pattern_topic``) from *url*
    and write the resulting list to ``jieguo.txt``.
    """
    topic_req = request.Request(url)
    topic_req.add_header("user-agent", get_ua())
    with request.urlopen(topic_req) as resp:
        body = resp.read().decode("gbk")
        dom = BeautifulSoup(body, "lxml")
        # first capture from each matching anchor's markup
        topic_ids = [re.findall(pattern_topic, str(anchor))[0]
                     for anchor in dom.find_all("a", href=pattern_topic)]
        with open("jieguo.txt", "w") as out:
            out.write(str(topic_ids))