def url_parse(self, parsing_url):
    # Fetch a (GBK-encoded) page and return it as a BeautifulSoup tree.
    # Relies on: from urllib import request; from bs4 import BeautifulSoup;
    # and a get_ua() helper that returns a User-Agent string.
    req = request.Request(parsing_url)
    req.add_header('user-agent', get_ua())
    with request.urlopen(req) as f:  # open the Request object so the UA header is actually sent
        data = f.read().decode('gbk')
    soup = BeautifulSoup(data, 'lxml')
    return soup
def url_parse_topic(self, url_topic):
    # Collect topic links whose href matches self.pattern_topic and return them as a list.
    req = request.Request(url_topic)
    req.add_header('user-agent', get_ua())
    with request.urlopen(req) as fpage:
        data = fpage.read().decode('gbk')
    soup_topic = BeautifulSoup(data, 'lxml')
    topic_url = soup_topic.find_all('a', href=self.pattern_topic)
    topic_m = []
    for item in topic_url:
        m1 = re.findall(self.pattern_topic, str(item))
        topic_m.append(m1[0])
    return topic_m
def url_img(url):
    # Standalone helper: collect links matching the module-level pattern_img
    # and dump the list to img_url.txt.
    req = request.Request(url)
    req.add_header('user-agent', get_ua())
    with request.urlopen(req) as imgpage:
        data = imgpage.read().decode('gbk')
    soup = BeautifulSoup(data, 'lxml')
    img_url = soup.find_all('a', href=pattern_img)
    img_m = []
    for item in img_url:
        m1 = re.findall(pattern_img, str(item))
        img_m.append(m1[0])
    with open('img_url.txt', 'w') as fimg:
        fimg.write(str(img_m))
def url_parse_img(self, url):
    # Collect image links on a topic page that match self.pattern_img.
    req = request.Request(url)
    print(url)  # progress/debug output
    req.add_header('user-agent', get_ua())
    with request.urlopen(req) as imgpage:
        data1 = imgpage.read().decode('gbk')
    soup_img = BeautifulSoup(data1, 'lxml')
    img_url = soup_img.find_all('a', href=self.pattern_img)
    img_m = []
    for item in img_url:
        m2 = re.findall(self.pattern_img, str(item))
        img_m.append(m2[0])
    return img_m
def url_parse_topic(url):
    # Standalone variant of url_parse_topic: collect links matching the
    # module-level pattern_topic and write the list to jieguo.txt.
    req = request.Request(url)
    req.add_header("user-agent", get_ua())
    with request.urlopen(req) as fpage:
        data = fpage.read().decode("gbk")
    soup = BeautifulSoup(data, "lxml")
    topic_url = soup.find_all("a", href=pattern_topic)
    topic_m = []
    for item in topic_url:
        m1 = re.findall(pattern_topic, str(item))
        topic_m.append(m1[0])
    with open("jieguo.txt", "w") as f:
        f.write(str(topic_m))
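
# A minimal usage sketch, not part of the original source: it assumes the methods
# above belong to a spider-like class carrying pattern_topic / pattern_img
# attributes, and supplies the imports and get_ua() helper they rely on.
# Names such as ImgSpider, BASE_URL and the example regexes are hypothetical;
# the real patterns depend on the target site.
import random
import re
from urllib import request

from bs4 import BeautifulSoup


def get_ua():
    # Return a random desktop User-Agent string to vary request headers.
    uas = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (X11; Linux x86_64)',
    ]
    return random.choice(uas)


class ImgSpider:
    def __init__(self):
        # Compiled regexes consumed by the methods above; BeautifulSoup's
        # find_all accepts a compiled pattern for the href filter.
        self.pattern_topic = re.compile(r'/p/\d+')
        self.pattern_img = re.compile(r'https?://[^"]+\.jpg')

    # url_parse, url_parse_topic and url_parse_img as defined above would be
    # attached here as methods.

# Example driver (hypothetical BASE_URL):
# spider = ImgSpider()
# for topic in spider.url_parse_topic(BASE_URL):
#     print(spider.url_parse_img(BASE_URL + topic))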