def get_info(self, url):
    info_result = Crawler.Info()
    soup = self.get_soup(url)
    # The first <meta> is either the charset declaration or a
    # "0; url=..." refresh directive pointing at the real article.
    real_url = soup.find("meta").get("content")
    if real_url != "text/html; charset=UTF-8":
        url = real_url.replace("0; url=", "")
    info_result.url = url
    soup = self.get_soup(url)
    title = soup.find("h1")
    info_result.title = title.text
    time_source = soup.find("div", class_="infoLine")
    ts = time_source.text.replace("\r", "").replace("\n", "").split(" ")
    info_result.time = ts[0]
    info_result.source = ts[1]
    img = soup.find("img", id="no_img")
    if img is None:
        article = soup.find("div", class_="content")
        ps = article.find_all("p")
        text = ""
        for p in ps:
            text = text + p.text + "\n"
        self.get_resum_description_from_text(text, info_result)
    else:
        # Image-only article: store the image address as the description.
        info_result.description = img.get("src")
    return info_result
def get_info(self, url):
    info_result = Crawler.Info()
    info_result.url = url
    # Follow JavaScript 'self.location="..."' redirects until we land on
    # the real article page (the redirect stubs declare a gb2312 charset).
    while True:
        soup = self.get_soup(url, opener)
        if soup.find("meta").get("content") == "text/html; charset=gb2312":
            js = soup.find("script").text
            fa = re.findall('self.location=".*?"', js)[0]
            url = indexUrl + fa[16:-1]
        else:
            break
    title = soup.find("h1", class_="fl")
    info_result.title = title.text
    time_source = soup.find("span", class_="fl")
    ts = time_source.text.split(" ")
    info_result.time = ts[0]
    info_result.source = ts[1]
    article = soup.find("dd")
    ats = article.find_all("p")
    text = ""
    if len(ats) == 0:
        # No <p> tags: collect the bare text nodes of <dd> instead.
        texts = article.contents
        for tx in texts:
            if isinstance(tx, NavigableString):
                text = text + tx
        info_result.description = text
    else:
        for at in ats:
            text = text + at.text.replace("\t", "").replace("\r", "")
        self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("div", class_="article_title") info_result.title = title.text info_result.time = soup.find("div", class_="article_date").text article = soup.find("div", class_="v_news_content") ps = article.find_all("p") text = "" for p in ps: text = text + p.text.replace("\t", "") + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("div", class_="main02_tit01") info_result.title = title.text time_source = soup.find("div", class_="main02_tit02 black font_16") ts = time_source.text.split(" ") info_result.time = ts[1] info_result.source = ts[0] article = soup.find("div", class_="main02_con font_17") text = article.text.replace("\n\n\n\n\n", "\n").replace("\n\n\n", "\n").replace("\r", "") self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("div", class_="cms-article-tit") info_result.title = title.text time_source = soup.find("div", class_="cms-article-attach").find_all("font") info_result.time = time_source[1].text info_result.source = time_source[0].text article = soup.find("div", class_="article-detail") text = article.text self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url):
    info_result = Crawler.Info()
    browser.get(url)
    info_result.url = url
    title = browser.find_element_by_class_name("title")
    info_result.title = title.text
    info_result.time = browser.find_element_by_id("edate").text
    info_result.source = browser.find_element_by_id("efrom").text
    article = browser.find_element_by_id("frameContent")
    ps = article.find_elements_by_tag_name("p")
    text = ""
    for p in ps:
        text = text + p.text.replace("\t", "") + "\n"
    self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url):
    info_result = Crawler.Info()
    info_result.url = url
    soup = self.get_soup(url)
    detail_tit = soup.find("div", class_="detail_tit")
    info_result.title = detail_tit.find("h2").text
    info_result.time = detail_tit.find("em").text
    info_result.source = detail_tit.find("span").text
    article = soup.find("div", class_="detail_con")
    ps = article.find_all("p")
    text = ""
    for p in ps:
        text = text + p.text + "\n"
    self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url browser.get(url) time.sleep(0.5) title = browser.find_element_by_class_name("tit") info_result.title = title.text ts = browser.find_element_by_class_name( "bon").find_elements_by_tag_name("span") info_result.time = ts[1].text info_result.source = ts[0].text text = browser.find_element_by_class_name("TRS_Editor").text # text = article.find_element_by_tag_name("div"). self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url):
    info_result = Crawler.Info()
    info_result.url = url
    soup = self.get_soup(url, opener)
    title_all = soup.find("div", class_="l-xq-tite bor-2e")
    info_result.title = title_all.find("h1").text
    info_result.time = title_all.find("label", class_="l-time").text
    info_result.source = title_all.find("label", class_="l-from").text
    article = soup.find("div", class_="l-content")
    ps = article.find_all("p")
    text = ""
    for p in ps:
        text = text + p.text + "\n"
    self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("div", class_="xl_tit1 tblack1") info_result.title = title.text time_source = soup.find("div", class_="xl_tit2").text info_result.time = re.findall("发布时间: \d{4}-|/\d{1,2}-|/\d{1,2}", time_source)[0] info_result.source = re.findall("文章来源:.*", time_source)[0] article = soup.find("div", class_="xl_con1") ps = article.find_all("div") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url, opener) title = soup.find("div", class_="btnr").find("span") info_result.title = title.text time_source = soup.find("div", class_="btnr").find_all("p") info_result.time = time_source[1].text info_result.source = time_source[0].text article = soup.find("div", id="textBox") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("h1") info_result.title = title.text time_source = soup.find("div", class_="infoBox").text info_result.time = re.findall("发布时间:\d{4}-|/\d{2}-|/\d{2} \d{2}:\d{2}", time_source)[0] info_result.source = re.findall("来源:.*? ", time_source)[0] article = soup.find("div", id="content") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("h3", class_="article") info_result.title = title.text time_source = soup.find("p", class_="laiyuan").find_all("span") info_result.time = time_source[0].text info_result.source = time_source[1].text article = soup.find("div", class_="content_txt") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("h2") info_result.title = title.text time_source = soup.find("div", class_="head").find("p").find_all("span") info_result.time = time_source[1].text info_result.source = time_source[0].text article = soup.find("div", id="show3") ats = article.find_all("p") text = "" for at in ats: text = text + at.text self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url browser.get(url) title = browser.find_element_by_class_name("tit") info_result.title = title.text info_result.time = browser.find_element_by_xpath( '//em[@class="e e2"]').text info_result.source = browser.find_element_by_xpath( '//em[@class="e e1"]').text article = browser.find_element_by_class_name("TRS_Editor") ps = article.find_elements_by_tag_name("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url, opener) title = soup.find("h1", class_="ctitle") info_result.title = title.text time_source = soup.find("div", class_="msgbar").text ts = time_source.split(" ") info_result.time = ts[0] info_result.source = ts[1] article = soup.find("div", class_="newsCon") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url, opener) title = soup.find("h2", class_="tit") info_result.title = title.text time_source = soup.find("div", class_="daty text-center") ts = time_source.find_all("span") info_result.time = ts[2].text info_result.source = ts[0].text article = soup.find("div", class_="content") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("h1") info_result.title = title.text time_source = soup.find("div", class_="c-conten-top") ts = time_source.find_all("i") info_result.time = ts[0].text info_result.source = ts[2].text article = soup.find("div", class_="c-conten-con", id="c-conten-con") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("h2", id="tts-title") info_result.title = title.text time_source = soup.find("p", class_="info") ts = time_source.find_all("span") info_result.time = ts[0].text info_result.source = ts[1].text article = soup.find("div", id="tts-text") ps = article.find_all("p") text = "" for p in ps: text = text + p.text.replace("\t", "") + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("h2") info_result.title = title.text time_source = soup.find("div", class_="summary").text info_result.time = re.findall("浏览次数:.*?\n", time_source)[0].replace("浏览次数:", "") info_result.source = re.findall("来源:.*? ", time_source)[0] article = soup.find("div", id="Info_Content") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url) title = soup.find("h1", class_="artTitle") info_result.title = title.text time = soup.find("span", id="pubtime_baidu") info_result.time = time.text.replace('\r', '').replace("\n", "") source = soup.find("span", id="source_baidu") info_result.source = source.text.replace('\r\n', '') article = soup.find("div", class_="artCon") ps = article.find_all("p") content = "" if len(ps) == 0: content = article.text info_result.description = content.replace("\n", "").replace("\r", "") return info_result flag = True resume_content = "" plen = len(ps) nu = 1 name = "" for p in ps: if flag: resume = p.find(text=re.compile("(^.*简历$)|(^.*简历:$)")) if resume is None: content = content + p.text + "\r\n" else: flag = False name = resume.replace(" ", "").replace(":", "").replace("简历", "").replace(" ", "").replace(" ", "") resume_content = resume_content + p.text + "\r\n" else: st = p.text.replace(" ", "") if st.startswith("1") or st.startswith("2") or st.startswith(name): if plen == nu and re.match("(.*?监委)", p.text) is not None: content = content + p.text + "\r\n" else: resume_content = resume_content + p.text + "\r\n" else: content = content + p.text + "\r\n" flag = True nu = nu + 1 info_result.description = content info_result.resume = resume_content return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url browser.get(url) time.sleep(0.5) title = browser.find_element_by_id("ScDetailTitle") info_result.title = title.text time_source = browser.find_element_by_class_name("desc") ts = time_source.text.split(" ") info_result.time = ts[1] info_result.source = ts[0] article = browser.find_element_by_id("ScDetailContent") ps = article.find_elements_by_tag_name("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url):
    info_result = Crawler.Info()
    info_result.url = url
    soup = self.get_soup(url)
    all_s = soup.find("div", class_="article")
    title = all_s.find("h2")
    info_result.title = title.text
    time_source = all_s.find("div", class_="fl")
    ts = time_source.text.replace("\t", "").split(" ")
    info_result.time = ts[0]
    info_result.source = ts[1]
    article = soup.find("div", class_="article_content")
    ps = article.find_all("p")
    text = ""
    for p in ps:
        text = text + p.text + "\n"
    self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url, opener) all_ss = soup.find("div", class_="defaultTitle") title_all = all_ss.find("h2") info_result.title = title_all.text time_source = all_ss.find("div") ts = time_source.text.split("\n\t\t\t\t\t\t") info_result.time = ts[0] info_result.source = ts[1] article = soup.find("div", class_="defaultContent") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url soup = self.get_soup(url, opener) all_ss = soup.find("div", id="text") title_all = all_ss.find("h1") info_result.title = title_all.text time_source = all_ss.find("div", class_="ls") ts = time_source.text info_result.time = re.findall("\d{4}-\d{2}-\d{2} \d{2}:\d{2}", ts)[0] info_result.source = re.findall("来源:.*? ", ts)[0] article = soup.find("div", class_="nr") ps = article.find_all("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result
def get_info(self, url):
    info_result = Crawler.Info()
    info_result.url = url
    soup = self.get_soup(url, opener)
    title = soup.find("div", id="ivs_title")
    info_result.title = title.text
    info_result.time = soup.find("span", class_="date").text
    source = soup.find("p", class_="source")
    if source is not None:
        info_result.source = source.text
    article = soup.find("div", id="ivs_content")
    ps = article.find_all("p")
    text = ""
    for p in ps:
        text = text + p.text + "\n"
    self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url):
    # These two pages are known to be unparseable and are skipped deliberately.
    if url in ("http://www.hnlzw.net/page.php?xuh=44175",
               "http://www.hnlzw.net/page.php?xuh=39496"):
        return None
    info_result = Crawler.Info()
    info_result.url = url
    soup = self.get_soup(url)
    title = soup.find("div", id="arttitl")
    info_result.title = title.text
    time_source = soup.find("div", id="artdes").text
    ts = time_source.split(" ")
    info_result.time = ts[0]
    info_result.source = ts[1]
    article = soup.find("div", id="artcon")
    ps = article.find_all("p")
    text = ""
    for p in ps:
        text = text + p.text + "\n"
    self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url):
    info_result = Crawler.Info()
    try:
        soup = self.get_soup(url)
    except HTTPError as e:
        print("can't find " + url)
        print(e)
        return None
    info_result.url = url
    title = soup.find("h1")
    info_result.title = title.text
    time_source = soup.find("div", class_="min_cc")
    # The line reads "<source>发布时间:<time>", so split on the label.
    ts = time_source.text.split("发布时间:")
    info_result.time = ts[1]
    info_result.source = ts[0]
    article = soup.find("div", class_="min_content")
    ps = article.find_all("p")
    text = ""
    for p in ps:
        text = text + p.text + "\n"
    self.get_resum_description_from_text(text, info_result)
    return info_result
def get_info(self, url): info_result = Crawler.Info() info_result.url = url browser.get(url) time.sleep(0.5) title = browser.find_element_by_id("arttitl").text info_result.title = title time_source = browser.find_element_by_id("artdes").text tt = re.findall("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}", time_source) if len(tt) != 0: info_result.time = tt[0] source = re.findall("来源:.*?作者", time_source) if len(source) == 0: source = re.findall("来源:.*?编辑", time_source) if len(source) > 0: info_result.source = source[0] article = browser.find_element_by_id("artcon") ps = article.find_elements_by_tag_name("p") text = "" for p in ps: text = text + p.text + "\n" self.get_resum_description_from_text(text, info_result) return info_result