def parse_news(self, response):
    log.msg("Start to parse news " + response.url, level=log.INFO)
    item = SpiderNewsAllItem()
    day = title = _type = keywords = url = article = ''
    url = response.url
    day = response.meta['day']
    title = response.meta['title']
    _type = response.meta['_type']
    soup = BeautifulSoup(response.body)
    try:
        # Join every keyword tag into one space-separated string.
        items_keywords = soup.find_all(class_='hotword')
        for i in range(0, len(items_keywords)):
            keywords += items_keywords[i].text.strip() + ' '
    except AttributeError:
        log.msg("News " + title + " has no keywords!", level=log.INFO)
    try:
        article = soup.find(id='newscontent').text.strip()
    except AttributeError:
        log.msg("News " + title + " has no article!", level=log.INFO)
    item['title'] = title
    item['day'] = day
    item['_type'] = _type
    item['url'] = url
    item['keywords'] = keywords
    item['article'] = article
    item['site'] = u'证券日报网'
    return item
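# For context, a minimal sketch of the listing callback that would feed
# parse_news above; it is an assumption, not code from this project. The
# parser reads 'day', 'title' and '_type' from response.meta, so whatever
# schedules the request must put them there. parse_list and the 'news_link'
# selector are hypothetical names.
from bs4 import BeautifulSoup
from scrapy import Request

def parse_list(self, response):  # hypothetical listing callback
    soup = BeautifulSoup(response.body)
    for link in soup.find_all('a', class_='news_link'):  # hypothetical selector
        yield Request(link['href'],
                      callback=self.parse_news,
                      meta={'day': '2018-01-01',          # e.g. from the listing row
                            'title': link.text.strip(),
                            '_type': u'news'})            # hypothetical category label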
def parse_news(self, response):
    log.msg("Start to parse news " + response.url, level=log.INFO)
    item = SpiderNewsAllItem()
    day = title = _type = keywords = url = article = ''
    # Read the fields handed over in the request meta.
    url = response.url
    day = response.meta['day']
    title = response.meta['title']
    _type = response.meta['_type']
    soup = BeautifulSoup(response.body)
    # Locate the article body.
    try:
        # article = soup.find(class_='postTitle').text.strip()
        article = soup.find(id='news_body').text.strip()
    except AttributeError:
        log.msg("News " + title + " has no article!", level=log.INFO)
    item['title'] = title
    item['day'] = day
    item['_type'] = _type
    item['url'] = url
    item['keywords'] = keywords
    item['article'] = article
    item['site'] = u'博客园'
    return item
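# The try/except above works because soup.find() returns None for a missing
# id and the .text access then raises AttributeError. An equivalent guard
# without exceptions, as a standalone sketch with toy input:
from bs4 import BeautifulSoup

html = '<div id="news_body"> Hello </div>'
soup = BeautifulSoup(html, 'html.parser')
node = soup.find(id='news_body')
article = node.text.strip() if node is not None else ''
print(article)  # -> 'Hello'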
def parse_news(self, response):
    log.msg("Start to parse news " + response.url, level=log.INFO)
    item = SpiderNewsAllItem()
    day = title = _type = keywords = url = article = markdown = ''
    url = response.url
    day = response.meta['day']
    title = response.meta['title']
    _type = response.meta['_type']
    soup = BeautifulSoup(response.body)
    # try:
    #     items_keywords = soup.find(class_='ar_keywords').find_all('a')
    #     for i in range(0, len(items_keywords)):
    #         keywords += items_keywords[i].text.strip() + ' '
    # except AttributeError:
    #     log.msg("News " + title + " has no keywords!", level=log.INFO)
    try:
        # Pick the tag holding the news body, depending on the page type.
        if re.search("translate", url):
            # Translated articles: the body is split across several divs.
            article = soup.find_all("div", class_="translate-content")
            markdown = Tomd("".join(str(tag) for tag in article)).markdown
            article = ''.join(tag.text.strip() for tag in article)
        else:
            if re.match("https://gitee.com", url):
                # "码云推荐": grab the project description, usually the
                # README.md content.
                # CSS selector: #git-readme > div > div.file_content.markdown-body
                article = soup.find("div", class_="file_content markdown-body")
            elif re.match("https://blog.gitee.com", url):
                # "码云周刊"
                article = soup.find("div", class_="entry-content")
            # Other common page layouts.
            elif soup.find("div", class_=["content", "box-aw main"]):
                article = soup.find("div", class_=["content", "box-aw main"])
            else:
                article = soup.find("section", class_=["wrap cke_editable cke_editable_themed cke_contents_ltr cke_show_borders clearfix"])
            # Drop the embedded ad block before converting.
            if article and article.find("div", class_="ad-wrap") is not None:
                article.find("div", class_="ad-wrap").extract()
            markdown = Tomd(str(article)).markdown
            article = article.text.strip()  # extract the tag's plain text
    except AttributeError:
        log.msg("News " + title + " has no article!", level=log.INFO)
    item['title'] = title
    item['day'] = day
    item['_type'] = _type
    item['url'] = url
    item['keywords'] = keywords
    item['article'] = article
    item['site'] = '开源中国'
    item['markdown'] = markdown
    return item
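# A toy demonstration of the extract-then-convert step above, using the same
# Tomd(html).markdown call as the parser; the fragment and class names here
# are made up for the example:
from bs4 import BeautifulSoup
from tomd import Tomd

html = ('<div class="content"><h1>Title</h1><p>Body text.</p>'
        '<div class="ad-wrap">buy now</div></div>')
article = BeautifulSoup(html, 'html.parser').find('div', class_='content')
ad = article.find('div', class_='ad-wrap')
if ad is not None:
    ad.extract()                       # drop the ad before converting
print(Tomd(str(article)).markdown)     # markdown without the ad block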
def parse_news(self, response):
    log.msg("Start to parse news " + response.url, level=log.INFO)
    item = SpiderNewsAllItem()
    day = title = _type = keywords = url = article = ''
    url = response.url
    day = response.meta['day']
    title = response.meta['title']
    # Unlike the sibling parsers, no '_type' is passed in the meta for this
    # spider, so it stays an empty string.
    soup = BeautifulSoup(response.body)
    try:
        article = soup.find(id='ozoom').text.strip()
    except AttributeError:
        log.msg("News " + title + " has no article!", level=log.INFO)
    item['title'] = title
    item['day'] = day
    item['_type'] = _type
    item['url'] = url
    item['keywords'] = keywords
    item['article'] = article
    item['site'] = u'证券日报'
    return item
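# A sketch of exercising the 证券日报 parser above offline, assuming a spider
# instance named `spider`: scrapy's HtmlResponse exposes response.meta through
# the attached Request, which is exactly what the parser reads. The URL and
# body are made up for the example.
from scrapy.http import HtmlResponse, Request

url = 'http://example.com/news/1.html'
req = Request(url, meta={'day': '2018-01-01', 'title': u'demo'})
resp = HtmlResponse(url=url, request=req,
                    body=b'<div id="ozoom">body text</div>')
item = spider.parse_news(resp)
print(item['article'])  # -> 'body text'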
def parse_news(self, response):
    log.msg("Start to parse news " + response.url, level=log.INFO)
    item = SpiderNewsAllItem()
    day = title = _type = keywords = url = article = markdown = ''
    url = response.url
    day = response.meta['day']
    title = response.meta['title']
    _type = response.meta['_type']
    soup = BeautifulSoup(response.body)
    # try:
    #     items_keywords = soup.find(class_='ar_keywords').find_all('a')
    #     for i in range(0, len(items_keywords)):
    #         keywords += items_keywords[i].text.strip() + ' '
    # except AttributeError:
    #     log.msg("News " + title + " has no keywords!", level=log.INFO)
    try:
        content_paragraph = soup.find("div", class_="text_info")
        # Everything before the trailing <div class="clear"> is the body;
        # previous_siblings walks backwards, so insert(0, ...) restores
        # document order.
        article = []
        for tag in content_paragraph.find("div", class_="clear").previous_siblings:
            article.insert(0, tag)
        markdown = Tomd(''.join(str(tag) for tag in article)).markdown.decode("unicode-escape")
        article = BeautifulSoup(''.join([str(tag) for tag in article])).get_text().strip()
    except AttributeError:
        log.msg("News " + title + " has no article!", level=log.INFO)
    item['title'] = title
    item['day'] = day
    item['_type'] = _type
    item['url'] = url
    item['keywords'] = keywords
    item['article'] = article
    item['site'] = 'InfoQ'
    item['markdown'] = markdown
    return item
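# Why the insert(0, ...) above works: previous_siblings yields tags in
# reverse document order, so prepending each one rebuilds the original order.
# A toy check of that behaviour:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><p>a</p><p>b</p><div class="clear"></div></div>',
                     'html.parser')
marker = soup.find('div', class_='clear')
tags = []
for tag in marker.previous_siblings:   # yields <p>b</p>, then <p>a</p>
    tags.insert(0, tag)
print([t.text for t in tags])          # -> ['a', 'b']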