# -*- coding: utf-8 -*-
# Crawler snippets for several news sources. All of them assume:
#   import json
#   from bs4 import BeautifulSoup
#   import script   # project helper module: capture / insert_news / insert_detail / error / appendIDs
# plus a per-site SOURCE_HOST constant. The code targets Python 2
# (print statement, unicode()).


# Meiyou: fetch each category feed and hand the response to insert()
def news():
    # period channel
    insert(script.capture('https://news.meiyou.com/news-api/v2/web_news_more?category_id=16'), 1)
    # pregnancy-prep channel
    insert(script.capture('https://news.meiyou.com/news-api/v2/web_news_more?category_id=18'), 2)
    # parenting channel
    insert(script.capture('https://news.meiyou.com/news-api/v2/web_news_more?category_id=19'), 3)
    # beauty channel
    insert(script.capture('https://news.meiyou.com/news-api/v2/web_news_more?category_id=8'), 4)
    # health channel
    insert(script.capture('https://news.meiyou.com/news-api/v2/web_news_more?category_id=15'), 5)
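# A minimal sketch of what `script.capture` presumably does, inferred from its
# call sites: an HTTP GET that returns the response body, or the sentinel
# string "FAIL" on any error. This is an assumption for illustration; the real
# helper lives in the project's `script` module.
import urllib2

def capture(url, headers=None):
    try:
        request = urllib2.Request(url, headers=headers or {})
        return urllib2.urlopen(request, timeout=10).read()
    except Exception:
        return "FAIL"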
# Dayima: fetch and parse one article page
def detail(id):
    if id is None:  # was `== None`
        print 'url must not be nil'
        return
    content = script.capture('http://www.dayima.com/articles/article/' + id)
    if content == "FAIL" or content is None:
        return 'failed to fetch content'
    soup = BeautifulSoup(content)
    info = soup.find('div', class_='leftArea')
    if info is None:
        return script.error(id)
    # "artilce" is misspelled in the site's own markup; the selectors must match it
    titleTag = info.find('div', class_='artilce_title')
    newsContent = info.find('div', class_='article_content')
    newsContent = unicode(newsContent).replace("<br/>", " ")
    bref = unicode(info.find('div', class_='article_brief')).replace("<br/>", " ")
    title = titleTag.string
    time = info.find('span', class_='artilce_time')
    sources = u'大姨妈'
    return script.insert_detail(id, title, bref + newsContent, sources, time.string)
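# Hedged guess at `script.error`: every scraper calls it when an expected DOM
# node is missing, and callers treat anything other than 1 as "skip this id",
# so it plausibly just logs the failure and returns a falsy value. The real
# implementation lives in the project's `script` module.
def error(news_id):
    print 'parse failed for id: ' + str(news_id)
    return None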
# Meiyou: fetch and parse one article page
def detail(id):
    if id is None:
        print 'url must not be nil'
        return
    content = script.capture('https://news.meiyou.com/news_detail?news_id=' + str(id))
    # check for failure before touching the string (the original called
    # .replace first, which would crash on a None response)
    if content == "FAIL" or content is None:
        return 'failed to fetch content'
    # the page ships a stray </html> that trips up the parser
    content = content.replace('</html>', '')
    soup = BeautifulSoup(content)
    info = soup.find('div', class_='warp')
    if info is None:
        return script.error(id)
    warp_title = info.find('div', class_='warp-title')
    news_content = info.find('div', class_='news-content')
    news_content = unicode(news_content).replace("<br/>", "")
    title = warp_title.h2
    time = warp_title.find('span', class_='n-time')
    # the source name is the last <span> in the title block
    sources = warp_title.findAll('span')
    return script.insert_detail(id, title.string, news_content, sources[-1].string, time.string)
# Yidianzixun: fetch and parse one article page
def detail(id):
    try:
        content = script.capture('https://www.yidianzixun.com/article/' + id)
        soup = BeautifulSoup(content)
        wrapperTag = soup.find('div', class_='left-wrapper')
        if wrapperTag is None:
            return script.error(id)
        titleTag = wrapperTag.h2
        if titleTag is None:
            return script.error(id)
        metaTag = wrapperTag.find('div', class_='meta')
        sourceTag = metaTag.a
        timeTag = metaTag.span
        detail = wrapperTag.find('div', class_='content-bd')
        if detail is None:
            # video articles keep their body in a different container
            detail = wrapperTag.find('div', class_='video-wrapper')
        return script.insert_detail(id, titleTag.string, detail, sourceTag.string, timeTag.string)
    except:
        return None
# Yidianzixun: fetch a channel feed and store each plain news item
def insert(url, type, cookie=''):
    headers = {
        'Referer': 'https://www.yidianzixun.com/channel/e136117',
        'cookie': 'JSESSIONID=5b833bbd91a7574bbfed3af92d4b4817966f7198e7c2845b95639ad32dc64e3c;',
        'content-type': 'application/json; charset=utf-8'
    }
    try:
        content = script.capture(url, headers)
        content = json.loads(content)
    except:
        print url + ": failed to parse"
        return
    if 'result' not in content:
        return
    ids = []
    for news in content['result']:
        # skip anything that is not a plain news item; the original compared
        # with `is not 'news'`, an identity check that always skipped the
        # unicode strings json.loads returns
        if 'content_type' not in news or news['content_type'] != 'news':
            continue
        news_id = news['itemid']
        title = news['title'].replace("\n", "").strip()
        sourceName = u'一点资讯'
        author = news['source']
        summary = news['summary']
        ico = ''
        if 'wemedia_info' in news and 'image' in news['wemedia_info']:
            ico = news['wemedia_info']['image']
        imgs = []
        if 'image_urls' in news:
            for img in news['image_urls']:
                imgs.append('https://i1.go2yd.com/image.php?type=thumbnail_336x216&url=' + img)
        if title is None or news_id is None:
            continue
        script.insert_news(news_id, title, sourceName, SOURCE_HOST, author, 0, ico, type, imgs, summary)
        if detail(news_id) == 1:
            ids.append(news_id)
    print '--------------insert yidianzixun type:' + str(type) + ' count:' + str(len(content['result'])) + '-----------------------------------'
    script.appendIDs(ids)
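# Inferred signatures of the persistence helpers in `script`, reconstructed
# from the call sites above. This is a hypothetical sketch; the storage
# backend itself is not part of these snippets.
def insert_news(news_id, title, source_name, source_host, author,
                read_count, source_ico, type, imgs, summary=''):
    # write one feed row; imgs is a list/tuple of thumbnail URLs
    pass

def insert_detail(news_id, title, content, source, publish_time):
    # write the article body; returns 1 on success (callers test `== 1`)
    pass

def appendIDs(ids):
    # remember which ids were stored so a rerun can skip them
    pass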
# Sohu: fetch and parse one article page
def detail(id):
    if id is None:
        print 'url must not be nil'
        return
    content = script.capture('http://www.sohu.com/a/' + str(id))
    if content == "FAIL" or content is None:
        print 'failed to fetch content'
        return
    soup = BeautifulSoup(content)
    info = soup.find('div', class_='text')
    if info is None:
        return script.error(id)
    headerTag = info.find('div', class_='text-title')
    # take the first non-empty string inside the <h1> as the title
    titles = headerTag.find('h1').stripped_strings
    for t in titles:
        title = t
        break
    author = u'搜狐新闻'
    try:
        sourceTag = headerTag.find('div', class_='article-info')
        timeTag = sourceTag.find('span', class_='time')
        authorsTag = sourceTag.find('span', class_='tag').findAll('a')
        if len(authorsTag) > 0:  # was `>= 0`, which is always true
            author = authorsTag[-1].string
    except:
        pass
    try:
        news_content = info.find('div', class_='article')
        if news_content is None:
            news_content = info.find('article', class_='article')
        # drop the trailing "backword" span before serializing the body
        news_content.find('span', class_='backword').extract()
        news_content = unicode(news_content).replace("<br/>", "")
        return script.insert_detail(id, title, news_content, author, timeTag.string)
    except:
        return None
# Dayima: scrape the article listing page
def news():
    url = 'http://www.dayima.com/articles'
    content = script.capture(url)
    if content == "FAIL" or content is None:  # was `is "FAIL"`, an identity check
        return 'invalid path'
    soup = BeautifulSoup(content)
    articles = soup.findAll('div', class_='dotted')
    ids = []
    for article in articles:
        title = article.find('div', class_='title')
        aTag = title.a
        url = aTag.get('href')
        name = aTag.string
        # the article id is the last path segment of the link
        query = url.split('/')
        news_id = query[-1]
        pic = article.find('div', class_='picArea')
        imgTag = pic.img
        read_count = 0
        source_name = u'大姨妈'
        source_ico = ''
        script.insert_news(news_id, name, source_name, SOURCE_HOST, '', read_count,
                           source_ico, 1, (imgTag['src'],))
        if detail(news_id) == 1:
            ids.append(news_id)
    print '----------------------- insert dayima count: ' + str(len(articles)) + ' -----------------------------'
    script.appendIDs(ids)
# Sohu: fetch a JSON feed and store each item
def insert(url, type):
    content = script.capture(url)
    items = json.loads(content)  # renamed from `list`, which shadowed the builtin
    ids = []
    for item in items:
        title = item['title'].replace("\n", "").strip()
        authorURL = ''
        # authorPic is protocol-relative; prepend https: when it looks valid
        if item['authorPic'] is not None and len(item['authorPic']) > 10:
            authorURL = 'https:' + item['authorPic']
        news_id = str(item['id']) + '_' + str(item['authorId'])
        script.insert_news(news_id, title, u'搜狐新闻', SOURCE_HOST, item['authorName'],
                           0, authorURL, type, item['images'])
        if detail(news_id) == 1:
            ids.append(news_id)
    print '----------------------- insert sohu type:' + str(type) + ' count:' + str(len(items)) + ' -----------------------------'
    script.appendIDs(ids)
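# Hypothetical driver mirroring the Meiyou news() pattern above: loop over the
# category feeds and pass each one to insert(). The URLs below are
# placeholders for illustration, not Sohu's real feed endpoints.
def news():
    feeds = {
        1: 'https://example.com/sohu/feed_1.json',  # placeholder URL
        2: 'https://example.com/sohu/feed_2.json',  # placeholder URL
    }
    for type, url in feeds.items():
        insert(url, type)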