def parse(self, response):
    try:
        jsonObj = json.loads(response.text)
        print('Number of news items fetched:', len(jsonObj['data']))
        for data in jsonObj['data']:
            jsonNowObj = json.loads(data['content'])
            print(jsonNowObj['title'])
            missionBean = MissionBean(response.url, 0, ['train_rank'])
            missionBean.title = jsonNowObj['title']
            missionBean.info = jsonNowObj
            missionBean.info.update({
                'news_type': '头条热点流',
                'resource': '头条推荐流'
            })
            # Entries tagged 'ad' are advertisements: skip them
            if missionBean.info.get('tag') == 'ad':
                print('This is an ad, skipping it')
                continue
            self.client.save(missionBean)
    finally:
        # Always re-queue the same URL with a fresh random proxy
        ipDict = getRandomOneIP()
        yield Request(url=response.url,
                      headers=self.headers,
                      dont_filter=True,
                      meta={
                          'proxy': 'http://' + ipDict['ip'] + ':' + ipDict['port']
                      })
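# A minimal sketch (an assumption, not the project's real helper) of the shape
# getRandomOneIP() must return, inferred from how parse() consumes it: a dict
# with string 'ip' and 'port' fields drawn from some proxy pool.
def _example_getRandomOneIP():
    import random
    proxy_pool = [{'ip': '127.0.0.1', 'port': '8888'}]  # hypothetical pool
    return random.choice(proxy_pool)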
def parse_item(self, response):
    i = 0
    bodys = response.xpath('//table[@class="list-table"]/tr')
    for body in bodys:
        # Only rows with a "first" cell are real ranking entries
        if body.xpath('.//td[@class="first"]').extract():
            items = {}
            num = body.xpath(
                './/td[@class="first"]/span/text()').extract_first()
            title = body.xpath(
                './/td[@class="keyword"]/a/text()').extract_first()
            href = body.xpath(
                './/td[@class="keyword"]/a/@href').extract_first()
            focus_num = body.xpath(
                './/td[@class="last"]/span/text()').extract_first()
            items['index'] = num
            items['title'] = title
            items['news_type'] = '百度' + response.meta['news_type']
            items['url'] = href
            items['num'] = int(focus_num)
            items['focus_num'] = focus_num
            items['resource'] = '百度'
            print(items)
            i = i + 1
            try:
                missionBean = MissionBean(href, 500, ['train_hotword'])
                missionBean.title = title
                missionBean.info = items
                self.client.save(missionBean)
            except Exception:
                print('Failed to save to the database')
                traceback.print_exc()
    print('Items fetched this round: {}'.format(i))
    self.sleepMyself()
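# Hedged offline check of the XPaths above: they assume Baidu top-list rows
# shaped roughly like this hypothetical fragment. scrapy's Selector(text=...)
# lets you verify the extraction logic without running the spider.
from scrapy.selector import Selector

_SAMPLE_ROW = ('<table class="list-table"><tr>'
               '<td class="first"><span>1</span></td>'
               '<td class="keyword"><a href="http://example.com">word</a></td>'
               '<td class="last"><span>123</span></td>'
               '</tr></table>')
assert Selector(text=_SAMPLE_ROW).xpath(
    '//td[@class="first"]/span/text()').extract_first() == '1'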
def __waitforImgs(self, urlImgList, newsId):
    """
    Download a batch of images; raise an exception as soon as any one fails.
    :param urlImgList: list of image URLs
    :param newsId: id of the article the images belong to
    :return:
    """
    urlImgListAfterFilter = []
    for img in urlImgList:
        # Normalize protocol-relative URLs
        if img[0:2] == '//':
            img = 'http:' + img
        urlImgListAfterFilter.append(img)
        missionBean = MissionBean(img, 7000, [])
        missionBean.isFileTag = True
        missionBean.downloadCallback = 'set'
        redisDownloadSaveDb3(
            str(self.DEEP_DOWNLOAD) + '_' + self.FILE_KEY_DOWNLOAD,
            missionBean.getRedisDict())
    for url in urlImgListAfterFilter:
        timeStart = datetime.datetime.now().timestamp()
        while True:
            # 30-second download timeout
            if datetime.datetime.now().timestamp() - timeStart > 30:
                raise Exception('Image download timed out')
            msgStr = redisGet(
                3, self.FILE_KEY_DOWNLOAD + '_callback_' +
                str(self.TYPE_DOWNLOAD) + ':' + url)
            if msgStr is None:
                sleep(1)
                continue
            dictMsg = json.loads(msgStr)
            print('Image downloaded, saving')
            dictInsert = {}
            fileUrl = dictMsg['fileUrl']
            fileSome = fileUrl.split('/')
            urlSome = url.split('/')  # was img, which only held the last loop value
            lstTag = ['.jpg', '.jpeg', '.gif', '.bmp', '.png']
            for tag in lstTag:
                if tag in fileUrl:
                    dictInsert['fileUrl'] = 'testFileName' + tag
                    break
            else:
                # No known extension found: fall back to .jpg (the original
                # overwrote the matched value unconditionally)
                dictInsert['fileUrl'] = 'testFileName.jpg'
            dictInsert['Uploaded size'] = dictMsg['info']['file']['size']
            dictInsert['Storage IP'] = fileSome[2]
            dictInsert['Remote file_id'] = fileUrl.split(fileSome[2] + '/')[1]
            dictInsert['imgUrl'] = dictMsg['url']
            dictInsert['imgName'] = urlSome[len(urlSome) - 1]
            dictInsert['Group name'] = fileSome[3]
            dictInsert['articleId'] = newsId
            self.new_db['d_news_images'].save(dictInsert)
            break
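# Hedged sketch of the callback payload __waitforImgs expects to find in
# redis, inferred only from the fields it reads; the exact schema the
# downloader side produces is an assumption, and all values are hypothetical.
_EXAMPLE_CALLBACK_MSG = {
    'fileUrl': 'http://172.10.3.1/group1/M00/00/00/example.jpg',  # storage path
    'url': 'http://example.com/img/example.jpg',                  # original image URL
    'info': {'file': {'size': 10240}},                            # uploaded size in bytes
}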
def parse_item(self, response):
    info = response.request.info
    html = response.body.decode()
    match = self.get_addr(html)
    if len(match) > 0:
        info['videoUrl'] = match[0]
    else:
        # No video address found on the page: nothing to save
        return
    bs4 = BeautifulSoup(response.text, 'html.parser')
    info['img'] = bs4.select_one('div[id="poster"]').select_one('img')['src']
    missionBean = MissionBean(response.url, 3, ['fishing_new'])
    missionBean.html = html
    missionBean.title = info['title']
    missionBean.info = info
    self.client.save(missionBean)
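# get_addr is not shown here; a minimal sketch of what it is assumed to do,
# based on how parse_item uses its result (a list whose first element is the
# video URL). The regex pattern is a placeholder, not the project's real one.
import re

def _example_get_addr(html):
    return re.findall(r'https?://[^\s"\']+\.mp4', html)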
def __getMissionBeanFromRedis(self, requestKeys):
    random.shuffle(requestKeys)  # randomize the key order
    for redisKey in requestKeys:
        keyNow = redisGetBiggetDeepKey(3, redisKey)
        if keyNow is None:
            print(redisKey + '_redis has no task (no key exists at all)')
            time.sleep(1)
            return None
        strMissionBean = redisRPop(3, keyNow)
        # The download queue is empty (pop before json.loads: the original
        # decoded first, so an empty queue raised TypeError instead of
        # returning None)
        if strMissionBean is None:
            print(redisKey + '_redis has no task (no missionBean in the key)')
            return None
        missionBean = MissionBean("", 0, [])
        missionBean.__dict__ = json.loads(strMissionBean)
        missionBean.downloadMethod = redisKey
        return missionBean
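# Hedged usage sketch of the polling pattern this helper supports; the queue
# names and process() handler below are hypothetical, so it is left as a
# comment rather than runnable code:
#     while True:
#         bean = self.__getMissionBeanFromRedis(['html_download', 'file_download'])
#         if bean is None:
#             continue  # nothing queued; the helper already slept one second
#         process(bean)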
def parse_item(self, response):
    info = response.request.info
    html = response.text
    bs4 = BeautifulSoup(html, 'html.parser')
    content = bs4.select_one('div[class="content"]').prettify()
    info['content'] = content
    missionBean = MissionBean(response.url, 1001, ['qutoutiao'])
    missionBean.info = info
    missionBean.html = html
    missionBean.title = info['title']
    # Assemble the production bean
    newsBean = NewsBean()
    newsBean.titleInfo = info['title']
    newsBean.content = info['content']
    newsBean.url = response.url
    newsBean.newsId = info['id']
    newsBean.tags = info['tag']
    newsBean.etc = {'news_type': info['type']}
    newsBean.fromChannel = self.TYPE_DICT.get(int(info['type']), '其他')
    newsBean.fromSpider = '推荐流'
    newsBean.fromType = 8
    newsBean.goodNum = int(info['like_num'])
    newsBean.commentNum = int(info['comment_count'])
    newsBean.readNum = int(info['read_count'])
    newsBean.mediaName = info['source_name']
    newsBean.mediaId = info['source_name']
    newsBean.introduction = info['introduction']
    newsBean.imgUrls = info['cover']
    newsBean.shareNum = info['share_count']
    # newsBean.__dict__ is a live reference, so the timestamp fixes below are
    # still visible through missionBean.info
    missionBean.info = newsBean.__dict__
    # publishDate and createTime can only be passed as plain timestamps
    # because of how the bean is serialized into redis
    # TODO pass timestamps only
    newsBean.publishDate = datetime.datetime.fromtimestamp(
        int(info['publish_time']) / 1000).timestamp()
    newsBean.createTime = newsBean.createTime.timestamp()
    daoFilterAndSave.MongoFilterSave(missionBean)
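# publish_time arrives as epoch milliseconds, so the fromtimestamp/timestamp
# round trip above reduces to a divide-by-1000. A standalone illustration
# (the value is hypothetical):
import datetime

_ms = 1546300800000  # hypothetical publish_time in milliseconds
print(datetime.datetime.fromtimestamp(_ms / 1000).timestamp())  # 1546300800.0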
from zywa_database_core.dao.mongo.mongoClientMyself import MongoClientMyself
from zywa_extract_helper.model.missionBean import MissionBean
# NOTE: redisLPush is used below but was never imported; it must come from
# the project's redis helper module.

if __name__ == '__main__':
    __mongoClient = MongoClientMyself(host="172.10.3.219",
                                      port=20000,
                                      db="xiaociwei",
                                      user="******",
                                      password="******")
    items = __mongoClient.selectAll(tableName='iqiyi_video')
    i = 0
    for item in items:
        missionBean = MissionBean('', 0, [])
        missionBean.__dict__ = item
        print(i)
        print(missionBean.title)
        redisLPush(4, 'data_clear_' + str(missionBean.type),
                   missionBean.getRedisDict())
        i += 1
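# Hedged sketch of the redisLPush helper's assumed behavior, inferred from the
# call site above: LPUSH a JSON-serializable dict onto a list in the given
# redis db. The connection details are hypothetical; the real helper lives in
# the project's redis module.
import json
import redis

def _example_redisLPush(db, key, value):
    client = redis.StrictRedis(host='localhost', port=6379, db=db)
    client.lpush(key, json.dumps(value))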
def parse(self, response):
    try:
        if 'top.baidu' in response.url:
            modes = response.xpath(
                '//div[@class="hblock"]/ul/li/a/@href').extract()
            for mode in modes[1:]:
                news_type = response.xpath(
                    '//div[@class="hblock"]/ul/li[{}]/a/@title'.format(
                        str(1 + modes.index(mode)))).extract_first()
                yield Request(url=self.baidu_mainurl + mode[1:],
                              callback=self.parse_item,
                              dont_filter=True,
                              meta={'news_type': news_type},
                              priority=2)
        if 'weibo' in response.url:
            # Grab the text of every <script> block on the page
            rhtml = response.xpath('//script/text()').extract()
            # Script 8 holds the table whose id is realtimehot; the index was
            # found by counting scripts on the page, so it is fragile
            htm = rhtml[8]
            start = htm.find("(")
            substr = htm[start + 1:-1]  # cut the JSON string out of the script
            html = json.loads(substr)['html']
            bs4 = BeautifulSoup(html, 'html.parser')
            trTags = bs4.select('tr[action-type="hover"]')
            print('Candidate hot words found:', len(trTags))
            for trTag in trTags:
                dictInfo = {}
                dictInfo['index'] = trTag.find('em').string
                dictInfo['title'] = trTag.find('p', class_='star_name').a.string
                dictInfo['url'] = trTag.find('p', class_='star_name').a.get('href')
                dictInfo['resource'] = '微博'
                try:
                    dictInfo['num'] = int(
                        trTag.find('p', class_='star_num').span.string)
                except Exception:
                    dictInfo['num'] = -1
                missionBean = MissionBean(dictInfo['url'], 501,
                                          ['train_hotword'])
                missionBean.title = str(dictInfo['title'])
                if 'realtimehot' in response.url:
                    missionBean.info = {'news_type': '微博热搜'}
                if 'socialevent' in response.url:
                    missionBean.info = {'news_type': '微博新时代'}
                missionBean.info.update(dictInfo)
                print(missionBean.title)
                self.client.save(missionBean)
        if 'news.163' in response.url:
            typeName0 = '163'
            bs4 = BeautifulSoup(response.text, 'html.parser')
            items = bs4.select_one('div[class="area areabg1"]')
            i = 0
            for titleBarTag in items.select('div[class="titleBar"]'):
                # The page lays its panels out in parallel: the n-th titleBar
                # corresponds to the n-th left and right panel
                typeName1 = titleBarTag.select_one('h2').get_text()  # category name
                # The left panel is the click ranking (点击榜, mission type
                # 510) and the right panel the comment ranking (跟帖榜, 511);
                # both share the same structure, so one loop handles both
                for sideClass, missionType in (('left', 510), ('right', 511)):
                    areaTag = items.select(
                        'div[class="area-half {}"]'.format(sideClass))[i]
                    typeName2 = areaTag.select_one('h2').get_text()
                    liTags = items.select('div[class="title-tab"]')[i].select('li')
                    j = 0
                    for li in liTags:
                        typeName3 = li.get_text()
                        tableTag = areaTag.select('table')[j]
                        for newsTag in tableTag.select('tr'):
                            # Skip the header row
                            if '标题' in newsTag.get_text():
                                continue
                            infoDict = {}
                            infoDict['title'] = newsTag.select_one('a').get_text()
                            infoDict['index'] = int(
                                newsTag.select('td')[0].select_one('span').get_text())
                            infoDict['num'] = int(
                                newsTag.select('td')[1].get_text())
                            infoDict['upOrDown'] = -1
                            infoDict['url'] = newsTag.select_one('a')['href']
                            infoDict['news_type'] = (typeName0 + typeName1 +
                                                     typeName2 + typeName3)
                            infoDict['resource'] = '163'
                            missionBean = MissionBean(response.url, missionType,
                                                      ['train_hotword'])
                            missionBean.info = infoDict
                            self.client.save(missionBean)
                        j = j + 1
                i = i + 1
        if 'api.1sapp' in response.url:
            jsonBean = json.loads(response.text)
            print(jsonBean)
            for i, news in enumerate(jsonBean['data']['data']):
                items = {}
                parseUrl = urlparse(response.url)
                strParseQs = parseUrl[4]  # query string component
                res = parse.parse_qs(strParseQs)
                pageNum = int(res.get('page')[0])
                limitNum = int(res.get('limit')[0])
                # Convert the in-page position to a global ranking index
                items['index'] = i + 1 + (pageNum - 1) * limitNum
                items['title'] = news['title']
                items['news_type'] = '趣头条推荐流'
                items['url'] = news['url']
                items['num'] = 10000 - items['index']
                items['num1'] = int(news['read_count'])
                items['num2'] = int(news['share_count'])
                items['resource'] = '趣头条'
                missionBean = MissionBean(items['url'], 500, ['train_hotword'])
                missionBean.title = items['title']
                missionBean.info = items
                self.client.save(missionBean)
    except Exception:
        traceback.print_exc()
    finally:
        print('Re-queueing this URL at the head of the queue')
        request = Request(url=response.url, dont_filter=True)
        yield request
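# The global index above is derived from the page/limit query parameters; a
# hedged, standalone check of that arithmetic (the URL is hypothetical):
from urllib.parse import parse_qs, urlparse

_qs = parse_qs(urlparse('http://api.1sapp.com/x?page=3&limit=10')[4])
_page, _limit = int(_qs['page'][0]), int(_qs['limit'][0])
# item 0 on page 3 with limit 10 gets global index 21
assert 0 + 1 + (_page - 1) * _limit == 21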