Python MissionBean.info 예제들, zywa_extract_helper.model.missionBean.MissionBean.info Python 예제들

예제 #1

0

파일 보기

파일: rank.py 프로젝트: hedgehogBoby/beyebe-spider-hedgehogBoby

    def parse(self, response):
        """Save every non-ad news entry of the JSON feed, then re-request it.

        The response body is decoded as JSON. Each element of its ``data``
        list carries a JSON-encoded ``content`` string describing one news
        item. Every item is wrapped in a ``MissionBean`` and handed to
        ``self.client.save`` unless its ``tag`` equals ``'ad'``.

        Whether or not parsing succeeds, a new ``Request`` for the same URL
        is yielded, routed through a proxy returned by ``getRandomOneIP()``.
        Nothing is caught here, so a parsing error still propagates once the
        generator is resumed after that request.
        """
        try:
            feed = json.loads(response.text)
            entries = feed['data']
            print('抓取新闻数目', len(entries))
            for entry in entries:
                article = json.loads(entry['content'])
                headline = article['title']
                print(headline)

                bean = MissionBean(response.url, 0, ['train_rank'])
                bean.title = headline
                bean.info = article
                bean.info.update(news_type='头条热点流', resource='头条推荐流')

                if bean.info.get('tag') == 'ad':
                    # Advertisement entries are reported and dropped.
                    print('这是个广告,过滤')
                else:
                    self.client.save(bean)
        finally:
            # Re-queue the same feed URL through a freshly picked proxy.
            proxy_entry = getRandomOneIP()
            proxy_url = 'http://' + proxy_entry['ip'] + ':' + proxy_entry['port']
            yield Request(
                url=response.url,
                headers=self.headers,
                dont_filter=True,
                meta={'proxy': proxy_url},
            )

예제 #2

0

파일 보기

파일: hotwordSpider.py 프로젝트: hedgehogBoby/beyebe-spider-hedgehogBoby

    def parse_item(self, response):
        """Extract the ranked keywords of a list page and save each of them.

        Every ``<tr>`` of the ``list-table`` table that contains a
        ``td.first`` cell is treated as one ranking entry. Its rank, keyword,
        link and attention count are collected into a dict, wrapped in a
        ``MissionBean`` and passed to ``self.client.save``.

        The raw attention count is kept under ``focus_num``; ``num`` holds
        its integer value, or ``-1`` when the cell is missing or not numeric,
        so one malformed row no longer aborts the whole page. A failure while
        building or saving a bean is printed and the loop continues.

        After the loop the number of extracted rows is printed and
        ``self.sleepMyself()`` is called.

        ``response.meta['news_type']`` is read for every extracted row and is
        expected to have been set by the request that led here.
        """
        extracted = 0
        rows = response.xpath('//table[@class="list-table"]/tr')
        for row in rows:
            # Rows without a rank cell are skipped.
            if not row.xpath('.//td[@class="first"]').extract():
                continue

            rank = row.xpath(
                './/td[@class="first"]/span/text()').extract_first()
            title = row.xpath(
                './/td[@class="keyword"]/a/text()').extract_first()
            href = row.xpath(
                './/td[@class="keyword"]/a/@href').extract_first()
            focus_num = row.xpath(
                './/td[@class="last"]/span/text()').extract_first()

            # extract_first() yields None when the cell is absent, and the
            # text may not be numeric; fall back to a sentinel instead of
            # letting the conversion error end the whole page.
            try:
                num = int(focus_num)
            except (TypeError, ValueError):
                num = -1

            items = {
                'index': rank,
                'title': title,
                'news_type': '百度' + response.meta['news_type'],
                'url': href,
                'num': num,
                'focus_num': focus_num,
                'resource': '百度',
            }
            print(items)
            extracted += 1

            try:
                missionBean = MissionBean(href, 500, ['train_hotword'])
                missionBean.title = title
                missionBean.info = items
                self.client.save(missionBean)
            except Exception:
                # Best effort: report the storage failure and keep going.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                print("存储数据库出现异常")
                traceback.print_exc()

        print('本次抓取个数{}'.format(extracted))
        self.sleepMyself()

예제 #3

0

파일 보기

파일: qutoutiaoSpider.py 프로젝트: hedgehogBoby/beyebe-spider-hedgehogBoby

    def parse_item(self, response):
        """Build the article record for one fetched page and store it.

        The metadata dict attached to the originating request
        (``response.request.info``) is enriched with the prettified
        ``div.content`` markup of the page. A ``MissionBean`` carries the URL,
        raw HTML and title. A ``NewsBean`` is filled from the metadata and
        its attribute dict becomes the mission's ``info``. The mission is
        finally handed to ``daoFilterAndSave.MongoFilterSave``.
        """
        meta = response.request.info
        page_html = response.text
        soup = BeautifulSoup(page_html, "html.parser")
        meta['content'] = soup.select_one('div[class="content"]').prettify()

        mission = MissionBean(response.url, 1001, ['qutoutiao'])
        mission.info = meta
        mission.html = page_html
        mission.title = meta['title']

        # Assemble the production bean.
        news = NewsBean()
        news.titleInfo = meta['title']
        news.content = meta['content']
        news.url = response.url
        news.newsId = meta['id']
        news.tags = meta['tag']

        news_type = meta['type']
        news.etc = {'news_type': news_type}
        news.fromChannel = self.TYPE_DICT.get(int(news_type), '其他')
        news.fromSpider = '推荐流'
        news.fromType = 8

        news.goodNum = int(meta['like_num'])
        news.commentNum = int(meta['comment_count'])
        news.readNum = int(meta['read_count'])

        source_name = meta['source_name']
        news.mediaName = source_name
        news.mediaId = source_name

        news.introduction = meta['introduction']
        news.imgUrls = meta['cover']
        news.shareNum = meta['share_count']

        # vars(news) is the bean's live attribute dict, so the two fields set
        # below should also show up through mission.info.
        # NOTE(review): this holds only if MissionBean does not copy the dict
        # on assignment - confirm.
        mission.info = vars(news)

        # publishDate and createTime are passed as timestamps because of the
        # redis format (per the original author's TODO).
        publish_seconds = int(meta['publish_time']) / 1000
        news.publishDate = datetime.datetime.fromtimestamp(
            publish_seconds).timestamp()
        news.createTime = news.createTime.timestamp()

        daoFilterAndSave.MongoFilterSave(mission)

예제 #4

0

파일 보기

파일: fishingSpider.py 프로젝트: hedgehogBoby/beyebe-spider-hedgehogBoby

 def parse_item(self, response):
     """Store the video address and poster image found on a detail page.

     ``self.get_addr`` is run on the decoded body. When it returns an
     empty result the page is dropped and nothing is saved. Otherwise its
     first element becomes ``info['videoUrl']``, the ``src`` of the image
     inside ``div#poster`` becomes ``info['img']``, and a ``MissionBean``
     holding the HTML, title and info is passed to ``self.client.save``.
     """
     info = response.request.info
     html = response.body.decode()

     addresses = self.get_addr(html)
     if len(addresses) == 0:
         # No video address on this page: nothing to store.
         return
     info['videoUrl'] = addresses[0]

     soup = BeautifulSoup(response.text, 'html.parser')
     poster = soup.select_one('div[id="poster"]')
     info['img'] = poster.select_one('img')['src']

     bean = MissionBean(response.url, 3, ['fishing_new'])
     bean.html = html
     bean.title = info['title']
     bean.info = info
     self.client.save(bean)

예제 #5

0

파일 보기

파일: hotwordSpider.py 프로젝트: hedgehogBoby/beyebe-spider-hedgehogBoby

    def parse(self, response):
        """Dispatch on the response URL and save the hot-word entries found.

        Four independent URL checks are made, each handling one source:

        * ``top.baidu``: every category link except the first is followed
          with a ``Request`` whose callback is ``self.parse_item`` and whose
          ``meta['news_type']`` is the link's ``title`` attribute.
        * ``weibo``: the table HTML is pulled out of a JSON payload embedded
          in one of the page's scripts, and each ``tr[action-type="hover"]``
          row is saved as a ``MissionBean`` of type 501.
        * ``news.163``: for each ``titleBar`` section, the tables of the left
          half (type 510) and then the right half (type 511) are walked and
          every non-header row is saved.
        * ``api.1sapp``: each item of the JSON ``data.data`` list is saved
          with an index derived from the ``page`` and ``limit`` query
          parameters of the URL.

        Any ``Exception`` raised while parsing is printed and swallowed. In
        every case a new ``Request`` for the same URL is yielded at the end,
        which re-queues the page.
        """
        try:
            if 'top.baidu' in response.url:
                modes = response.xpath(
                    '//div[@class="hblock"]/ul/li/a/@href').extract()
                # The first link is skipped. XPath positions are 1-based, so
                # the link at list index k is addressed as li[k + 1]; using
                # enumerate keeps that mapping right even when two links
                # share the same href.
                for position, mode in enumerate(modes[1:], start=2):
                    news_type = response.xpath(
                        '//div[@class="hblock"]/ul/li[{}]/a/@title'.format(
                            position)).extract_first()
                    yield Request(url=self.baidu_mainurl + mode[1:],
                                  callback=self.parse_item,
                                  dont_filter=True,
                                  meta={'news_type': news_type},
                                  priority=2)

            if 'weibo' in response.url:
                # Text of every <script> element on the page.
                scripts = response.xpath('//script/text()').extract()
                # Index 8 is the script carrying the "realtimehot" table; the
                # original author found it by counting scripts on the page.
                script = scripts[8]
                start = script.find("(")
                # Cut the JSON argument out of the surrounding call.
                payload = script[start + 1:-1]
                html = json.loads(payload)['html']
                soup = BeautifulSoup(html, 'html.parser')
                trTags = soup.select('tr[action-type="hover"]')
                print("发现潜在词数量", len(trTags))
                for trTag in trTags:
                    dictInfo = {}
                    dictInfo['index'] = trTag.find('em').string
                    nameLink = trTag.find('p', class_='star_name').a
                    dictInfo['title'] = nameLink.string
                    dictInfo['url'] = nameLink.get('href')
                    dictInfo['resource'] = '微博'
                    try:
                        dictInfo['num'] = int(
                            trTag.find('p', class_='star_num').span.string)
                    except (AttributeError, TypeError, ValueError):
                        # Missing or non-numeric count: store a sentinel.
                        dictInfo['num'] = -1
                    missionBean = MissionBean(dictInfo['url'], 501,
                                              ['train_hotword'])
                    missionBean.title = str(dictInfo['title'])
                    # NOTE(review): when the URL contains neither marker,
                    # info keeps whatever MissionBean's constructor set -
                    # confirm that it is a dict.
                    if 'realtimehot' in response.url:
                        missionBean.info = {'news_type': '微博热搜'}
                    if 'socialevent' in response.url:
                        missionBean.info = {'news_type': '微博新时代'}
                    missionBean.info.update(dictInfo)
                    print(missionBean.title)
                    self.client.save(missionBean)

            if 'news.163' in response.url:
                typeName0 = '163'
                soup = BeautifulSoup(response.text, "html.parser")
                items = soup.select_one('div[class="area areabg1"]')
                # Left half is the click ranking, right half the comment
                # ranking; each half is saved under its own mission type.
                halves = (
                    ('div[class="area-half left"]', 510),
                    ('div[class="area-half right"]', 511),
                )
                # The page lays sections out in parallel: the n-th titleBar
                # belongs to the n-th left half and the n-th right half.
                for i, titleBarTag in enumerate(
                        items.select('div[class="titleBar"]')):
                    typeName1 = titleBarTag.select_one("h2").get_text()
                    for halfSelector, missionType in halves:
                        areaTag = items.select(halfSelector)[i]
                        typeName2 = areaTag.select_one("h2").get_text()
                        liTags = items.select(
                            'div[class="title-tab"]')[i].select('li')
                        # The j-th tab label names the j-th table of the half.
                        for j, li in enumerate(liTags):
                            typeName3 = li.get_text()
                            tableTag = areaTag.select('table')[j]
                            for newsTag in tableTag.select("tr"):
                                # Header rows are not scraped.
                                if "标题" in newsTag.get_text():
                                    continue
                                cells = newsTag.select("td")
                                infoDict = {}
                                infoDict['title'] = newsTag.select_one(
                                    'a').get_text()
                                infoDict['index'] = int(
                                    cells[0].select_one("span").get_text())
                                infoDict['num'] = int(cells[1].get_text())
                                infoDict['upOrDown'] = -1
                                infoDict['url'] = newsTag.select_one(
                                    'a')['href']
                                infoDict['news_type'] = (
                                    typeName0 + typeName1 + typeName2
                                    + typeName3)
                                infoDict['resource'] = '163'
                                missionBean = MissionBean(
                                    response.url, missionType,
                                    ['train_hotword'])
                                missionBean.info = infoDict
                                self.client.save(missionBean)

            if 'api.1sapp' in response.url:
                jsonBean = json.loads(response.text)
                print(jsonBean)
                # The paging parameters are the same for every item of the
                # page, so they are parsed once up front.
                query = parse.parse_qs(urlparse(response.url).query)
                pageNum = int(query.get('page')[0])
                limitNum = int(query.get('limit')[0])
                offset = (pageNum - 1) * limitNum
                for i, news in enumerate(jsonBean['data']['data']):
                    items = {}
                    items['index'] = i + 1 + offset
                    items['title'] = news['title']
                    items['news_type'] = '趣头条推荐流'
                    items['url'] = news['url']
                    items['num'] = 10000 - items['index']
                    items['num1'] = int(news['read_count'])
                    items['num2'] = int(news['share_count'])
                    items['resource'] = '趣头条'
                    missionBean = MissionBean(items['url'], 500,
                                              ['train_hotword'])
                    missionBean.title = items['title']
                    missionBean.info = items
                    self.client.save(missionBean)
        except Exception:
            # Best effort: report the failure and still re-queue below.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            traceback.print_exc()
        finally:
            print("正在添加新任务至队列头部")
            request = Request(url=response.url, dont_filter=True)
            yield request