Example #1
 def parse(self, response):
     item_list = response.xpath("//div[@class='post_item']")
     for item in item_list:
         blogItem = BaiduItem()
         # blogItem['title'] = item.xpath('./div[2]/h3[1]/a/text()')[0].extract().strip()
         # blogItem['titleLink'] = item.xpath('./div[2]/h3[1]/a/@href')[0].extract()
         # blogItem['suggestNum'] = item.xpath('./div[1]/div[1]/span/text()')[0].extract().strip()
         # content = item.xpath('./div[2]/p[1]/text()')
         # for c in content:
         #     if c.extract().strip() != '':
         #         blogItem['content'] = c.extract().strip()
         #     else:
         #         blogItem['content'] = ''
         # blogItem['author'] = item.xpath('./div[2]/div[1]/a/text()')[0].extract().strip()
         # blogItem['publishDate'] = item.xpath('./div[2]/div[1]/text()')[1].extract().strip()
         yield blogItem
     # handle next page
     next = response.xpath(
         "//div[@class='pager']/a[last()]/text()")[0].extract().strip()
     print(next)
     if 'Next' in next:
         nextUrlStr = response.xpath(
             "//div[@class='pager']/a[last()]/@href")[0].extract().strip()
         # the page number is the last character of the href
         nextNum = int(nextUrlStr[-1])
         if nextNum < 5:
             nextUrl = 'https://www.cnblogs.com/#p/{0}'.format(nextNum)
             print(nextUrl)
             yield scrapy.Request(nextUrl, callback=self.parse)
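
Note: every example on this page assumes a BaiduItem class defined in the project's items.py. A minimal sketch of what that class might look like for Example #1 (the field names are taken from the commented-out assignments above and may differ in other projects):

 import scrapy

 class BaiduItem(scrapy.Item):
     # fields referenced by the blog-list spider in Example #1
     title = scrapy.Field()
     titleLink = scrapy.Field()
     suggestNum = scrapy.Field()
     content = scrapy.Field()
     author = scrapy.Field()
     publishDate = scrapy.Field()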
Example #2
 def parse_rel(self, html):
     """解析相关搜索"""
     for row in html.find_all('a'):
         item = BaiduItem()  # create a fresh item for every link
         item['url'] = row['href']
         item['title'] = row.get_text().strip()
         yield item
Example #3
    def parse(self, response):
        print(response.text)

        # drive the browser to Baidu's home page
        self.driver.get("http://www.baidu.com")
        # locate Baidu's search box by its id
        input_element = self.driver.find_element_by_id('kw')
        # type the search keywords
        input_element.send_keys("爬虫")
        # locate Baidu's search button by its id
        click_button = self.driver.find_element_by_id('su')
        # simulate the click
        click_button.click()
        # wait up to 5 seconds for elements to appear
        self.driver.implicitly_wait(5)
        # collect the result entries by class name
        result_list = self.driver.find_elements_by_class_name('result')
        for result in result_list:
            item = BaiduItem()
            print("result title: " + result.text)
            print(
                "==========================================================================="
            )
            item["name"] = result.text
            yield item
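
Example #3 assumes a Selenium WebDriver already stored on the spider as self.driver (the find_element_by_id calls above are the pre-Selenium-4 API). A minimal sketch of how that driver might be created and released; the browser choice and the closed() hook are assumptions, not shown in the example:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from selenium import webdriver
        # any WebDriver works here; Chrome is only an example
        self.driver = webdriver.Chrome()

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; release the browser
        self.driver.quit()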
Example #4
    def parse_lyric(self, response):
        hxs = HtmlXPathSelector(response)
        item = BaiduItem()
        content = hxs.select('//*[@id="lyricCont"]/text()')

        ##lyric
        lyric = ''
        for word in content:
            lyric += word.extract()
        item['lyric'] = lyric

        ## hot: how popular the song is
        hot = hxs.select('//span[@class="num"]/text()').extract()[0]
        item['hot'] = hot

        ## singer
        singer = hxs.select(
            '//span[@class="author_list"]/a/text()').extract()[0]
        patt = r'\s'  # raw string so \s is not treated as an escape sequence
        item['singer'] = re.sub(patt, '', singer)

        ## songName
        songName = hxs.select(
            '//ul[@class="path-list clearfix"]/li[4]/text()').extract()[0]
        item['songName'] = songName
        return item
Example #5
 def parse_item(self, response):
     sel = Selector(response)
     items = []
     item = BaiduItem()
     title = sel.xpath('//span[@class="ask-title "]/text()').extract()
     for i in title:
         items.append(i)
     item['TitleName'] = items
     return item
Example #6
 def parse(self, response):
     filename = response.url.split(".")[-1] + '.txt'
     for h2 in response.xpath('//article[@class="entry-common clearfix"]'):
         item = BaiduItem()
         item['title'] = str.strip(
             h2.xpath('header/h2/a/text()').extract()[0])
         item['url'] = h2.xpath('header/h2/a/@href').extract()[0]
         # item['desc'] =str.strip(h2.xpath('header/div[2]/span/text()').extract()[0])
         yield item
Example #7
 def parse(self, response):
     jsonresp = json.loads(response.body_as_unicode())
     for o in jsonresp['data']:
         if 'thumbURL' in o:
             l = ItemLoader(item=BaiduItem(), response=response)
             #image_url
             l.add_value('image_urls', o['thumbURL'])
             l.add_value('image_type', tag)
             yield l.load_item()
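
The loader in Example #7 fills an image_urls field, which is the default field read by Scrapy's built-in ImagesPipeline (note that the tag value added to image_type is not defined inside the snippet and presumably comes from spider state). If these items are meant to feed that pipeline, the project settings would need roughly the following; the storage path and priority are placeholders, and the pipeline additionally requires Pillow:

 # settings.py (sketch)
 ITEM_PIPELINES = {
     'scrapy.pipelines.images.ImagesPipeline': 1,
 }
 IMAGES_STORE = './images'  # directory where downloaded images are saved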
Example #8
	def get_tab2(self, response):
		items = []
		for rank in response.xpath('./div[@class="grayborder"]/table[@class="list-table"]/tbody/tr[7]/following-sibling::tr'):
			item = BaiduItem()
			item['rank_category'] = 'zongyi'
			item['rank_name'] = rank.xpath('./td[@class="keyword"]/a[@class="list-title"]/text()').extract()
			item['rank_index'] = rank.xpath('./td[@class="last"]/span/text()').extract()
			items.append(item)
		return items
Example #9
    def parse_week(self, browser):
        # collect the results here
        lst = []
        for actived in ['搜索指数','资讯指数']:
            # import ipdb; ipdb.set_trace()
            # act=browser.find_element_by_xpath('//*[text()[contains(.,"搜索指数")]]').text
            
            if actived == '搜索指数':
                browser.find_element_by_xpath('//*[text()[contains(.,"搜索指数")]]').click()
                
            else:
                browser.find_element_by_xpath('//*[text()[contains(.,"资讯指数")]]').click()
                
            
            try:
                items = browser.find_elements_by_css_selector('.tab-content .list > .list-item')
                # import ipdb; ipdb.set_trace()
                data = browser.find_elements_by_css_selector('.date-text')[0].text
                one = browser.find_elements_by_css_selector('.actived')[0].text
                for item in items:
                    # rank
                    rank = item.find_element_by_css_selector('.content .rank').text.strip()
                    # name
                    name = item.find_element_by_css_selector('.content .name').text.strip()
                    # index bar element
                    line = item.find_element_by_css_selector('.content .line-light')
                    # index bar - actual width
                    real_value = float(line.value_of_css_property('width').replace('px', ''))
                    # index bar - parent element holding the maximum width
                    line_max = line.find_element_by_xpath('..')
                    # maximum width
                    max_value = float(line_max.value_of_css_property('width').replace('px', ''))
                    # index value
                    index = round(100 * real_value / max_value, 2)
                    # index as a percentage string
                    index_str = str(index).rstrip('0').rstrip('.') + '%'


                    cc = BaiduItem()
                    cc['one'] = one
                    cc['data'] = data
                    cc['actived'] = actived
                    cc['rank'] = rank
                    cc['name'] = name
                    cc['index_str'] = index_str
                    print(index_str)
                    print('-' * 30)
                    # yield cc
                    lst.append(cc)
                    # import ipdb; ipdb.set_trace()
            except (NoSuchElementException, StaleElementReferenceException):
                # element lookup failed; keep whatever was collected so far
                pass
            # return the collected results
            
        return lst
Example #10
 def parse(self, response):
     sel = Selector(response)
     items = []
     item = BaiduItem()
     title = sel.xpath('//div[@class="question-title"]/a/text()').extract()
     for i in title:
         items.append(i)
     item['TitleName'] = items
     # print(item['TitleName'])
     return item
Example #11
 def parse_week(self, browser, nav_name, someweek, exponent):
     '''
         Search index / news index for one week
     '''
     # collect the results
     # import ipdb; ipdb.set_trace()
     result = []
     try:
         # every row of data on the current page
         items = browser.find_elements_by_css_selector('.tab-content .list > .list-item')
         # iterate over each celebrity entry
         for item in items:
             baidu = BaiduItem()
             # rank
             baidu['rank'] = item.find_element_by_css_selector('.content .rank').text.strip()
             # name
             baidu['name'] = item.find_element_by_css_selector('.content .name').text.strip()
             if nav_name == '周上升榜':
                 # trend
                 # import ipdb; ipdb.set_trace()
                 trend = item.find_element_by_xpath('//div[@class="trend"]/span').text
                 baidu['trend'] = trend
             else:
                 # trend
                 trend = item.find_element_by_xpath('//span[@class="trend-icon"]/i').get_attribute('class')
                 baidu['trend'] = re.sub('icon trend-', '', trend)
             # index bar element
             line = item.find_element_by_css_selector('.content .line-light')
             # index bar - actual width
             real_value = float(line.value_of_css_property('width').replace('px', ''))
             # index bar - parent element holding the maximum width
             line_max = line.find_element_by_xpath('..')
             # maximum width
             max_value = float(line_max.value_of_css_property('width').replace('px', ''))
             # index value
             index = round(100 * real_value / max_value, 2)
             # index as a percentage string
             index_str = str(index).rstrip('0').rstrip('.') + '%'
             # index ratio
             baidu['index_str'] = index_str
             # name of the ranking list
             baidu['nav_name'] = nav_name
             # search index or news index
             baidu['exponent'] = exponent
             # date (week)
             baidu['week'] = someweek
             result.append(baidu)
             print(nav_name, '--->', exponent, '--->', someweek, '--->', index_str)
             print('-' * 30)
             # import ipdb; ipdb.set_trace()
     except (NoSuchElementException, StaleElementReferenceException):
         # element lookup failed; fall through and return what was collected
         pass
     # return the collected results
     return result
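
The percentage string in Examples #9, #11 and #24 is derived from the ratio between the highlighted bar's CSS width and its parent's width. A quick standalone check of that arithmetic (the pixel values below are made up):

 real_value = 150.0   # width of '.line-light' in px
 max_value = 200.0    # width of its parent element in px
 index = round(100 * real_value / max_value, 2)            # 75.0
 index_str = str(index).rstrip('0').rstrip('.') + '%'      # '75%'
 print(index_str)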
Example #12
    def parse(self, response):
        item = BaiduItem()
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.select('.lemmaWgt-lemmaTitle-title h1')[0].text
        subhead = soup.select('.lemmaWgt-lemmaTitle-title h2')
        if len(subhead) != 0:
            # print(subhead[0].text)
            title = title + subhead[0].text
        item['title'] = title

        info_list = soup.select('.lemma-summary div')
        info = ''
        for temp in info_list:
            # collect the plain text
            info += temp.text
            # if the summary contains links, follow them as well
            a_list = temp.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['info'] = info

        properties_list = soup.select('.basicInfo-block dt')
        properties = ''
        for pro in properties_list:
            properties += '###' + pro.text.strip().replace('\n', '')
            # if there are links, follow them as well
            a_list = pro.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['properties'] = properties

        values_list = soup.select('.basicInfo-block dd')
        values = ''
        for val in values_list:
            values += '###' + val.text.strip().replace('\n', '')
            # if there are links, follow them as well
            a_list = val.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['values'] = values

        if len(soup.select('.summary-pic img')) != 0:
            item['img'] = soup.select('.summary-pic img')[0]['src']

        print(item['title'])

        yield item
Example #13
 def parse(self, response):
     # wrap the response in a Selector for data extraction
     sel = Selector(response)
     items = []
     # create the BaiduItem object
     item = BaiduItem()
     title = sel.xpath('//div[@class="question-title"]/a/text()').extract()
     for i in title:
         items.append(i)
     item['TitleName'] = items
     return item
Example #14
 def DownlordImg(self, response):
     img = BaiduItem()
     content = response.xpath("//*[@id='big-pic']/p/a/img")
     if content:
         img['image_urls'] = content.xpath(".//@src").extract()
         yield img
     links = response.xpath("/html/body/div[3]/div[2]/div[9]/ul/li")
     for j in links:
         if j.xpath(".//a/text()").extract_first() == '下一页':
             next_link = 'https://www.aitaotu.com/' + j.xpath(
                 ".//a/@href").extract_first()
             yield scrapy.Request(next_link, callback=self.DownlordImg)
Example #15
    def parse(self, response):

        num = 0
        company = response.meta["company"]
        list_content = response.css("#content_left").xpath("./div")
        for i in list_content:
            url = i.css(".t").xpath("./a/@href").get()
            title = i.css(".t").xpath("./a").xpath("string(.)").get()
            summary = i.css(".c-abstract").xpath("string(.)").get()

            item = BaiduItem()

            if url and title and summary:
                num+=1
                item["url"] = url
                item["title"] = title
                item["summary"] = summary
                yield item
        if num < 10:
            # fewer than 10 valid results; emit an error item for this company
            item1 = BaiduItem()
            item1["error"] = company
            yield item1
Example #16
 def get_tieba(self, response):
     '''
     Crawl every tieba (forum board) listed under the given category.
     '''
     tiebas = response.xpath('//*[@id="ba_list"]/div')
     for tieba in tiebas:
         item = BaiduItem()
         # populate the item field by field
         try:
             try:
                 item['title'] = response.meta['title']
             except KeyError:
                 item['title'] = 'title invalid!'
             try:
                 item['tag'] = response.meta['tag']
             except KeyError:
                 item['tag'] = 'tag invalid!'
             item['name'] = tieba.xpath(
                 'a/div/p[@class="ba_name"]/text()').extract()[0]
             item['img'] = tieba.xpath('a/img/@src').extract()[0]
             item['user_num'] = tieba.xpath(
                 'a/div/p[@class="ba_num clearfix"]/span[@class="ba_m_num"]/text()'
             ).extract()[0]
             item['topic_num'] = tieba.xpath(
                 'a/div/p[@class="ba_num clearfix"]/span[@class="ba_p_num"]/text()'
             ).extract()[0]
             try:
                 item['description'] = tieba.xpath(
                     'a/div/p[@class="ba_desc"]/text()').extract()[0]
             except KeyError:
                 item['description'] = 'not find description!'
             except IndexError:
                 item['description'] = 'not find description!'
             item['url'] = 'http://tieba.baidu.com' + tieba.xpath(
                 'a/@href').extract()[0]
         except Exception as e:
             print('------------------------')
             print(e)
         yield item
     next_url = response.xpath('//*[@class="next"]/@href').extract()
     if next_url:
         # if a next page exists, follow it
         url = 'http://tieba.baidu.com/' + next_url[0]
         # print(url)
         yield Request(url,
                       callback=self.get_tieba,
                       meta={
                           'title': response.meta['title'],
                           'tag': response.meta['tag']
                       })
Example #17
 def parse(self, response):
     url = response.url
     title = response.xpath(
         "//*[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract()[0]
     new_urls = response.xpath(
         "//*[starts-with(@href,'/item/')]/@href").extract()
     summary = response.xpath("//*[@class='lemma-summary']/div")
     summary = summary.xpath('string(.)').extract()[0]
     item = BaiduItem(url=url, title=title, summary=summary)
     yield item
     for url in new_urls:
         self.x += 1
         if self.x > 100:
             break
         self.url = url
         yield Request(response.urljoin(url=url), callback=self.parse)
Example #18
 def parse(self, response):
     div_list = response.xpath('//div[@class="t_con cleafix"]/div[2]/div')
     for div in div_list:
         item = BaiduItem()
         item["author"] = div.xpath('./div[2]/span/@title').extract_first()
         item["title"] = div.xpath("./div/a/text()").extract_first()
         item["href"] = div.xpath("./div/a/@href").extract_first()
         item["time"] = div.xpath("./div[2]/span[2]/text()").extract_first()
         if item["author"] and item["title"] and item["href"]:
             item["href"] = "http://tieba.baidu.com" + item['href']
             yield scrapy.Request(item["href"],
                                  callback=self.parse_detail,
                                  meta=item)
     next_url = response.xpath(
         '//a[@class="next pagination-item "]/@href').extract_first()
     if next_url is not None:
         next_url = response.urljoin(next_url)
         yield scrapy.Request(next_url, callback=self.parse)
Example #19
    def parse(self, response):
        sel = Selector(response)
        sel_0 = sel.xpath('/html/body/div[@id="body"]')
        sel_1 = sel_0.xpath('.//ul/li')
        base_url = get_base_url(response)
        no = 0
        for site in sel_1:
            no += 1
            title = site.xpath('a/text()').extract()
            if (len(title) != 0 and len(''.join(title)) > 4):
                link = site.xpath('a/@href').extract()

                url_m = (str(link))[3:-2]  # crude slice to pull the href text out of str(list)
                url_new = urljoin(base_url, url_m)
                item = BaiduItem(title=title,
                                 link=link,
                                 manufacturer='baiduguonei')

                yield scrapy.Request(url_new,
                                     callback=self.parse_body,
                                     meta={'item': item})
Example #20
    def parse_search(self, response):
        import hashlib
        params = response.meta.get('params')
        keywords = response.meta.get('keywords')
        timestamp = json.loads(response.body_as_unicode())['timestamp']
        json_res = json.loads(response.body_as_unicode())['data']
        for news in json_res.get('list'):
            item = BaiduLoader(item=BaiduItem())
            # only keep news published in the last 7 days
            if timestamp - news.get('publicTime') > 7*24*3600:
                return None
            item.add_value('post_time',
                           time.strftime("%Y-%m-%dT%H:%M:%S+08:00", time.localtime(float(news.get('publicTime')))))
            item.add_value('search_word', params.get('word'))
            item.add_value('author', news.get('author'))
            item.add_value('title', news.get('title'))
            item.add_value('link', news.get('url'))
            item.add_value('hash_id', hashlib.md5(news.get('url').encode('utf8')).hexdigest())
            item.add_value('cover_urls', news.get('imgUrl'))  # cover image url
            intro = item.load_item()
            new_url = 'https://news.baidu.com/news?tn=bdapiinstantfulltext&src=' + news.get('url')

            if keywords is None:
                print('error')
                print(news.get('url'))
            yield Request(url=new_url,
                          meta={'intro': intro,
                                'enter_url': news.get('url'),
                                'keywords': response.meta.get('keywords').split(','),
                                },
                          callback=self.parse)
        total = json_res.get('total')
        is_continue = len(json_res.get('list')) == params['rn'] and params['pn'] < total
        if is_continue:
            params['pn'] = params['pn'] + params['rn']
            url = "http://m.news.baidu.com/news?tn=bdapinewsearch&" + urllib.parse.urlencode(params)
            yield scrapy.Request(url=url,
                                 meta={'params': params, 'keywords': keywords},
                                 callback=self.parse_search)
Example #21
 def parse(self, response):
     try:
         print("进入try..........................")
         item = BaiduItem()
         item['comment_num'] = re.findall(
             r'<span class="red" style="margin-right:3px">(.+?)</span>',
             response.text)
         item['read_num'] = ''
         item['up_num'] = ''
         item['zhuanfa'] = ''
         item['url'] = response.meta.get('url')
         item["MonitorName"] = response.meta.get('MonitorName')
         if item['comment_num'] == []:
             item['comment_num'] = ''
             print("---------------------------------")
         else:
             item['comment_num'] = item['comment_num'][0]
             print(item["url"], item["MonitorName"], item['comment_num'],
                   "====================")
     except Exception as exc:
         print("entering except ...", exc)
         return
     yield item
Example #22
    def parse_phone(self, response):
        item = BaiduItem()
        sort = 0
        soup = bs(response.body, 'lxml')
        urlarr = self.get_url_query(response.url)
        search = urlarr['wd'] if 'wd' in urlarr.keys() else urlarr['word']
        if soup.find('div', class_='rw-list') is not None:
            rel = self.parse_rel(soup.find('div', class_='rw-list'))
        else:
            rel = None

        if soup.find('div', class_='hint-toprq-tips') is not None:
            rec = self.parse_rel(soup.find('div', class_='hint-toprq-tips'))
        else:
            rec = None

        if soup.find('div', id='page-controller').find(
                'a', class_='new-nextpage') is not None:
            next_page_url = soup.find('div', id='page-controller').find(
                'a', class_='new-nextpage')['href']
        else:
            next_page_url = soup.find('div', id='page-controller').find(
                'a', class_='new-nextpage-only')['href']
        if not re.search(r'ms=1', next_page_url):
            next_page_url = next_page_url + '&ms=1'
        connects = soup.find('div', id='results').find_all('div',
                                                           class_='c-result')
        for index, connect in enumerate(connects):
            if (self.page_num == 1):
                url = 'https://m.baidu.com/su?pre=1&p=3&json=1&wd=%s&sugmode=2&_=1493098255100' % search
                recommendList = self.get_recommend(url)
                recLen = len(recommendList)
                if recLen - 1 >= index:
                    item['recom_search'] = recommendList[index]
                else:
                    item['recom_search'] = ''
            else:
                item['recom_search'] = ''
            sort += 1
            tag_a = connect.find('div', class_='c-container').find('a')
            item['title'] = tag_a.get_text()
            if rel is not None:
                try:
                    data = next(rel)
                    item['rel_url'] = response.url + data['url']
                    item['rel_tit'] = data['title']
                except StopIteration:
                    item['rel_url'] = ''
                    item['rel_tit'] = ''
            else:
                item['rel_url'] = ''
                item['rel_tit'] = ''
            if rec is not None:
                try:
                    data = next(rec)
                    item['rec_url'] = self.parse_url(response.url +
                                                     data['url'])
                    item['rec_tit'] = data['title']
                except StopIteration:
                    item['rec_url'] = ''
                    item['rec_tit'] = ''
            else:
                item['rec_url'] = ''
                item['rec_tit'] = ''
            if connect['data-log'] is not None:
                data_log = json.loads(connect['data-log'].replace("'", '"'))
                item['sort'] = data_log['order']
                if data_log['mu'] != '':
                    item['url'] = data_log['mu']
                else:
                    item['url'] = self.parse_url(tag_a['href'])
            else:
                item['url'] = self.parse_url(tag_a['href'])
            if connect.find('span', class_='c-gray') is not None:
                item['time'] = connect.find('span', class_='c-gray').get_text()
            else:
                item['time'] = ''
            item['page'] = '第%s页' % self.page_num

            # self.write_log(tag_a.get_text())
            yield item
        if next_page_url != '' and self.page_num < self.total_page:
            self.page_num += 1
            yield scrapy.Request(url=next_page_url, callback=self.parse_phone)
Example #23
    def parse_pc(self, response):
        """提取相关数据"""
        sort = 0
        item = BaiduItem()
        soup = bs(response.body, 'lxml')
        urlarr = self.get_url_query(response.url)
        search = urlarr['wd'] if 'wd' in urlarr else urlarr['word']
        connects = soup.find_all(attrs={"class": "c-container"})
        if soup.find('span', 'hint_toprq_tips_items') is not None:
            rec = self.parse_rec(soup.find('span', 'hint_toprq_tips_items'))
        else:
            rec = None
        if soup.find('div', id='rs') is not None and sort == 0:
            rel = self.parse_rel(soup.find('div', id='rs'))
        else:
            rel = None
        for index, connect in enumerate(connects):
            if (self.page_num == 1):
                url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=%s&cb=&json=1' % search
                recommendList = self.get_recommend(url)
                recLen = len(recommendList)
                if recLen - 1 >= index:
                    item['recom_search'] = recommendList[index]
                else:
                    item['recom_search'] = ''
            else:
                item['recom_search'] = ''
            sort += 1
            if connect.find('h3', 't') is None:
                item['title'] = connect.find('a').get_text()
                item['url'] = connect.find('a')['href']
            else:
                item['title'] = connect.find('h3').get_text().strip()
                item['url'] = self.parse_url(connect.find('h3').a['href'])
            item['page'] = '第%s页' % self.page_num
            item['sort'] = sort
            if connect.find('span', ' newTimeFactor_before_abs m') is not None:
                item['time'] = connect.find(
                    'span', ' newTimeFactor_before_abs m').get_text().replace(
                        u'\xa0-\xa0', '')
            else:
                item['time'] = ''

            if rec is not None:
                try:
                    data = next(rec)
                    item['rec_url'] = self.parse_url(response.url +
                                                     data['url'])
                    item['rec_tit'] = data['title']
                except StopIteration:
                    item['rec_url'] = ''
                    item['rec_tit'] = ''
            else:
                item['rec_url'] = ''
                item['rec_tit'] = ''

            if rel is not None:
                try:
                    data = next(rel)
                    item['rel_url'] = response.url + data['url']
                    item['rel_tit'] = data['title']
                except StopIteration:
                    item['rel_url'] = ''
                    item['rel_tit'] = ''
            else:
                item['rel_url'] = ''
                item['rel_tit'] = ''
            yield item

        if self.page_num < self.total_page:
            self.page_num += 1
            next_url = response.urljoin(
                soup.find('div',
                          id='page').find('strong').next_sibling['href'])
            yield scrapy.Request(next_url, callback=self.parse_pc)
Example #24
    def parse_week(self, browser, ban, index_t):
        # collect the results here
        data = []
        try:
            # which week
            zhou = browser.find_element_by_css_selector(
                '.date-text').text.strip()
            items = browser.find_elements_by_css_selector(
                '.tab-content .list > .list-item')
            for item in items:
                # rank
                rank = item.find_element_by_css_selector(
                    '.content .rank').text.strip()
                # name
                name = item.find_element_by_css_selector(
                    '.content .name').text.strip()
                # check whether this is the weekly climbing list
                if ban.text == '周上升榜':
                    # positions climbed
                    index_str = item.find_element_by_css_selector(
                        '.value.upvalue').text.strip()
                else:
                    # index bar element
                    line = item.find_element_by_css_selector(
                        '.content .line-light')
                    # index bar - actual width
                    real_value = float(
                        line.value_of_css_property('width').replace('px', ''))
                    # index bar - parent element holding the maximum width
                    line_max = line.find_element_by_xpath('..')
                    # maximum width
                    max_value = float(
                        line_max.value_of_css_property('width').replace(
                            'px', ''))
                    # index value
                    index = round(100 * real_value / max_value, 2)
                    # index as a percentage string
                    index_str = str(index).rstrip('0').rstrip('.') + '%'

                trend = None
                if len(item.find_elements_by_css_selector(
                        '.icon.trend-fair')) > 0:
                    trend = 'fair'
                elif len(item.find_elements_by_css_selector(
                        '.icon.trend-down')) > 0:
                    trend = 'down'
                else:
                    trend = 'up'

                print('rank: {0}, name: {1}, {2}: {3}, {4}: {5}'.format(
                    rank, name, ban.text, zhou, index_t.text, index_str))
                print('-' * 30)
                db_item = BaiduItem()
                db_item['rank'] = rank
                db_item['name'] = name
                db_item['index'] = index_str
                db_item['trend'] = trend
                db_item['tab_item'] = ban.text
                db_item['index_type'] = index_t.text
                db_item['zhou'] = zhou
                data.append(db_item)
        except (NoSuchElementException, StaleElementReferenceException):
            # element lookup failed; return whatever was collected
            pass
        # return the collected results
        return data
Example #25
    def parse(self, response, typeid, category):

        print "---> begin execute parse method"

        item = BaiduItem()
        items = []
        #        print response.body

        print "---> tags: " + typeid

        data = response.body
        # print data

        print "---> category: " + category

        s = json.loads(data)

        #        print s

        MyData = s["data"]

        #        print "Data++++++++"

        #        print data

        num = len(MyData)

        for i in xrange(0, num - 1):

            #            print MyData[i]

            print '\n--------------------------------------------------------------------------%d---------------------------------------------------\n' % (
                i)

            print '---> now is : ' + str(
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(
                    time.time())))

            # print MyData[i]["hoverURL"]

            width = MyData[i]["width"]
            height = MyData[i]["height"]

            size = str(width) + "*" + str(height)

            print '---> title: ' + MyData[i]["fromPageTitleEnc"]

            # gif address used for test downloads
            # image = str(MyData[i]["middleURL"])
            # print "image------->" + image

            simid = MyData[i]["os"]
            Hash = simid.split(",")[0]

            #            now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
            now = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

            item['imgsize'] = size
            item['imgid'] = Hash
            item['category'] = category
            item['tag'] = typeid
            item['updateTime'] = now
            item['scrawl_time'] = now
            item['title'] = MyData[i]["fromPageTitleEnc"]
            item['fromURLHost'] = MyData[i]["fromURLHost"]
            item['author'] = MyData[i]["fromURLHost"]

            ObjURL = MyData[i]["objURL"]

            fromURL = MyData[i]["fromURL"]

            e = {
                "w": "a",
                "k": "b",
                "v": "c",
                "1": "d",
                "j": "e",
                "u": "f",
                "2": "g",
                "i": "h",
                "t": "i",
                "3": "j",
                "h": "k",
                "s": "l",
                "4": "m",
                "g": "n",
                "5": "o",
                "r": "p",
                "q": "q",
                "6": "r",
                "f": "s",
                "p": "t",
                "7": "u",
                "e": "v",
                "o": "w",
                "8": "1",
                "d": "2",
                "n": "3",
                "9": "4",
                "c": "5",
                "m": "6",
                "0": "7",
                "b": "8",
                "l": "9",
                "a": "0",
                "_z2C$q": ":",
                "_z&e3B": ".",
                "AzdH3F": "/"
            }

            # decode objURL by character substitution instead of a regex  TODO: use a regex

            print ObjURL

            ekeys = e.keys()

            #            print ekeys

            strLength = len(ekeys)

            URL_O = ObjURL
            URL_O = URL_O.replace("_z2C$q", ":")
            URL_O = URL_O.replace("_z&e3B", ".")
            URL_O = URL_O.replace("AzdH3F", "/")
            URLLength = len(URL_O)
            print URL_O

            URL_F = fromURL
            URL_F = URL_F.replace("_z2C$q", ":")
            URL_F = URL_F.replace("_z&e3B", ".")
            URL_F = URL_F.replace("AzdH3F", "/")
            URL_FLength = len(URL_F)
            s_f = ""

            # decode fromURL  TODO
            for j in xrange(0, URL_FLength):
                URLKey = URL_F[j]
                url = ord(URLKey)
                if (url >= ord('a')
                        and url <= ord('w')) or (url >= ord('0')
                                                 and url <= ord('9')):
                    str_url = e[str(URLKey)]
                    s_f = s_f + str_url
                else:
                    s_f = s_f + URLKey
            print s_f

            s = ""
            for j in xrange(0, URLLength):
                URLKey = URL_O[j]
                url = ord(URLKey)
                if (url >= ord('a')
                        and url <= ord('w')) or (url >= ord('0')
                                                 and url <= ord('9')):
                    str_url = e[str(URLKey)]
                    s = s + str_url
                else:
                    s = s + URLKey

            hash_url = hashlib.md5(s).hexdigest()[8:-8]

            item['linkmd5id'] = hash_url

            print "---> hash_url: " + hash_url
            print "---> url_orgin: " + s

            # local path for the downloaded images
            # path = "/Users/chenxingwang/Desktop/"+ category +"/"+hash_url[:2]+"/"+hash_url
            uploadUrl = "/" + category + "/" + hash_url[:2] + "/" + hash_url
            path = "../output/gif" + uploadUrl

            print "---> folder for saving image: " + path

            isExists = os.path.exists(path)

            if not isExists:
                os.makedirs(path)

            origin_filename = "origin.gif"
            static_filename = "static.jpg"
            thumb_filename = "thumb.gif"
            detail_filename = "detail.gif"

            url_orgin = s
            url_thumb = MyData[i]["thumbURL"] + ".gif"
            url_hover = MyData[i]["hoverURL"]
            url_middle = MyData[i]["middleURL"] + ".gif"
            item['fromURL'] = s_f
            item['objURL'] = uploadUrl + "/" + origin_filename
            item['hoverURL'] = uploadUrl + "/" + static_filename
            item['thumbURL'] = uploadUrl + "/" + thumb_filename
            item['middleURL'] = uploadUrl + "/" + detail_filename
            item['filesize'] = 0
            item['frame'] = 0
            yield item
            sleep(1)
            # items.append(item)

            # STEP 1. download the thumbnail
            print "---> STEP 1. begin download thumb image: " + url_thumb
            req = urllib2.Request(
                url_thumb,
                headers={
                    "Upgrade-Insecure-Requests": "1",
                    "X-DevTools-Emulate-Network-Conditions-Client-Id":
                    "7A55439C-E6CF-420D-B294-7635B17E648B",
                    "User-Agent":
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
                    "Accept":
                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                    "Accept-Encoding": "gzip, deflate, sdch",
                    "Accept-Language": "zh-CN,zh;q=0.8"
                })

            try:
                # print "thumb image to download ----------->" + url_thumb

                img_path = path + "/" + thumb_filename
                imgData = urllib2.urlopen(req).read()
                f = file(img_path, "wb")
                f.write(imgData)
                f.close()

                # generate static.jpg
                Image.open(img_path).convert('RGB').save(path + "/" +
                                                         static_filename)

                print('------> thumb image downloaded. saved to ' + img_path)

            except IOError as err:
                print("------> IO error:{0}".format(err))
            except:
                print '------> an error occurred while downloading the thumb image'
                print("------> Unexpected error:", sys.exc_info())

            file_name = path + "/" + origin_filename

            # STEP 2. download the original image
            print "---> STEP 2. begin download origin image: " + url_orgin
            try:
                # 2 / 0
                urllib.urlretrieve(url_orgin, '%s' % (file_name))
                print('------> origin image downloaded. saved to ' + file_name)
            except IOError as err:
                print("------> IO error:{0}".format(err))
            except:
                print("------> Unexpected error:", sys.exc_info())
                print '------> an error occurred while downloading the origin image, skipping this item'
                item[
                    'customer_exceptions'] = 'download origin image occurred some error'
                yield item

            # STEP 3. generate the detail image (detail.gif)
            print "---> STEP 3. begin generate detail image"
            try:
                # extract fileinfo of origin.gif
                im = Image.open(file_name)
                # in KB
                origin_size = os.stat(file_name).st_size / 1024
                item['filesize'] = str(origin_size)

                origin_frame_count = 1
                try:
                    while 1:
                        im.seek(im.tell() + 1)
                        origin_frame_count = origin_frame_count + 1
                except EOFError:
                    pass  # end of sequence

                item['frame'] = str(origin_frame_count)

                print "------> origin image info : size-" + str(
                    origin_size) + "KB, frames-" + str(origin_frame_count)

                # generate detail.gif
                origin_size_threshold = 1.5
                if origin_size > origin_size_threshold * 1024:
                    print "------> origin image is bigger than " + str(
                        origin_size_threshold) + "M"
                    im = Image.open(file_name)
                    tmp_path = path + "/temp/"
                    if not os.path.exists(tmp_path):
                        os.makedirs(tmp_path)

                    print '------> origin file info: ' + str(im.info)

                    if 'duration' in im.info.keys():
                        origin_duration = im.info['duration'] / 1000.00
                    else:
                        origin_duration = 0

                    temp_filenames = []
                    # index = 1

                    reader = imageio.get_reader(file_name)
                    for i, tmp_im in enumerate(reader):
                        imageio.imwrite("%sframe%d.png" % (tmp_path, i),
                                        tmp_im)
                        temp_filenames.append("%sframe%d.png" % (tmp_path, i))

                    # for frame in ImageSequence.Iterator(im):
                    #     frame.save("%sframe%d.png" % (tmp_path, index))
                    #     temp_filenames.append("%sframe%d.png" % (tmp_path, index))
                    #     index += 1

                    # print temp_filenames

                    with imageio.get_writer(
                            path + "/" + detail_filename,
                            mode='I',
                            duration=origin_duration) as writer:
                        for temp_filename in temp_filenames:
                            tmp_im = Image.open(temp_filename)
                            tmp_im.thumbnail((230, 230))
                            tmp_im.save(temp_filename)
                            image = imageio.imread(temp_filename)
                            writer.append_data(image)

                    shutil.rmtree(tmp_path)
                    print '------> end: generate detail.gif'
                else:
                    print "------> copy origin.gif as detail.gif"
                    shutil.copyfile(file_name, path + "/" + detail_filename)

                # a = 2 / 0

            except IOError as err:
                print("------> IO error:{0}".format(err))
            except:
                print("------> Unexpected error:", sys.exc_info())
                print '------> an error occurred while generating detail.gif'

            print "---> finished data parse : " + hash_url

            yield item

            sleep(1)
Example #26
    def parse(self, response, typeid, category):

        print "---> begin execute parse method"

        item = BaiduItem()
        items = []
        #        print response.body

        print "---> tags: " + typeid

        data = response.body
        # print data

        print "---> category: " + category

        s = json.loads(data)

        #        print s

        MyData = s["data"]

        #        print "Data++++++++"

        #        print data

        num = len(MyData)

        for i in xrange(0, num - 1):

            #            print MyData[i]

            print '\n--------------------------------------------------------------------------%d---------------------------------------------------\n' % (
                i)

            print '---> now is : ' + str(
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(
                    time.time())))

            # print MyData[i]["hoverURL"]

            width = MyData[i]["width"]
            height = MyData[i]["height"]

            size = str(width) + "*" + str(height)

            print '---> title: ' + MyData[i]["fromPageTitleEnc"]

            # gif address used for test downloads
            # image = str(MyData[i]["middleURL"])
            # print "image------->" + image

            simid = MyData[i]["os"]
            Hash = simid.split(",")[0]

            #            now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
            now = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

            item['imgsize'] = size
            item['imgid'] = Hash
            item['category'] = category
            item['tag'] = typeid
            item['updateTime'] = now
            item['scrawl_time'] = now
            item['title'] = MyData[i]["fromPageTitleEnc"]
            item['fromURLHost'] = MyData[i]["fromURLHost"]
            item['author'] = MyData[i]["fromURLHost"]

            ObjURL = MyData[i]["objURL"]

            fromURL = MyData[i]["fromURL"]

            e = {
                "w": "a",
                "k": "b",
                "v": "c",
                "1": "d",
                "j": "e",
                "u": "f",
                "2": "g",
                "i": "h",
                "t": "i",
                "3": "j",
                "h": "k",
                "s": "l",
                "4": "m",
                "g": "n",
                "5": "o",
                "r": "p",
                "q": "q",
                "6": "r",
                "f": "s",
                "p": "t",
                "7": "u",
                "e": "v",
                "o": "w",
                "8": "1",
                "d": "2",
                "n": "3",
                "9": "4",
                "c": "5",
                "m": "6",
                "0": "7",
                "b": "8",
                "l": "9",
                "a": "0",
                "_z2C$q": ":",
                "_z&e3B": ".",
                "AzdH3F": "/"
            }

            # decode objURL by character substitution instead of a regex  TODO: use a regex

            print ObjURL

            ekeys = e.keys()

            #            print ekeys

            strLength = len(ekeys)

            URL_O = ObjURL
            URL_O = URL_O.replace("_z2C$q", ":")
            URL_O = URL_O.replace("_z&e3B", ".")
            URL_O = URL_O.replace("AzdH3F", "/")
            URLLength = len(URL_O)
            print URL_O

            URL_F = fromURL
            URL_F = URL_F.replace("_z2C$q", ":")
            URL_F = URL_F.replace("_z&e3B", ".")
            URL_F = URL_F.replace("AzdH3F", "/")
            URL_FLength = len(URL_F)
            s_f = ""

            # decode fromURL  TODO
            for j in xrange(0, URL_FLength):
                URLKey = URL_F[j]
                url = ord(URLKey)
                if (url >= ord('a')
                        and url <= ord('w')) or (url >= ord('0')
                                                 and url <= ord('9')):
                    str_url = e[str(URLKey)]
                    s_f = s_f + str_url
                else:
                    s_f = s_f + URLKey
            print s_f

            s = ""
            for j in xrange(0, URLLength):
                URLKey = URL_O[j]
                url = ord(URLKey)
                if (url >= ord('a')
                        and url <= ord('w')) or (url >= ord('0')
                                                 and url <= ord('9')):
                    str_url = e[str(URLKey)]
                    s = s + str_url
                else:
                    s = s + URLKey

            hash_url = hashlib.md5(s).hexdigest()[8:-8]

            item['linkmd5id'] = hash_url

            print "---> hash_url: " + hash_url
            print "---> url_orgin: " + s

            # local path for the downloaded images
            # path = "/Users/chenxingwang/Desktop/"+ category +"/"+hash_url[:2]+"/"+hash_url
            uploadUrl = "/" + category + "/" + hash_url[:2] + "/" + hash_url
            path = "../output/gif" + uploadUrl

            print "---> folder for saving image: " + path

            # isExists=os.path.exists(path)

            # if not isExists:
            #     os.makedirs(path)

            origin_filename = "origin.gif"
            static_filename = "static.jpg"
            thumb_filename = "thumb.gif"
            detail_filename = "detail.gif"

            url_orgin = s
            url_thumb = MyData[i]["thumbURL"] + ".gif"
            url_hover = MyData[i]["hoverURL"]
            url_middle = MyData[i]["middleURL"] + ".gif"
            item['fromURL'] = s_f
            item['objURL'] = uploadUrl + "/" + origin_filename
            item['hoverURL'] = uploadUrl + "/" + static_filename
            item['thumbURL'] = uploadUrl + "/" + thumb_filename
            item['middleURL'] = uploadUrl + "/" + detail_filename
            item['filesize'] = 0
            item['frame'] = 0
            item['source_thumb_url'] = url_thumb
            item['source_original_url'] = s
            sleep(1)
            yield item