Exemplo n.º 1
0
    def parse(self, url=None):
        """Parse one page of a user's weibo list and yield URLs to crawl next.

        This is a generator. Every post on the page that is not yet stored is
        turned into a ``MicroBlog`` document and saved. Depending on the
        module-level ``fetch_forward`` / ``fetch_comment`` / ``fetch_like``
        switches, the URLs of the post's forward, comment and like lists are
        yielded. When the page does not end the crawl, the URL of the next
        list request is yielded last.

        The crawl ends (and the user's ``newest_mids`` / ``last_update`` are
        persisted) when the page has no posts, when a post id is already in
        ``weibo_user.newest_mids``, or when a post is not newer than
        ``weibo_user.last_update``.

        If the page cannot be opened or its payload cannot be decoded, the
        method prints the error, sleeps ten minutes and stops without
        yielding anything for this page.

        :param url: list URL to fetch; ``self.url`` is used when falsy.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        params = urldecode(url)
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            # Back off ("rest for 10 minutes"), then give up on this page.
            # `br` was never bound, so continuing would only fail with an
            # UnboundLocalError in the check below.
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)
            return

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        # Build the query of the *next* request. The `pagebar` parameter
        # cycles absent -> '0' -> '1' -> absent; on the last step the page
        # number advances and `count` switches from 15 to 50.
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        try:
            data = json.loads(br.response().read())['data']
        except Exception as e:
            # Same back-off as above; `data` is unbound here, so stop.
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)
            return
        soup = beautiful_soup(data)
        finished = False

        # Matches both spellings of an action type, e.g. "feed_list_like"
        # and "fl_like".
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if not mid:
                continue
            max_id = mid

            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
                continue  # already stored: treat the post as crawled
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)

            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            # Replace emoticon images by their title text so that they
            # survive in the plain-text content.
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text

            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a',
                                  attrs={
                                      'class': 'WB_name',
                                      'node-type': 'feed_list_originNick'
                                  })
                text_a = div.find('div',
                                  attrs={
                                      'class': 'WB_text',
                                      'node-type': 'feed_list_reason'
                                  })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (name_a.text, text_a.text)

            # `parse` here is the module-level date parser, not this method.
            mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title'])

            if self.bundle.last_update is None or \
                    mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                    mblog.created <= weibo_user.last_update:
                # Reached posts covered by a previous crawl; the current
                # post is not saved.
                finished = True
                break

            func_div = div.find_all('div', 'WB_func')[-1]

            likes = func_div.find('a',
                                  attrs={
                                      'action-type': action_type_re("like")
                                  }).text
            likes = likes.strip('(').strip(')')
            mblog.n_likes = 0 if len(likes) == 0 else int(likes)

            forwards = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re("forward")
                                     }).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(
                    forwards.strip().split('(', 1)[1].strip(')'))

            comments = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re('comment')
                                     }).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(
                    comments.strip().split('(', 1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" +
                                     map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple(
                    [float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo

            # fetch forwards, comments and likes
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    yield forward_url
                # Only posts with more comments than the configured
                # threshold have their comments fetched.
                if fetch_comment and mblog.n_comments > fetch_n_comments:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    yield comment_url
                if fetch_like and mblog.n_likes > 0:
                    # The like endpoint takes the id as `mid`, not `id`.
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    yield like_url

            mblog.save()

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            # The incoming URL may carry no `max_id`; tolerate that.
            params.pop('max_id', None)

        # counter add one for the processed weibo list url
        self.counter.inc('processed_weibo_list_page', 1)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return

        yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
Exemplo n.º 2
0
    def parse(self, url=None):
        """Parse one page of a user's weibo list and yield URLs to crawl next.

        This is a generator. Every post on the page is stored (created or
        updated) as a ``MicroBlog`` document. For users listed in the
        module-level ``starts``, the URLs of the post's forward, comment and
        like lists are yielded according to ``fetch_forward`` /
        ``fetch_comment`` / ``fetch_like``. When the page does not end the
        crawl, the URL of the next list request is yielded last, once, after
        ``max_id`` has been written into its query.

        The crawl ends when the page has no posts or when a post older than
        the module-level ``effective_start_date`` is met.

        :param url: list URL to fetch; ``self.url`` is used when falsy.
        :raises FetchBannedError: if opening the URL raises ``URLError`` or
            the response is not JSON containing a ``data`` key.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        params = urldecode(url)
        try:
            br = self.opener.browse_open(url)
        except URLError:
            raise FetchBannedError()

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()

        # Build the query of the *next* request. The `pagebar` parameter
        # cycles absent -> '0' -> '1' -> absent; on the last step the page
        # number advances and `count` switches from 15 to 50.
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        try:
            data = json.loads(br.response().read())['data']
        except (ValueError, KeyError):
            raise FetchBannedError('fetch banned by weibo server')
        soup = beautiful_soup(data)
        finished = False

        # Matches both spellings of an action type, e.g. "feed_list_like"
        # and "fl_like".
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
        max_id = None
        for div in divs:
            mid = div['mid']
            if not mid:
                continue
            max_id = mid
            # `parse` here is the module-level date parser, not this method.
            blog_create_date = parse(
                div.select('a.S_link2.WB_time')[0]['title'])
            # skip all following blogs if create date less than effective start date
            if (blog_create_date - effective_start_date).days < 0:
                self.logger.info(
                    "%s: blog has sync up after %s" %
                    (self.uid, effective_start_date.strftime("%Y%m%d")))
                finished = True
                break

            if 'end_id' not in params:
                params['end_id'] = mid
            # Unlike the date cut-off above, `weibo_user.newest_mids` is not
            # used to stop early in this variant.
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog,
                                'objects').get(Q(mid=mid) & Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)

            content_div = div.find('div',
                                   attrs={
                                       'class': 'WB_text',
                                       'node-type': 'feed_list_content'
                                   })
            # Replace emoticon images by their title text so that they
            # survive in the plain-text content.
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text

            is_forward = div.get('isforward')
            if is_forward:
                # write original user, msg
                mblog.omid = div['omid']
                # The value of the first key=value pair in `tbinfo` is
                # stored as the original author's uid.
                tbinfos = div['tbinfo'].split('&')
                mblog.ouid = tbinfos[0].split('=')[1]
                name_a = div.find('a',
                                  attrs={
                                      'class': 'WB_name',
                                      'node-type': 'feed_list_originNick'
                                  })
                text_a = div.find('div',
                                  attrs={
                                      'class': 'WB_text',
                                      'node-type': 'feed_list_reason'
                                  })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (name_a.text, text_a.text)
            mblog.created = blog_create_date
            mblog.last_update = datetime.now()

            func_div = div.find_all('div', 'WB_func')[-1]

            likes = func_div.find('a',
                                  attrs={
                                      'action-type': action_type_re("like")
                                  }).text
            likes = likes.strip('(').strip(')')
            mblog.n_likes = 0 if len(likes) == 0 else int(likes)

            forwards = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re("forward")
                                     }).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(
                    forwards.strip().split('(', 1)[1].strip(')'))

            comments = func_div.find('a',
                                     attrs={
                                         'action-type':
                                         action_type_re('comment')
                                     }).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(
                    comments.strip().split('(', 1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?" +
                                     map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple(
                    [float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo

            # has_video
            div_video = div.find(
                'div', attrs={'node-type': 'fl_h5_video_disp'}) or div.find(
                    'span', attrs={'class': 'icon_playvideo'})
            mblog.has_video = bool(div_video)
            mblog.save()
            self.counter.inc('processed_weibo_posts', 1)

            # fetch forwards, comments and likes
            if self.uid in starts:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    yield forward_url
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    yield comment_url
                if fetch_like and mblog.n_likes > 0:
                    # The like endpoint takes the id as `mid`, not `id`.
                    query = {
                        'mid': mid,
                        '_t': 0,
                        '__rnd': int(time.time() * 1000)
                    }
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    yield like_url

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            params.pop('max_id', None)

        # counter add one for the processed weibo list url
        self.counter.inc('processed_weibo_list_page', 1)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return

        # Emit the next list request exactly once, and only after `max_id`
        # has been set above. It used to be yielded from inside the per-post
        # loop, i.e. once per post, with an incomplete query, and even on
        # pages that ended the crawl.
        yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
Exemplo n.º 3
0
    def parse(self, url=None):
        """Parse one page of a user's weibo list, keeping posts in a time window.

        Every post on the page is stored (created or updated) as a
        ``MicroBlog`` document. The window is read, for each post, from the
        first line of ``timevalue.txt``: two whitespace-separated numbers,
        start and end, compared against the post's local-time epoch rounded
        to whole seconds. A post older than the start aborts the page
        immediately.

        :param url: list URL to fetch; ``self.url`` is used when falsy.
        :returns: a ``(next_urls, [])`` tuple. ``next_urls`` holds the
            forward/comment/like URLs selected by the module-level
            ``fetch_forward`` / ``fetch_comment`` / ``fetch_like`` switches,
            followed by the URL of the next list request. It is empty when
            the bundle does not exist, the check fails, a post predates the
            window, or the crawl has caught up.
        """
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        params = urldecode(url)
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()

        # Build the query of the *next* request. The `pagebar` parameter
        # cycles absent -> '0' -> '1' -> absent; on the last step the page
        # number advances and `count` switches from 15 to 50.
        params['_t'] = 0
        params['__rnd'] = str(int(time.time() * 1000))
        page = int(params.get('page', 1))
        pre_page = int(params.get('pre_page', 0))
        count = 15
        if 'pagebar' not in params:
            params['pagebar'] = '0'
            pre_page += 1
        elif params['pagebar'] == '0':
            params['pagebar'] = '1'
        elif params['pagebar'] == '1':
            del params['pagebar']
            pre_page = page
            page += 1
            count = 50
        params['count'] = count
        params['page'] = page
        params['pre_page'] = pre_page

        data = json.loads(br.response().read())['data']
        soup = beautiful_soup(data)
        finished = False

        # Matches both spellings of an action type, e.g. "feed_list_like"
        # and "fl_like".
        action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t)

        divs = soup.find_all('div', attrs={'class': 'WB_feed_type'},  mid=True)
        max_id = None
        next_urls = []
        for div in divs:
            mid = div['mid']
            if not mid:
                continue
            max_id = mid

            if 'end_id' not in params:
                params['end_id'] = mid
            if mid in weibo_user.newest_mids:
                finished = True
                break
            if len(self.bundle.newest_mids) < 3:
                self.bundle.newest_mids.append(mid)

            try:
                mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid))
            except DoesNotExist:
                mblog = MicroBlog(mid=mid, uid=self.uid)
            content_div = div.find('div', attrs={
                'class': 'WB_text',
                'node-type': 'feed_list_content'
            })
            # Replace emoticon images by their title text so that they
            # survive in the plain-text content.
            for img in content_div.find_all("img", attrs={'type': 'face'}):
                img.replace_with(img['title'])
            mblog.content = content_div.text
            is_forward = div.get('isforward') == '1'
            if is_forward:
                mblog.omid = div['omid']
                name_a = div.find('a', attrs={
                    'class': 'WB_name',
                    'node-type': 'feed_list_originNick'
                })
                text_a = div.find('div', attrs={
                    'class': 'WB_text',
                    'node-type': 'feed_list_reason'
                })
                if name_a is not None and text_a is not None:
                    mblog.forward = '%s: %s' % (
                        name_a.text,
                        text_a.text
                    )

            # `parse` here is the module-level date parser, not this method.
            posted = parse(div.select('a.S_link2.WB_time')[0]['title'])
            # Local-time epoch built from the wall-clock fields only
            # (sub-second part and any tzinfo are dropped).
            posted_ts = time.mktime(datetime(
                posted.year, posted.month, posted.day,
                posted.hour, posted.minute, posted.second).timetuple())
            print(posted_ts)

            # NOTE(review): hard-coded absolute path, re-read for every post.
            with open("D:\\09Limited_buffer\\earlywarningbyci\\cola\\contrib\\weibo\\timevalue.txt", "r") as window_file:
                window_line = window_file.readline()
            bounds = window_line.split()
            start_ts = bounds[0]
            end_ts = bounds[1]
            print(start_ts)
            posted_ts = round(float(posted_ts))
            start_ts = round(float(start_ts))
            end_ts = round(float(end_ts))
            if start_ts <= posted_ts <= end_ts:
                mblog.created = posted
                print("------OKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOK-----")
            elif posted_ts < start_ts:
                # Older than the window: pause briefly and drop the page,
                # including any URLs collected so far.
                print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                time.sleep(5)
                return [], []
            # NOTE(review): a post newer than the window falls through with
            # `mblog.created` left as it was (unset for a new document), and
            # that value is what the comparisons below see - confirm intent.

            if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
                self.bundle.last_update = mblog.created
            if weibo_user.last_update is not None and \
                    mblog.created <= weibo_user.last_update:
                finished = True
                break

            func_div = div.find_all('div', 'WB_func')[-1]

            likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).text
            likes = likes.strip('(').strip(')')
            mblog.n_likes = 0 if len(likes) == 0 else int(likes)
            forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).text
            if '(' not in forwards:
                mblog.n_forwards = 0
            else:
                mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')'))
            comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).text
            if '(' not in comments:
                mblog.n_comments = 0
            else:
                mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')'))

            # fetch geo info
            map_info = div.find("div", attrs={'class': 'map_data'})
            if map_info is not None:
                geo = Geo()
                geo.location = map_info.text.split('-')[0].strip()
                geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo']
                geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)])
                mblog.geo = geo

            # fetch forwards, comments and likes
            if fetch_forward or fetch_comment or fetch_like:
                query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                query_str = urllib.urlencode(query)
                if fetch_forward and mblog.n_forwards > 0:
                    forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str
                    next_urls.append(forward_url)
                if fetch_comment and mblog.n_comments > 0:
                    comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str
                    next_urls.append(comment_url)
                if fetch_like and mblog.n_likes > 0:
                    # The like endpoint takes the id as `mid`, not `id`.
                    query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)}
                    query_str = urllib.urlencode(query)
                    like_url = 'http://weibo.com/aj/like/big?%s' % query_str
                    next_urls.append(like_url)

            mblog.save()

        if 'pagebar' in params:
            params['max_id'] = max_id
        else:
            # The incoming URL may carry no `max_id`; tolerate that.
            params.pop('max_id', None)
        self.logger.debug('parse %s finish' % url)

        # if not has next page
        if len(divs) == 0 or finished:
            weibo_user = self.get_weibo_user()
            for mid in self.bundle.newest_mids:
                if mid not in weibo_user.newest_mids:
                    weibo_user.newest_mids.append(mid)
            while len(weibo_user.newest_mids) > 3:
                weibo_user.newest_mids.pop()
            weibo_user.last_update = self.bundle.last_update
            weibo_user.save()
            return [], []

        next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params)))
        return next_urls, []
Exemplo n.º 4
0
    def save_blog_detail(self, div, mid):
        """Store the post rendered in ``div`` as a ``MicroBlog`` and return it.

        The document for ``(mid, self.uid)`` is loaded if it exists and
        created otherwise. Its content, forward information, creation date,
        like/forward/comment counters, geo information and video flag are
        then overwritten from the markup and the document is saved.

        :param div: parsed feed item element of one post (queried with
            ``find`` / ``find_all`` / ``get`` and item access).
        :param mid: id of the post.
        :returns: the saved ``MicroBlog`` document.
        """
        try:
            mblog = getattr(MicroBlog,
                            'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find('div',
                               attrs={
                                   'class': 'WB_text',
                                   'node-type': 'feed_list_content'
                               })
        blog_create_date = parse(
            div.find('a', attrs={'node-type': 'feed_list_item_date'})['title'])

        # Replace emoticon images by their title text so that they survive
        # in the plain-text content.
        for img in content_div.find_all("img", attrs={'type': 'face'}):
            img.replace_with(img['title'])
        mblog.content = content_div.text

        is_forward = div.get('isforward')
        if is_forward:
            # write original user, msg
            mblog.omid = div['omid']
            # The value of the first key=value pair in `tbinfo` is stored as
            # the original author's uid.
            tbinfos = div['tbinfo'].split('&')
            mblog.ouid = tbinfos[0].split('=')[1]
            name_a = div.find('a',
                              attrs={
                                  'class': 'WB_name',
                                  'node-type': 'feed_list_originNick'
                              })
            text_a = div.find('div',
                              attrs={
                                  'class': 'WB_text',
                                  'node-type': 'feed_list_reason'
                              })
            if name_a is not None and text_a is not None:
                mblog.forward = '%s: %s' % (name_a.text, text_a.text)
        mblog.created = blog_create_date
        mblog.last_update = datetime.now()

        func_div = div.find_all('div',
                                attrs={'node-type': 'feed_list_options'})[-1]

        def read_count(action):
            """Return the number shown on the ``action`` link, or 0."""
            # The action type may be spelled "feed_list_<action>" or
            # "fl_<action>"; the counter is the link's second <em>.
            link = func_div.find(
                'a',
                attrs={
                    'action-type': re.compile("^(feed_list|fl)_%s$" % action)
                })
            raw = link.find_all('em')[1].text
            raw = raw.strip('(').strip(')').replace(',', '')
            # Anything that is not purely digits (e.g. a label instead of a
            # number) counts as zero.
            return int(raw) if raw and raw.isdigit() else 0

        mblog.n_likes = read_count("like")
        mblog.n_forwards = read_count("forward")
        mblog.n_comments = read_count('comment')

        # fetch geo info
        map_info = div.find("div", attrs={'class': 'map_data'})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split('-')[0].strip()
            geo_info = urldecode("?" +
                                 map_info.find('a')['action-data'])['geo']
            geo.longtitude, geo.latitude = tuple(
                [float(itm) for itm in geo_info.split(',', 1)])
            mblog.geo = geo

        # has_video
        div_video = div.find('div', attrs={
            'node-type': 'fl_h5_video_disp'
        }) or div.find('span', attrs={'class': 'icon_playvideo'})
        mblog.has_video = bool(div_video)
        mblog.save()
        return mblog