Python getMatchList示例，tools.Pattern.getMatchList Python示例

示例#1

0

显示文件

    def groups(self):
        cr_url = 'http://weibo.com/p/100505%s/myfollow?pids=Pl_Official_RelationGroupList__96&relate=group' \
                 '&Pl_Official_RelationGroupList__96_page=1#Pl_Official_RelationGroupList__96'
        json_list = []

        comment_url = cr_url % self.uid
        list_data = []
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url, headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace('\\/', '/')
                    break
                except Exception, e:
                #html = ''
                    print "Network Exception!!! ", e
                    continue
            #finally:
            datas = getMatchList(html, '<div class="mod_info">.*?</p>')
                # print len(datas)
                # r_datas = datas.reverse()
            list_data.append(datas)

                # 分页
            next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
                # print next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            else:
                break

示例#2

0

显示文件

    def commentInbox(self):
        cr_url = 'http://weibo.com/comment/inbox?&page=1&pids=Pl_Content_Commentlist'
        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    print html
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    time.sleep(5)
                    continue
            #finally:
            datas = getMatchList(
                html, '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
            # print len(datas)

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, 'page_frame" title="(*)"')
                mid = getMatch(data, '&cid=(*)&')
                timestamp = getMatch(data,
                                     '<div class="WB_from S_txt2">(*)  来自')
                if timestamp:
                    timestamp = long(getTimeStamp(timestamp))
                else:
                    timestamp = 0

                if timestamp <= self.mlasttime:
                    tags = True
                    break

                text = getMatch(data, '<div class="WB_text">(*)</div>')
                if text:
                    text = extractForHTML(text)
                else:
                    text = ''
                r_mid = getMatch(data, 'mid=(*)&')
                r_uid = self.uid
                #commet_type = 'make'
                commet_type = 'receive'

                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'

                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'root_mid': r_mid,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'comment_type': commet_type,
                    'update_time': self.update_time
                }

                wb_json = json.dumps(wb_item)
                json_list.append(wb_json)

            # 分页
            next_pageUrl = getUrlToPattern(html,
                                           comment_url,
                                           pattern='page',
                                           text_pattern='下一页')
            # print next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            elif not next_pageUrl or tags:
                break

示例#3

0

显示文件

    def atMeMicroBlog(self):
        """Page through the ``aj/at/mblog/list`` endpoint and collect feed items.

        Each response is JSON whose ``"data"`` value is an HTML fragment. Every
        feed item matched in that fragment is turned into a dict, serialised
        with ``json.dumps`` and appended to ``json_list``.

        The loop ends when the fragment is empty (ignoring newlines and spaces)
        or after an item with ``timestamp <= self.lasttime`` has been seen.

        Reads ``self._headers``, ``self.uid``, ``self.lasttime``,
        ``self.follow``, ``self.fans`` and ``self.update_time``.

        NOTE(review): ``json_list`` is not returned in the visible code.
        """
        # Request cursor; the three values are substituted into the URL below.
        pre_page = 0
        page = 1
        pagebar = 0
        # max_page = 100
        at_MBurl = 'http://weibo.com/aj/at/mblog/list?ajwvr=6&pre_page=%s&page=%s' \
                   '&filter_by_author=0&filter_by_type=0&is_adv=0&pagebar=%s'
        print at_MBurl
        json_list = []
        tags = False  # set once an item at or before self.lasttime is seen

        while True:
            wbUrl = at_MBurl % (pre_page, page, pagebar)
            print "current url: ", wbUrl
            # Retry the same URL until the response parses as JSON.
            # NOTE(review): there is no delay between retries.
            while True:
                try:
                    request = urllib2.Request(wbUrl, headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    mb_content = json.loads(response.read())
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
            # finally:
            html = mb_content["data"]
            print "html****html****html****html****html****", html
            # Pagination
            print "html_replace***html_replace***html_replace***", html.replace(
                '\n', '').replace(' ', '')
            print len(html.replace('\n', '').replace(' ', ''))
            print tags
            # The stop check runs BEFORE this response is parsed, so once
            # `tags` is set one more request is made before the loop exits.
            if html.replace('\n', '').replace(' ', '') == '' or tags:
                break
                # if page > max_page:
                #     break
            # Otherwise advance the cursor for the NEXT request. The sequence
            # of (pre_page, page, pagebar) is (0,1,0) -> (1,1,0) -> (1,1,1)
            # -> (1,2,0) -> (2,2,0) -> (2,2,1) -> (2,3,0) -> ...
            elif pre_page < page:
                pre_page += 1
            elif pre_page == page and pagebar == 0:
                pagebar = 1
            elif pagebar == 1:
                pre_page = page
                page += 1
                pagebar = 0

            datas = getMatchList(
                html,
                '<div class="WB_face W_fl">(*)<div node-type="feed_list_repeat'
            )

            for data in datas:
                # assumes getMatch returns a string for these lookups — TODO
                # confirm; the concatenation, slicing, .strip() and .replace()
                # below would raise on None.
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)&')
                nickname = getMatch(data, 'nick-name="(*)"')
                mid = getMatch(data, 'pubuser_nick:(*)"')
                # Drops the last three characters of the date attribute —
                # presumably milliseconds to seconds; confirm.
                timestamp = getMatch(
                    data, '<div class="WB_from S_txt2">.*?date="(*)"')[0:-3]
                if timestamp and timestamp.isdigit():
                    timestamp = long(timestamp)
                else:
                    timestamp = 0

                # Stop parsing this page at the first item that is not newer
                # than self.lasttime.
                if timestamp <= self.lasttime:
                    tags = True
                    break

                text = getMatch(data, 'feed_list_content" >(*)</div>').strip()
                if text:
                    text = extractForHTML(text.strip())
                else:
                    text = ''

                # Counters fall back to 0 when the extracted text is not
                # purely digits.
                retweet = getMatch(
                    data,
                    'forward_btn_text">.*?<em>(*)</em>').replace('转发', '')
                if retweet and retweet.isdigit():
                    retweet = long(retweet)
                else:
                    retweet = 0

                comment = getMatch(
                    data,
                    'comment_btn_text">.*?<em>(*)</em>').replace('评论', '')
                if comment and comment.isdigit():
                    comment = long(comment)
                else:
                    comment = 0

                like = getMatch(data, 'UI_ani_praised".*?<em>(*)</em>')
                if like and like.isdigit():
                    like = long(like)
                else:
                    like = 0
                r_mid = getMatch(data, 'rootmid=(*)&')
                r_uid = self.uid

                # Classify the author relative to the crawling account:
                # in self.follow -> 'follow', in self.fans -> 'followed',
                # in both -> 'friend', same uid as self.uid -> 'self',
                # otherwise 'stranger'.
                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'

                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'retweet': retweet,
                    'comment': comment,
                    'like': like,
                    'root_mid': r_mid,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'update_time': self.update_time
                }

                wb_json = json.dumps(wb_item)
                json_list.append(wb_json)

示例#4

0

显示文件

文件： weibo_feedback_like.py 项目： feifanhanmc/xnr2

        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
            except Exception, e:
                print "Network Exception!!! ", e
            finally:
                datas = getMatchList(
                    html,
                    '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
                # print len(datas)

                for data in datas:
                    photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                    uid = getMatch(data, 'usercard="id=(*)"')
                    nickname = getMatch(data, 'usercard=.*?alt="(*)"')
                    mid = getMatch(data, '&cid=(*)&')
                    timestamp = getMatch(
                        data, '<div class="WB_from S_txt2">(*)  来自')
                    if timestamp:
                        timestamp = long(getTimeStamp(timestamp))
                    else:
                        timestamp = 0

示例#5

0

显示文件

                 '&cfs=&Pl_Official_RelationMyfollow__93_page=1#Pl_Official_RelationMyfollow__93'
        json_list = []

        comment_url = cr_url % self.uid
        list_data = []
        while True:
            print "comment_url**comment_url**comment_url**comment_url**", comment_url
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
            except Exception, e:
                print "Network Exception!!! ", e
            finally:
                datas = getMatchList(html,
                                     '<li class="member_li S_bg1".*?</li>')
                # print len(datas)
                # r_datas = datas.reverse()
                list_data.append(datas)

                # 分页
                next_pageUrl = getUrlToPattern(html,
                                               comment_url,
                                               pattern='page',
                                               text_pattern='下一页')
                print "next_pageUrl**next_pageUrl**next_pageUrl**next_pageUrl**", next_pageUrl
                if next_pageUrl:
                    comment_url = next_pageUrl[0]
                else:
                    break

示例#6

0

显示文件

文件： weibo_feedback_private.py 项目： feifanhanmc/xnr2

        comment_url = cr_url
        while True:
            print comment_url
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
            except Exception, e:
                print "Network Exception!!! ", e
                #print 'html:', html

            finally:
                datas = getMatchList(
                    html,
                    '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->'
                )

                for data in datas:
                    photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                    uid = getMatch(data, 'usercard="id=(*)"')
                    nickname = getMatch(data, '<img.*?alt="(*)"')
                    r_uid = self.uid

                    counts = getMatch(
                        data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
                    if counts and counts.isdigit():
                        counts = long(counts)
                    else:
                        counts = 0
                    _type = 'stranger'

示例#7

0

显示文件

        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
            except Exception, e:
                print "Network Exception!!! ", e
            finally:
                datas = getMatchList(
                    html,
                    '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
                # print len(datas)

                for data in datas:
                    photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                    uid = getMatch(data, 'usercard="id=(*)"')
                    nickname = getMatch(data, 'page_frame" title="(*)"')
                    mid = getMatch(data, '&cid=(*)&')
                    timestamp = getMatch(
                        data, '<div class="WB_from S_txt2">(*)  来自')
                    if timestamp:
                        timestamp = long(getTimeStamp(timestamp))
                    else:
                        timestamp = 0

示例#8

0

显示文件

    def messages(self):
        cr_url = 'http://weibo.com/messages?pids=Pl_Content_MessageList&page=1'
        de_url = 'http://weibo.com/aj/message/getbyid?ajwvr=6&count=50&uid=%s&_t=0&__rnd=%d'
        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
                #print 'html:', html

            #finally:
            datas = getMatchList(
                html,
                '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->'
            )

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, '<img.*?alt="(*)"')
                r_uid = self.uid

                counts = getMatch(
                    data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
                if counts and counts.isdigit():
                    counts = long(counts)
                else:
                    counts = 0
                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'
                    while True:
                        try:
                            detailUrl = de_url % (uid, int(time.time() * 1000))
                            #print 'detail_url:', detailUrl
                            request = urllib2.Request(detailUrl)
                            response = urllib2.urlopen(request, timeout=60)
                            ms_content = json.loads(response.read())
                            break
                        except Exception, e:
                            print "Network Exception!!! ", e
                            continue
                #else:
                    html = ms_content["data"]["html"]
                    ms_datas = getMatchList(
                        html,
                        u'(<!-- 单行文字-->|<div class="space">).*?<!--／附件信息-->')
                    # print datas[0]
                    last_time = 0
                    for ms_data in ms_datas:
                        mid_uid = getMatch(ms_data, 'usercard="id=(*)"')
                        mid = getMatch(ms_data, 'mid="(*)"')
                        timestamp = getMatch(
                            ms_data, 'prompt_font S_txt2 S_bg1">(*)</legend>')
                        #soup = BeautifulSoup(ms_data)
                        #timestamp_bs4 = soup.find_all('legend', class_=["prompt_font", "S_txt2", "S_bg1"])
                        if timestamp:
                            timestamp = long(getTimeStamp(timestamp))
                            last_time = timestamp
                        else:
                            timestamp = last_time

                        if timestamp < self.lasttime:
                            tags = True
                            print 'timestamp<lasttime, timestamp, lasttime:', timestamp, self.lasttime
                            #break
                            next

                        text = getMatch(ms_data,
                                        u'<div class="cont">.*?<!--／附件信息-->')
                        if text:
                            text = extractForHTML(text)
                            text = commentExtract(text)

                        if mid_uid == uid:
                            private_type = 'receive'
                        elif mid_uid == r_uid:
                            private_type = 'make'
                        else:
                            private_type = ''

                        wb_item = {
                            'photo_url': photo_url,
                            'uid': uid,
                            'nick_name': nickname,
                            'mid': mid,
                            'timestamp': timestamp,
                            'text': text,
                            'root_uid': r_uid,
                            'weibo_type': _type,
                            'private_type': private_type,
                            'w_new_count': counts,
                            'update_time': self.update_time
                        }

                        wb_json = json.dumps(wb_item)
                        #print 'wb_json:::',wb_json
                        json_list.append(wb_json)

示例#9

0

显示文件

    def follow(self):
        cr_url = 'http://weibo.com/p/100505%s/myfollow?t=1&pids=Pl_Official_RelationMyfollow__93' \
                 '&cfs=&Pl_Official_RelationMyfollow__93_page=1#Pl_Official_RelationMyfollow__93'
        json_list = []

        comment_url = cr_url % self.uid
        list_data = []
        while True:
            print "comment_url**comment_url**comment_url**comment_url**", comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url, headers=self._headers)
                    print 1111111111111111
                    response = urllib2.urlopen(request, timeout=60)
                    print 2222222222222222
                    html = response.read().decode('string_escape').replace('\\/', '/')
                    print 3333333333333333
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
            #finally:
            datas = getMatchList(html, '<li class="member_li S_bg1".*?</li>')
                # print len(datas)
                # r_datas = datas.reverse()
            list_data.append(datas)

                # 分页
            next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
            print "next_pageUrl**next_pageUrl**next_pageUrl**next_pageUrl**",next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            else:
                break

            r_list_data = reversed(list_data)
            for l_datas in r_list_data:
                r_datas = reversed(l_datas)
                for data in r_datas:
                    #print 'data::',data
                    photo_url = getMatch(data, 'profile_image_url=(*)&')
                    uid = getMatch(data, 'usercard="id=(*)"')
                    nickname = getMatch(data, '<img.*?alt="(*)"')
                    timestamp = int(round(time.time()))
                    time.sleep(1)

                    sex = getMatch(data, '&sex=(*)"')
                    if not sex:
                        sex = ''
                    elif sex == 'f':
                        sex = 'female'
                    elif sex == 'm':
                        sex = 'male'

                    follow_source = getMatch(data, 'class="S_link2" >(*)</a>')
                    if not follow_source:
                        follow_source = ''

                    description = getMatch(data, 'W_autocut S_txt2">(*)</div>')
                    if not description:
                        description = ''

                    gid = getMatch(data, '&gid=(*)&')
                    if not gid:
                        gid = '0'
                    gname = getMatch(data, '&gname=(*)&')
                    if not gname:
                        gname = ''

                    r_uid = self.uid
                    _type = 'follow'

                    #获得关注人的详细信息
                    #user = SinaOperateAPI().getUserShow(uid=uid)

                    wb_item = {
                        'photo_url': photo_url,
                        'uid': uid,
                        'mid': uid,
                        'nick_name': nickname,
                        'timestamp': timestamp,
                        'sex': sex,
                        'description': description,
                        'follow_source': follow_source,
                        'gid': gid,
                        'gname': gname,
                        'root_uid': r_uid,
                        'weibo_type': _type,
                        'update_time': self.update_time
                    }
                    if wb_item['mid'] == None:
                        wb_item['mid'] = ''
                    print "follow, mid", wb_item['mid']
                    print "follow, root_uid", wb_item['root_uid']
                    wb_json = json.dumps(wb_item)
                    # print wb_json
                    json_list.append(wb_json)

示例#10

0

显示文件

                # 分页
                if html.replace('\n', '') == '' or tags:
                    break
                # if page > max_page:
                #     break
                elif pre_page < page:
                    pre_page += 1
                elif pre_page == page and pagebar == 0:
                    pagebar = 1
                elif pagebar == 1:
                    pre_page = page
                    page += 1
                    pagebar = 0

                datas = getMatchList(html, '<div class="WB_face W_fl">(*)<div node-type="feed_list_repeat')

                for data in datas:
                    photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                    uid = getMatch(data, 'usercard="id=(*)&')
                    nickname = getMatch(data, 'nick-name="(*)"')
                    mid = getMatch(data, 'pubuser_nick:(*)"')
                    timestamp = getMatch(data, '<div class="WB_from S_txt2">.*?date="(*)"')[0:-3]
                    if timestamp and timestamp.isdigit():
                        timestamp = long(timestamp)
                    else:
                        timestamp = 0
                    
                    if timestamp <= self.lasttime:
                        tags = True
                        break