Python getUrlToPattern示例

编程语言: Python

命名空间/包名称: tools.URLTools

方法/功能: getUrlToPattern

hotexamples.com的示例: 5

Python getUrlToPattern - 共找到 5 个示例。以下是从开源项目中提取的、评价最高的 tools.URLTools.getUrlToPattern 真实 Python 用例。您可以为示例评分，帮助我们提高示例质量。

示例#1

显示文件

    def groups(self):
        cr_url = 'http://weibo.com/p/100505%s/myfollow?pids=Pl_Official_RelationGroupList__96&relate=group' \
                 '&Pl_Official_RelationGroupList__96_page=1#Pl_Official_RelationGroupList__96'
        json_list = []

        comment_url = cr_url % self.uid
        list_data = []
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url, headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace('\\/', '/')
                    break
                except Exception, e:
                #html = ''
                    print "Network Exception!!! ", e
                    continue
            #finally:
            datas = getMatchList(html, '<div class="mod_info">.*?</p>')
                # print len(datas)
                # r_datas = datas.reverse()
            list_data.append(datas)

                # 分页
            next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
                # print next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            else:
                break

示例#2

显示文件

    def commentInbox(self):
        cr_url = 'http://weibo.com/comment/inbox?&page=1&pids=Pl_Content_Commentlist'
        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    print html
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    time.sleep(5)
                    continue
            #finally:
            datas = getMatchList(
                html, '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
            # print len(datas)

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, 'page_frame" title="(*)"')
                mid = getMatch(data, '&cid=(*)&')
                timestamp = getMatch(data,
                                     '<div class="WB_from S_txt2">(*)  来自')
                if timestamp:
                    timestamp = long(getTimeStamp(timestamp))
                else:
                    timestamp = 0

                if timestamp <= self.mlasttime:
                    tags = True
                    break

                text = getMatch(data, '<div class="WB_text">(*)</div>')
                if text:
                    text = extractForHTML(text)
                else:
                    text = ''
                r_mid = getMatch(data, 'mid=(*)&')
                r_uid = self.uid
                #commet_type = 'make'
                commet_type = 'receive'

                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'

                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'root_mid': r_mid,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'comment_type': commet_type,
                    'update_time': self.update_time
                }

                wb_json = json.dumps(wb_item)
                json_list.append(wb_json)

            # 分页
            next_pageUrl = getUrlToPattern(html,
                                           comment_url,
                                           pattern='page',
                                           text_pattern='下一页')
            # print next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            elif not next_pageUrl or tags:
                break

示例#3

显示文件

文件： weibo_feedback_like.py 项目： feifanhanmc/xnr2

                        'nick_name': nickname,
                        'mid': mid,
                        'timestamp': timestamp,
                        'text': text,
                        'root_mid': r_mid,
                        'root_uid': r_uid,
                        'weibo_type': _type,
                        'update_time': self.update_time
                    }

                    wb_json = json.dumps(wb_item)
                    json_list.append(wb_json)

                # 分页
                next_pageUrl = getUrlToPattern(html,
                                               comment_url,
                                               pattern='page',
                                               text_pattern='下一页')
                # print next_pageUrl
                if next_pageUrl:
                    comment_url = next_pageUrl[0]
                elif not next_pageUrl or tags:
                    break
        return json_list

    def execute(self):
        """Run ``likeInbox`` and pass its result to ``executeES``.

        The result is forwarded together with the literal arguments
        ``'weibo_feedback_like'`` and ``'text'`` (presumably the target index
        and document type -- confirm against ``executeES``).
        """
        like_records = self.likeInbox()

        executeES('weibo_feedback_like', 'text', like_records)


if __name__ == '__main__':

示例#4

显示文件

class FeedbackPrivate:
    def __init__(self, uid, current_ts, fans, follow, groups, lastTime):
        self.uid = uid
        self.follow = follow
        self.fans = fans
        self.groups = groups
        self.update_time = current_ts
        self.lasttime = lastTime

        self._headers = {
            "Headers":
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2;"
            " .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0;"
            " .NET4.0C; .NET4.0E; InfoPath.3)",
            "Referer":
            "http://weibo.com/u/%s/home?topnav=1&wvr=6" % self.uid
        }

    def messages(self):
        cr_url = 'http://weibo.com/messages?pids=Pl_Content_MessageList&page=1'
        de_url = 'http://weibo.com/aj/message/getbyid?ajwvr=6&count=50&uid=%s&_t=0&__rnd=%d'
        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
                #print 'html:', html

            #finally:
            datas = getMatchList(
                html,
                '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->'
            )

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, '<img.*?alt="(*)"')
                r_uid = self.uid

                counts = getMatch(
                    data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
                if counts and counts.isdigit():
                    counts = long(counts)
                else:
                    counts = 0
                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'
                    while True:
                        try:
                            detailUrl = de_url % (uid, int(time.time() * 1000))
                            #print 'detail_url:', detailUrl
                            request = urllib2.Request(detailUrl)
                            response = urllib2.urlopen(request, timeout=60)
                            ms_content = json.loads(response.read())
                            break
                        except Exception, e:
                            print "Network Exception!!! ", e
                            continue
                #else:
                    html = ms_content["data"]["html"]
                    ms_datas = getMatchList(
                        html,
                        u'(<!-- 单行文字-->|<div class="space">).*?<!--／附件信息-->')
                    # print datas[0]
                    last_time = 0
                    for ms_data in ms_datas:
                        mid_uid = getMatch(ms_data, 'usercard="id=(*)"')
                        mid = getMatch(ms_data, 'mid="(*)"')
                        timestamp = getMatch(
                            ms_data, 'prompt_font S_txt2 S_bg1">(*)</legend>')
                        #soup = BeautifulSoup(ms_data)
                        #timestamp_bs4 = soup.find_all('legend', class_=["prompt_font", "S_txt2", "S_bg1"])
                        if timestamp:
                            timestamp = long(getTimeStamp(timestamp))
                            last_time = timestamp
                        else:
                            timestamp = last_time

                        if timestamp < self.lasttime:
                            tags = True
                            print 'timestamp<lasttime, timestamp, lasttime:', timestamp, self.lasttime
                            #break
                            next

                        text = getMatch(ms_data,
                                        u'<div class="cont">.*?<!--／附件信息-->')
                        if text:
                            text = extractForHTML(text)
                            text = commentExtract(text)

                        if mid_uid == uid:
                            private_type = 'receive'
                        elif mid_uid == r_uid:
                            private_type = 'make'
                        else:
                            private_type = ''

                        wb_item = {
                            'photo_url': photo_url,
                            'uid': uid,
                            'nick_name': nickname,
                            'mid': mid,
                            'timestamp': timestamp,
                            'text': text,
                            'root_uid': r_uid,
                            'weibo_type': _type,
                            'private_type': private_type,
                            'w_new_count': counts,
                            'update_time': self.update_time
                        }

                        wb_json = json.dumps(wb_item)
                        #print 'wb_json:::',wb_json
                        json_list.append(wb_json)

            # 分页
            next_pageUrl = getUrlToPattern(html,
                                           comment_url,
                                           pattern='page',
                                           text_pattern='下一页')
            # print next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            elif not next_pageUrl or tags:
                break

示例#5

显示文件

    def follow(self):
        cr_url = 'http://weibo.com/p/100505%s/myfollow?t=1&pids=Pl_Official_RelationMyfollow__93' \
                 '&cfs=&Pl_Official_RelationMyfollow__93_page=1#Pl_Official_RelationMyfollow__93'
        json_list = []

        comment_url = cr_url % self.uid
        list_data = []
        while True:
            print "comment_url**comment_url**comment_url**comment_url**", comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url, headers=self._headers)
                    print 1111111111111111
                    response = urllib2.urlopen(request, timeout=60)
                    print 2222222222222222
                    html = response.read().decode('string_escape').replace('\\/', '/')
                    print 3333333333333333
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
            #finally:
            datas = getMatchList(html, '<li class="member_li S_bg1".*?</li>')
                # print len(datas)
                # r_datas = datas.reverse()
            list_data.append(datas)

                # 分页
            next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
            print "next_pageUrl**next_pageUrl**next_pageUrl**next_pageUrl**",next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            else:
                break

            r_list_data = reversed(list_data)
            for l_datas in r_list_data:
                r_datas = reversed(l_datas)
                for data in r_datas:
                    #print 'data::',data
                    photo_url = getMatch(data, 'profile_image_url=(*)&')
                    uid = getMatch(data, 'usercard="id=(*)"')
                    nickname = getMatch(data, '<img.*?alt="(*)"')
                    timestamp = int(round(time.time()))
                    time.sleep(1)

                    sex = getMatch(data, '&sex=(*)"')
                    if not sex:
                        sex = ''
                    elif sex == 'f':
                        sex = 'female'
                    elif sex == 'm':
                        sex = 'male'

                    follow_source = getMatch(data, 'class="S_link2" >(*)</a>')
                    if not follow_source:
                        follow_source = ''

                    description = getMatch(data, 'W_autocut S_txt2">(*)</div>')
                    if not description:
                        description = ''

                    gid = getMatch(data, '&gid=(*)&')
                    if not gid:
                        gid = '0'
                    gname = getMatch(data, '&gname=(*)&')
                    if not gname:
                        gname = ''

                    r_uid = self.uid
                    _type = 'follow'

                    #获得关注人的详细信息
                    #user = SinaOperateAPI().getUserShow(uid=uid)

                    wb_item = {
                        'photo_url': photo_url,
                        'uid': uid,
                        'mid': uid,
                        'nick_name': nickname,
                        'timestamp': timestamp,
                        'sex': sex,
                        'description': description,
                        'follow_source': follow_source,
                        'gid': gid,
                        'gname': gname,
                        'root_uid': r_uid,
                        'weibo_type': _type,
                        'update_time': self.update_time
                    }
                    if wb_item['mid'] == None:
                        wb_item['mid'] = ''
                    print "follow, mid", wb_item['mid']
                    print "follow, root_uid", wb_item['root_uid']
                    wb_json = json.dumps(wb_item)
                    # print wb_json
                    json_list.append(wb_json)