Python extractForHTML примеры использования

Язык программирования: Python

Пространство имен/Пакет: tools.HtmlExtractor

Метод/Функция: extractForHTML

Примеров на hotexamples.com: 5

Python extractForHTML - 5 примеров найдено. Это лучшие примеры Python кода для tools.HtmlExtractor.extractForHTML, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

    def commentInbox(self):
        cr_url = 'http://weibo.com/comment/inbox?&page=1&pids=Pl_Content_Commentlist'
        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    print html
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    time.sleep(5)
                    continue
            #finally:
            datas = getMatchList(
                html, '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
            # print len(datas)

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, 'page_frame" title="(*)"')
                mid = getMatch(data, '&cid=(*)&')
                timestamp = getMatch(data,
                                     '<div class="WB_from S_txt2">(*)  来自')
                if timestamp:
                    timestamp = long(getTimeStamp(timestamp))
                else:
                    timestamp = 0

                if timestamp <= self.mlasttime:
                    tags = True
                    break

                text = getMatch(data, '<div class="WB_text">(*)</div>')
                if text:
                    text = extractForHTML(text)
                else:
                    text = ''
                r_mid = getMatch(data, 'mid=(*)&')
                r_uid = self.uid
                #commet_type = 'make'
                commet_type = 'receive'

                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'

                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'root_mid': r_mid,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'comment_type': commet_type,
                    'update_time': self.update_time
                }

                wb_json = json.dumps(wb_item)
                json_list.append(wb_json)

            # 分页
            next_pageUrl = getUrlToPattern(html,
                                           comment_url,
                                           pattern='page',
                                           text_pattern='下一页')
            # print next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            elif not next_pageUrl or tags:
                break

Пример #2

Показать файл

    def atMeMicroBlog(self):
        """Collect posts from the "at me" AJAX list and build one JSON record per post.

        Each request URL is built from the ``(pre_page, page, pagebar)`` triple.
        The JSON response's ``"data"`` field is treated as an HTML fragment,
        split into per-post blocks, and every block is turned into a JSON
        string appended to a local list. Looping ends when the fragment is
        empty (ignoring newlines and spaces) or after a post whose timestamp
        is not newer than ``self.lasttime`` has been met.

        Reads ``self._headers``, ``self.lasttime``, ``self.uid``,
        ``self.update_time`` and the iterables ``self.follow`` / ``self.fans``
        (each element is a JSON string holding at least a ``'uid'`` key).

        No use of the accumulated list is visible in this excerpt.
        """
        pre_page = 0
        page = 1
        pagebar = 0
        # max_page = 100
        # URL template; the three %s slots receive pre_page, page and pagebar.
        at_MBurl = 'http://weibo.com/aj/at/mblog/list?ajwvr=6&pre_page=%s&page=%s' \
                   '&filter_by_author=0&filter_by_type=0&is_adv=0&pagebar=%s'
        print at_MBurl
        json_list = []
        # Set once a post that is not newer than self.lasttime is reached.
        tags = False

        while True:
            wbUrl = at_MBurl % (pre_page, page, pagebar)
            print "current url: ", wbUrl
            # Fetch and decode the JSON response; on any exception the request
            # is retried immediately, with no delay and no retry limit.
            while True:
                try:
                    request = urllib2.Request(wbUrl, headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    mb_content = json.loads(response.read())
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
            # finally:
            html = mb_content["data"]
            print "html****html****html****html****html****", html
            # Pagination
            print "html_replace***html_replace***html_replace***", html.replace(
                '\n', '').replace(' ', '')
            print len(html.replace('\n', '').replace(' ', ''))
            print tags
            # Stop on an empty fragment or when the previous iteration set the
            # stop flag. Note the flag is only checked here, i.e. after one
            # more request has already been made.
            # Otherwise advance the paging triple for the NEXT request; the
            # fragment fetched above is still parsed below. Starting from
            # (0, 1, 0) the sequence is (1, 1, 0), (1, 1, 1), (1, 2, 0),
            # (2, 2, 0), (2, 2, 1), (2, 3, 0), ...
            if html.replace('\n', '').replace(' ', '') == '' or tags:
                break
                # if page > max_page:
                #     break
            elif pre_page < page:
                pre_page += 1
            elif pre_page == page and pagebar == 0:
                pagebar = 1
            elif pagebar == 1:
                pre_page = page
                page += 1
                pagebar = 0

            # One entry per post block in the fragment.
            datas = getMatchList(
                html,
                '<div class="WB_face W_fl">(*)<div node-type="feed_list_repeat'
            )

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)&')
                nickname = getMatch(data, 'nick-name="(*)"')
                mid = getMatch(data, 'pubuser_nick:(*)"')
                # The last three characters of the date attribute are dropped
                # (presumably milliseconds -> seconds; TODO confirm).
                timestamp = getMatch(
                    data, '<div class="WB_from S_txt2">.*?date="(*)"')[0:-3]
                if timestamp and timestamp.isdigit():
                    timestamp = long(timestamp)
                else:
                    timestamp = 0

                # Not newer than the last crawl: stop scanning this fragment
                # and raise the stop flag for the outer loop.
                if timestamp <= self.lasttime:
                    tags = True
                    break

                text = getMatch(data, 'feed_list_content" >(*)</div>').strip()
                if text:
                    text = extractForHTML(text.strip())
                else:
                    text = ''

                # Counters: the button label is stripped and anything that is
                # not purely digits is recorded as 0.
                retweet = getMatch(
                    data,
                    'forward_btn_text">.*?<em>(*)</em>').replace('转发', '')
                if retweet and retweet.isdigit():
                    retweet = long(retweet)
                else:
                    retweet = 0

                comment = getMatch(
                    data,
                    'comment_btn_text">.*?<em>(*)</em>').replace('评论', '')
                if comment and comment.isdigit():
                    comment = long(comment)
                else:
                    comment = 0

                like = getMatch(data, 'UI_ani_praised".*?<em>(*)</em>')
                if like and like.isdigit():
                    like = long(like)
                else:
                    like = 0
                r_mid = getMatch(data, 'rootmid=(*)&')
                r_uid = self.uid

                # Classify the author relative to the account owner:
                # 'friend' when found in both lists, 'follow' / 'followed'
                # when found in one, 'self' for the owner, else 'stranger'.
                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'

                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'retweet': retweet,
                    'comment': comment,
                    'like': like,
                    'root_mid': r_mid,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'update_time': self.update_time
                }

                # Records are stored as serialized JSON strings.
                wb_json = json.dumps(wb_item)
                json_list.append(wb_json)

Пример #3

Показать файл

    def messages(self):
        """Scrape the private-message conversation list and build JSON records.

        The conversation list page is fetched and split into per-conversation
        blocks. For a conversation whose uid equals ``self.uid`` the message
        detail endpoint is requested and every message in the returned HTML
        is converted into a JSON string appended to a local list.

        Reads ``self._headers``, ``self.lasttime``, ``self.uid``,
        ``self.update_time`` and the iterables ``self.follow`` / ``self.fans``
        (each element is a JSON string holding at least a ``'uid'`` key).

        NOTE(review): in this excerpt the outer ``while True`` has no visible
        exit and ``comment_url`` is never advanced; the method is presumably
        longer than what is shown here.
        """
        cr_url = 'http://weibo.com/messages?pids=Pl_Content_MessageList&page=1'
        # Detail endpoint template: %s is the peer uid, %d a millisecond timestamp.
        de_url = 'http://weibo.com/aj/message/getbyid?ajwvr=6&count=50&uid=%s&_t=0&__rnd=%d'
        json_list = []
        # Set once a message older than self.lasttime is seen.
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            # Fetch the list page; on any exception the request is retried
            # immediately, with no delay and no retry limit.
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
                #print 'html:', html

            #finally:
            # One entry per conversation block on the page.
            datas = getMatchList(
                html,
                '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->'
            )

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, '<img.*?alt="(*)"')
                r_uid = self.uid

                # Unread counter; anything that is not purely digits becomes 0.
                counts = getMatch(
                    data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
                if counts and counts.isdigit():
                    counts = long(counts)
                else:
                    counts = 0
                # Classify the peer relative to the account owner.
                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                # NOTE(review): everything down to the end of the message loop
                # is nested under this condition, so details are only fetched
                # and recorded when uid == self.uid. Together with the
                # commented-out "#else:" below this looks like a debugging
                # leftover - confirm the intended indentation.
                if uid == r_uid:
                    _type = 'self'
                    # Fetch the conversation detail. Unlike the list request,
                    # this request is built without self._headers.
                    while True:
                        try:
                            detailUrl = de_url % (uid, int(time.time() * 1000))
                            #print 'detail_url:', detailUrl
                            request = urllib2.Request(detailUrl)
                            response = urllib2.urlopen(request, timeout=60)
                            ms_content = json.loads(response.read())
                            break
                        except Exception, e:
                            print "Network Exception!!! ", e
                            continue
                #else:
                    # Rebinds `html`, replacing the list-page markup fetched above.
                    html = ms_content["data"]["html"]
                    ms_datas = getMatchList(
                        html,
                        u'(<!-- 单行文字-->|<div class="space">).*?<!--／附件信息-->')
                    # print datas[0]
                    # Messages without their own time label inherit the most
                    # recently parsed timestamp (0 before any label is seen).
                    last_time = 0
                    for ms_data in ms_datas:
                        mid_uid = getMatch(ms_data, 'usercard="id=(*)"')
                        mid = getMatch(ms_data, 'mid="(*)"')
                        timestamp = getMatch(
                            ms_data, 'prompt_font S_txt2 S_bg1">(*)</legend>')
                        #soup = BeautifulSoup(ms_data)
                        #timestamp_bs4 = soup.find_all('legend', class_=["prompt_font", "S_txt2", "S_bg1"])
                        if timestamp:
                            timestamp = long(getTimeStamp(timestamp))
                            last_time = timestamp
                        else:
                            timestamp = last_time

                        if timestamp < self.lasttime:
                            tags = True
                            print 'timestamp<lasttime, timestamp, lasttime:', timestamp, self.lasttime
                            #break
                            # NOTE(review): a bare `next` only evaluates the
                            # builtin name and does nothing, so older messages
                            # are still recorded below. `continue` was
                            # presumably intended - confirm before changing.
                            next

                        text = getMatch(ms_data,
                                        u'<div class="cont">.*?<!--／附件信息-->')
                        if text:
                            text = extractForHTML(text)
                            text = commentExtract(text)

                        # Direction of the message: sent by the peer
                        # ('receive'), by the account owner ('make'), or unknown.
                        if mid_uid == uid:
                            private_type = 'receive'
                        elif mid_uid == r_uid:
                            private_type = 'make'
                        else:
                            private_type = ''

                        wb_item = {
                            'photo_url': photo_url,
                            'uid': uid,
                            'nick_name': nickname,
                            'mid': mid,
                            'timestamp': timestamp,
                            'text': text,
                            'root_uid': r_uid,
                            'weibo_type': _type,
                            'private_type': private_type,
                            'w_new_count': counts,
                            'update_time': self.update_time
                        }

                        # Records are stored as serialized JSON strings.
                        wb_json = json.dumps(wb_item)
                        #print 'wb_json:::',wb_json
                        json_list.append(wb_json)

Пример #4

Показать файл

Файл: weibo_feedback_like.py Проект: feifanhanmc/xnr2

                    mid = getMatch(data, '&cid=(*)&')
                    timestamp = getMatch(
                        data, '<div class="WB_from S_txt2">(*)  来自')
                    if timestamp:
                        timestamp = long(getTimeStamp(timestamp))
                    else:
                        timestamp = 0

                    if timestamp <= self.lasttime:
                        tags = True
                        break

                    text = getMatch(data,
                                    '<div class="WB_text S_txt2">(*)</div>')
                    if text:
                        text = extractForHTML(text)
                    else:
                        text = ''
                    r_mid = getMatch(data, 'mid=(*)&')
                    r_uid = self.uid

                    _type = 'stranger'
                    type1 = ''
                    type2 = ''
                    for fljson in self.follow:
                        fjson = json.loads(fljson)
                        if fjson['uid'] == uid:
                            type1 = 'follow'
                            break
                    for fljson in self.fans:
                        fjson = json.loads(fljson)

Пример #5

Показать файл

                    uid = getMatch(data, 'usercard="id=(*)&')
                    nickname = getMatch(data, 'nick-name="(*)"')
                    mid = getMatch(data, 'pubuser_nick:(*)"')
                    timestamp = getMatch(data, '<div class="WB_from S_txt2">.*?date="(*)"')[0:-3]
                    if timestamp and timestamp.isdigit():
                        timestamp = long(timestamp)
                    else:
                        timestamp = 0
                    
                    if timestamp <= self.lasttime:
                        tags = True
                        break

                    text = getMatch(data, 'feed_list_content" >(*)</div>').strip()
                    if text:
                        text = extractForHTML(text.strip())
                    else:
                        text = ''

                    retweet = getMatch(data, 'forward_btn_text">.*?<em>(*)</em>').replace('转发', '')
                    if retweet and retweet.isdigit():
                        retweet = long(retweet)
                    else:
                        retweet = 0

                    comment = getMatch(data, 'comment_btn_text">.*?<em>(*)</em>').replace('评论', '')
                    if comment and comment.isdigit():
                        comment = long(comment)
                    else:
                        comment = 0