def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    if index:
        url = 'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=919fab3c0002c9f1&wd=%E5%81%B7%E7%8B%97&pn=730&oq=%E5%81%B7%E7%8B%97&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=919fab3c0002c9f1&rsv_t=7e30ggF%2BMa91oOURk1bMtN8af5unSwOR08TodNBB%2F%2B6B6RBEwUi8l8IAe28ACA%2B8b5I5&gpc=stf%3D1517038564%2C1548574564%7Cstftype%3D1&tfflag=1&bs=%E5%81%B7%E7%8B%97&rsv_sid=undefined&_ss=1&clist=28bc21fb856a58b7%09350102124f079888%0928bc21fb856a58b7%0928bc2159845c1cf3%0928bc2015823fa56b%0928a121fb84a7d1a6&hsug=&f4s=1&csor=2&_cr1=34767&pn=' + \
            str(index * 20)
    else:
        url = 'http://news.baidu.com/ns?rn=20&ie=utf-8&cl=2&ct=1&bs=%E6%AF%92%E7%8B%97%E8%82%89&rsv_bp=1&sr=0&f=8&prevct=no&tn=news&word=%E5%81%B7%E7%8B%97'
    news_lists = get_request_proxy(url, 0)
    if not news_lists:
        if can_retry(url):
            self.summarization_once(index)
        return
    summarization_lists = news_lists.find_all('div', class_='result')
    if not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace(
            '\xa0', '').replace('\t', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text[:-8])
    self.summarizations[int(index)] = texts
def get_song_detail(self, id):
    """ get song detail from playlist """
    host = 'http://music.163.com/api/playlist/detail?id=' + str(id)
    json = get_request_proxy(host, 1)
    if json == 0:
        if can_retry(host):
            self.get_song_detail(id)
        return []
    result = json['result']
    tracks = result['tracks']
    if len(tracks) <= 1:
        if can_retry(host):
            self.get_song_detail(id)
        return []
    else:
        playcount = result['playCount']
        for track in tracks:
            songid = track['id']
            songname = track['name']
            self.songlist.append([songid, songname, playcount])
        self.finishlist.append(id)
def detail_once(self, index, url):
    """ get html from news """
    # print(index)
    news_lists = get_request_proxy(url, 0)
    if not news_lists:
        if can_retry(url):
            self.detail_once(index, url)
        return
    test = news_lists.find_all('div', class_=[
        'article-content', 'mth-editor-content', 'con-news-art',
        'Custom_UnionStyle'
    ])
    if not len(test):
        test = self.cleantxt(news_lists.text)
        if not len(test):
            if can_retry(url):
                self.detail_once(index, url)
            return
        self.word_list[index] = test
        return
    word_list = ''.join([index.text for index in test]).replace(
        '\u3000', '').replace('\n', '')
    self.word_list[int(index)] = word_list
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    hrefs = []
    if index:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=PcVKXJKRIc7s8AXB05e4Dw&sa=N&ved=0ahUKEwjSo5nBvojgAhVONrwKHcHpBfcQ8tMDCFE&biw=1627&bih=427&dpr=2&start=' + \
            str(index * 10)
    else:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=O8VKXJ7nFoP_8QX1oK_gDA&start=0&sa=N&ved=0ahUKEwje8JTAvojgAhWDf7wKHXXQC8w4ChDy0wMISQ&biw=1627&bih=427&dpr=2'
    news_lists = basic_req(url, 0)
    href_lists = news_lists.find_all('a', class_=['RTNUJf', 'l lLrAF'])
    summarization_lists = news_lists.find_all('div', class_='gG0TJc')
    if not len(href_lists) or not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for href in href_lists:
        hrefs.append(href['href'])
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace(
            '\xa0', '').replace('\t', '').replace('...', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text)
    self.summarizations[int(index)] = texts
    self.hrefs[int(index)] = hrefs
def load_spot_once(self, pn=1, city_id=10186):
    ''' load spot once '''
    data = {
        'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
        'iMddid': city_id,
        'iTagId': 0,
        'iPage': pn,
    }
    data = self.load_sn(data)
    print(data)
    req = get_request_proxy(self.AJAX_ROUTER_URL, 11, data=data)
    if req is None or not 'data' in req or not 'list' in req['data']:
        if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
            self.load_spot_once(pn, city_id)
        return
    spot_list = req['data']['list']
    spot_pn = req['data']['page']
    spot_tmp = re.findall('<h3>.*?(.*?)</h3>', spot_list)
    try:
        total_pn = int(re.findall('共<span>(.*?)</span>', spot_pn)[0])
    except Exception as e:
        total_pn = 1
        echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)
    if city_id not in self.spot_result:
        self.spot_result[city_id] = spot_tmp
    else:
        self.spot_result[city_id] += spot_tmp
    self.spot_pn[city_id] = total_pn
def get_goods_id_first(self, origin_url, index):
    """ get goods id first """
    origin_url = origin_url.replace('https', 'http')
    # first_result = get_request_proxy(origin_url, 0)
    first_result = basic_req(origin_url, 0, header=self.headers)
    if not first_result or len(first_result.find_all('script')) < 2:
        if can_retry(origin_url):
            self.get_goods_id_first(origin_url, index)
        return
    wait = first_result.find_all('script')[1].text
    if not '"title":"' in wait:
        return
    title = re.findall('"title":".*","',
                       wait)[0].split('","')[0].split('":"')[1]
    if title in self.title2map:
        self.goods_map[index] = self.title2map[title]
        self.url2goods[origin_url] = self.title2map[title]
        print(self.title2map[title])
    else:
        print(title)
def _getroom_id(self, next_to=True, proxy=True):
    ''' get av room id '''
    url = self.ROOM_INIT_URL % self._av_id
    html = get_request_proxy(url, 0) if proxy else basic_req(url, 0)
    head = html.find_all('head')
    if not len(head) or len(head[0].find_all('script')) < 4 or \
            not '{' in head[0].find_all('script')[3].text:
        if can_retry(url):
            self._getroom_id(proxy=proxy)
        else:
            self._getroom_id(proxy=False)
        next_to = False
    if next_to:
        script_list = head[0].find_all('script')[3].text
        script_begin = script_list.index('{')
        script_end = script_list.index(';')
        script_data = script_list[script_begin:script_end]
        json_data = json.loads(script_data)
        if self._p == -1 or len(json_data['videoData']['pages']) < self._p:
            self._room_id = json_data['videoData']['cid']
        else:
            self._room_id = json_data['videoData']['pages'][self._p - 1]['cid']
        print('Room_id:', self._room_id)
def check_comment_once(self, av_id: int, pn: int):
    ''' check comment once '''
    url = self.REPLY_V2_URL % (pn, av_id)
    json_req = get_request_proxy(url, 1)
    if json_req is None or not 'data' in json_req or not 'hots' in json_req['data']:
        if can_retry(url):
            self.check_comment_once(av_id, pn)
        return
    hots = json_req['data']['hots']
    replies = json_req['data']['replies']
    temp_floor = [] if replies is None else [ii['floor'] for ii in replies]
    if replies is None:
        wait_check = [] if hots is None else hots
    else:
        wait_check = replies if hots is None else [*hots, *replies]
    for ii in wait_check:
        info = {'basic': self.get_comment_detail(ii, av_id, pn)}
        floor = info['basic'][0]
        crep = ii['replies']
        if not crep is None:
            info['replies'] = [
                self.get_comment_detail(ii, av_id, pn, floor) for ii in crep
            ]
        self.comment[av_id][floor] = info
    if len(temp_floor):
        for ii in range(min(temp_floor), max(temp_floor) + 1):
            if not ii in self.comment[av_id]:
                self.comment[av_id][ii] = {}
        self.comment_max[av_id] = min(temp_floor)
def load_url(self):
    """ load url from zimuzu """
    url = 'http://zmz005.com/o5itP3'
    detail = get_request_proxy(url, 0)
    total = []
    if not detail:
        print('retry')
        if can_retry(url):
            self.load_url()
        return
    season_list = detail.find_all('div', class_='tab-content info-content')[1:]
    for season in season_list:
        quality_list = season.find_all('div', class_='tab-pane')
        url_body = quality_list[1] if 'APP' in quality_list[0]['id'] else quality_list[0]
        season_id = re.findall(r"\d+\.?\d*", url_body['id'])[0]
        total.append(season_id)
        if int(season_id) < 12:
            url_body = quality_list[1]
        url_list = url_body.find_all('ul', class_='down-links')
        url = [
            index.find_all('div', class_='copy-link')[1]['data-url']
            for index in url_list
        ]
        total.append('\n'.join(url) + '\n')
    with codecs.open('zimuzu/data/southPark', 'w', encoding='utf-8') as f:
        f.write('\n'.join(total))
def get_request_v2(self, url, types, header):
    result = get_request_proxy(url, 0, header=header)
    if not result or not len(result.find_all('div', class_='content')):
        if can_retry(url):
            # return the retried result instead of silently dropping it
            return self.get_request_v2(url, types, header)
        return
    return result
def get_request(self, url, types):
    result = basic_req(url, 1)
    if not result:
        if can_retry(url):
            # return the retried result instead of silently dropping it
            return self.get_request(url, types)
        return
    return result
def get_request_v3(self, url, types):
    result = basic_req(url, 0)
    if result is None or not result or not len(
            result.find_all('p', class_='content')):
        if can_retry(url):
            # return the retried result instead of silently dropping it
            return self.get_request_v3(url, types)
        return
    return result
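# A hedged sketch, not part of the original code: get_request, get_request_v2 and
# get_request_v3 above all repeat the same "fetch, validate, maybe retry" shape around
# can_retry. The helper below shows one way that pattern could be factored with an
# explicit retry bound. The name fetch_with_retry and its parameters (fetch, validate,
# max_retry) are illustrative assumptions, not existing project helpers.
def fetch_with_retry(fetch, validate, max_retry=3):
    """ call fetch() until validate(result) is truthy or the retry budget runs out """
    for _ in range(max_retry):
        result = fetch()
        if validate(result):
            return result  # first response that passes validation
    return None  # exhausted retries; the caller must handle None

# Illustrative usage mirroring get_request_v3 (assumes basic_req is importable here):
# result = fetch_with_retry(
#     lambda: basic_req(url, 0),
#     lambda r: r is not None and len(r.find_all('p', class_='content')))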
def get_check(self):
    ''' check comment '''
    now_hour = int(time_str(format='%H'))
    now_min = int(time_str(format='%M'))
    now_time = now_hour + now_min / 60
    if now_time > 0.5 and now_time < 8.5:
        return
    if os.path.exists('{}comment.pkl'.format(comment_dir)):
        with codecs.open('{}comment.pkl'.format(comment_dir), 'rb') as f:
            self.comment = pickle.load(f)
    if self.assign_up_mid == -1:
        return
    url = self.MEMBER_SUBMIT_URL % self.assign_up_mid
    json_req = get_request_proxy(url, 1)
    if json_req is None or not 'data' in json_req or not 'vlist' in json_req['data']:
        if can_retry(url):
            self.get_check()
        return
    av_id_list = [[ii['aid'], ii['comment']]
                  for ii in json_req['data']['vlist']]
    if self.basic_av_id not in [ii[0] for ii in av_id_list]:
        if can_retry(url):
            self.get_check()
        return
    threading_list = []
    for (ii, jj) in av_id_list:
        if ii not in self.comment:
            self.comment[ii] = {}
        work = threading.Thread(target=self.comment_check_schedule, args=(ii, jj,))
        threading_list.append(work)
    for work in threading_list:
        work.start()
    for work in threading_list:
        work.join()
    with codecs.open('{}comment.pkl'.format(comment_dir), 'wb') as f:
        pickle.dump(self.comment, f)
    return av_id_list
def href_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=毒狗肉&pn=' + \
        str(index * 10)
    news_lists = get_request_proxy(url, 0)
    if not news_lists:
        if can_retry(url):
            self.href_once(index)
        return
    test = news_lists.find_all('div', class_='result')
    if not len(test):
        if can_retry(url):
            self.href_once(index)
        return
    href_list = [index.a['href'] for index in test]
    self.href_map[int(index)] = href_list
def request_text(self, url):
    ''' requests text '''
    req = basic_req(url, 2)
    if req is None:
        echo(0, url)
        if can_retry(url):
            # return the retried result instead of falling through with None
            return self.request_text(url)
        return ''
    echo(1, url)
    return req.text
def check_type_req(self, av_id: int):
    changeHeaders({'Referer': self.BASIC_AV_URL % av_id})
    url = self.VIEW_URL % av_id
    json_req = get_request_proxy(url, 1)
    if json_req is None or 'data' not in json_req or 'tid' not in json_req['data']:
        if can_retry(url):
            self.check_type_req(av_id)
        return
    self.rank_type[av_id] = json_req['data']['tid'] == self.assign_tid
def get_goods_second(self, url, index):
    second_result = basic_req(url, 0, header=self.headers)
    # second_result = get_request_proxy(url, 0)
    if not second_result or not len(second_result.find_all('input')):
        if can_retry(url):
            self.get_goods_second(url, index)
        return
    goods_id = second_result.find_all('input')[6]['value']
    print(goods_id)
    self.goods_map[index] = goods_id
def get_playlist_id(self, classify, offset):
    """ get playlist id from classify """
    host = 'https://music.163.com'
    allclassify = classify == '全部风格'
    url = host + self.classifylist[classify] + (
        '?' if allclassify else '&') + 'order=hot&limit=35&offset=' + str(offset)
    html = basic_req(url, 0)
    if not html:
        if can_retry(url):
            self.get_playlist_id(classify, offset)
        return []
    alist = html.find_all('a', class_='icon-play')
    if not len(alist):
        if can_retry(url):
            self.get_playlist_id(classify, offset)
    for index in alist:
        self.playlists.append(index['data-res-id'])
def load_img(self, index, img_id, img_url):
    """ load img """
    img = get_request_proxy(img_url, 2)
    if img == True or img == False:
        if can_retry(img_url):
            self.load_img(index, img_id, img_url)
        return
    with codecs.open(
            'buildmd/' + self.find_title(index).split('/')[0] + '/img/' +
            self.find_title(index).split('/')[1][:-3] + str(img_id + 1) + '.jpg',
            'wb') as f:
        f.write(img.content)
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    url = 'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=919fab3c0002c9f1&wd=%E5%81%B7%E7%8B%97&oq=%E5%81%B7%E7%8B%97&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=919fab3c0002c9f1&rsv_t=7e30ggF%2BMa91oOURk1bMtN8af5unSwOR08TodNBB%2F%2B6B6RBEwUi8l8IAe28ACA%2B8b5I5&gpc=stf%3D1517038564%2C1548574564%7Cstftype%3D1&tfflag=1&bs=%E5%81%B7%E7%8B%97&rsv_sid=undefined&_ss=1&clist=28bc21fb856a58b7%09350102124f079888%0928bc21fb856a58b7%0928bc2159845c1cf3%0928bc2015823fa56b%0928a121fb84a7d1a6&hsug=&f4s=1&csor=2&_cr1=34767&pn=' + \
        str(index * 10)
    news_lists = get_request_proxy(url, 0)
    if not news_lists:
        if can_retry(url):
            self.summarization_once(index)
        return
    test = news_lists.find_all(
        'div', class_=['c-row c-gap-top-small', 'c-span18 c-span-last'])
    word = self.cleantxt(news_lists.text)
    if not len(word):
        if can_retry(url):
            self.summarization_once(index)
        return
    temp_map = self.find_location.test_province(
        self.find_location.city_province, word)
    self.total_map[int(index)] = temp_map
    self.word[index] = word
def get_lists(self):
    """ get title lists """
    url = self.joint_url('3bb0c25eca85e764b6d55a281faf7195')
    title_json = get_request_proxy(url, 1)
    if not title_json:
        if can_retry(url):
            self.get_lists()
        return
    content = BeautifulSoup(title_json['content'],
                            'html.parser').find_all('a')
    self.request_list = [
        re.split(r'/|=', index.text)[-1] for index in content
    ]
def get_classify(self):
    """ get classify from /discover/playlist """
    version = begin_time()
    self.classifylist = {}
    host = 'https://music.163.com/discover/playlist'
    html = get_request_proxy(host, 0)
    if not html:
        print('Empty')
        if can_retry(host):
            self.get_classify()
        return []
    alist = html.find_all('a', class_='s-fc1')
    if not len(alist):
        if can_retry(host):
            self.get_classify()
        print(html)
    for index in alist:
        self.classifylist[index.text] = index['href']
    end_time(version)
def build_md_once(self, index, tid):
    """ build md in one """
    url = self.joint_url(tid)
    title_json = get_request_proxy(url, 1)
    if not title_json:
        if can_retry(url, index):
            self.build_md_once(index, tid)
        return
    content = BeautifulSoup(title_json['content'],
                            'html.parser').find_all('div')
    text = []
    img_href = []
    img_id = 1
    ttid = 1
    img_title = self.find_title(index).split('/')[1][:-3]
    for word in content:
        temp_text = ''
        # note: isdigit must be called, otherwise the bound method is always truthy
        if word.span and len(word.span.text) and not word.span.text[0].isdigit():
            temp_text = '## ' + word.span.text
            ttid = 1
        if word.img:
            temp_text = '![image](img/' + img_title + str(img_id) + '.jpg)'
            img_href.append(word.img['src'].replace('https', 'http'))
            img_id += 1
        if not len(temp_text):
            temp_text = word.text
        if len(temp_text) and temp_text[0].isdigit():
            temp_text = str(ttid) + '. **' + \
                ' '.join(temp_text.split('\xa0')[1:]).strip() + '**'
            ttid += 1
        if len(temp_text) and temp_text[0:2] == '//':
            temp_text = str(ttid) + '. **' + \
                ' '.join(temp_text.split('\xa0')[2:]).strip() + '**'
            ttid += 1
        if len(temp_text) and (temp_text[0] == '¥' or temp_text[0] == '€'):
            temp_text = '<a>' + temp_text + '</a>'
        text.append(temp_text)
    with codecs.open(data_dir + self.find_title(index), 'w', encoding='utf-8') as f:
        f.write('\n'.join(text))
    self.img_map[index] = img_href
    print(index, len(img_href))
def search_goods_once(self, goods_name, index):
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/items/search.json?auctionTag=&perPageSize=50&shopTag=&_tb_token_=',
        cookie[1][:-1], '&pvid=', cookie[2][:-1], '&t=',
        str(int(round(time.time() * 1000))), '&_t=',
        str(int(round(time.time() * 1000))), '&q=', goods_name
    ]
    headers = {
        'pragma': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Cookie': '',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
    }
    headers['Cookie'] = cookie[0][:-1]
    ca = basic_req(''.join(url_list), 2, header=headers)
    if ca.status_code != 200 or not 'data' in ca.json():
        if can_retry(''.join(url_list)):
            self.search_goods_once(goods_name, index)
        return
    page_list = ca.json()['data']['pageList']
    title = [
        '||'.join([str(index['auctionId']), goods_name, str(index['zkPrice'])])
        for index in page_list
    ][0]
    self.goods_name[index] = title
    print(title)
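# Note on the cookie file read above (an inference from the indexing, not documented in
# the source): search_goods_once expects '%scookie_alimama' % data_dir to hold at least
# three lines, used as cookie[0] -> the raw Cookie header value, cookie[1] -> the
# _tb_token_ value, and cookie[2] -> the pvid value, each with a trailing newline that
# is stripped via the [:-1] slices.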
def load_collect_once(self, index):
    """ load taobao collect """
    baseurl = 'https://shoucang.taobao.com/item_collect_n.htm?t='
    url = baseurl + str(int(round(time.time() * 1000)))
    if index:
        # the paging parameters need a leading '&' to stay separate from the 't' parameter
        url += '&ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=' + \
            str(30 * index)
    collect_html = basic_req(url, 0)
    if collect_html != True and collect_html != False:
        collect_list = collect_html.find_all(
            'li',
            class_=[
                "J_FavListItem g-i-item fav-item ",
                "J_FavListItem g-i-item fav-item isinvalid",
                "J_FavListItem g-i-item fav-item istmall ",
                "J_FavListItem g-i-item fav-item istmall isinvalid"
            ])
        print(len(collect_list))
    if collect_html == True or collect_html == False or not len(collect_list):
        if can_retry(baseurl + str(index), index):
            self.load_collect_once(index)
        return
    text = []
    for collect in collect_list:
        data_id = collect['data-id']
        # data_ownerid = collect['data-ownerid']
        title = collect.find_all('a', class_='img-item-title-link')[0].text
        price = collect.find_all('div', class_='g_price')[0].strong.text if len(
            collect.find_all('div', class_='g_price')) else '0'
        text.append("||".join([data_id, title, price]))
    self.collect[index] = text
def load_goods_once(self, index, tid):
    """ build md in one """
    url = self.joint_url(tid)
    title_json = get_request_proxy(url, 1)
    if not title_json:
        if can_retry(url, index):
            self.load_goods_once(index, tid)
        return
    content = BeautifulSoup(title_json['content'], 'html.parser')
    # return content
    content = content.find_all('div')
    if not len(content):
        if can_retry(url, index):
            self.load_goods_once(index, tid)
        return
    # print(len(content))
    text = []
    ttid = 0
    text.append(self.find_title(index))
    good_text = []
    describe = []
    title = ''
    url = ''
    tpud = ''
    for word in content:
        temp_text = ''
        temp_text = word.text
        if not len(temp_text):
            continue
        if len(temp_text) and temp_text not in self.special_list and not '€' in temp_text and (
                (temp_text[0].isdigit() and
                 (not '【' in temp_text or '【已下架】' in temp_text)) or
                (temp_text[0] == '\xa0' and not 'http' in temp_text and
                 not '¥' in temp_text and not '微信' in temp_text and
                 not '(' in temp_text) or
                (word.span and len(word.span.text.replace('\xa0', '')) and
                 (word.span['style'] == 'font-size:16px;color:#fc9db1;font-weight:bold;' or
                  word.span['style'] == 'font-size:16px;color:#1e6792;background-color:#ffffff;font-weight:bold;'))):
            temp_text = temp_text.replace('\xa0', ' ').replace('|', '')
            temp_text = temp_text.replace('//', '').replace('¥', '').strip()
            if not re.search(r'\d\.\d', temp_text):
                temp_text = temp_text.replace('.', ' ')
            elif temp_text.count('.') > 1:
                temp_text = temp_text.replace('.', ' ', 1)
            temp_list = temp_text.split()
            print(temp_list)
            if not len(temp_list):
                continue
            if ttid:
                text.append(' '.join([*good_text, *[url, tpud]]))
                url = ''
                tpud = ''
            ttid += 1
            describe = []
            good_text = []
            if len(title):
                text.append(title)
                title = ''
            if temp_list[0].isdigit():
                good_text.append(str(int(temp_list[0])))
            else:
                good_text.append(str(ttid))
                good_text.append(temp_list[0])
            if len(temp_list) == 1:
                continue
            if len(good_text) == 1:
                good_text.append(temp_list[1])
            elif temp_list[1].isdigit():
                good_text.append(str(int(temp_list[1])))
                if len(temp_list) > 2:
                    describe = temp_list[2:]
            if len(temp_list) > 2 and temp_list[2].isdigit():
                good_text.append(str(int(temp_list[2])))
            elif len(temp_list) > 3 and temp_list[3].isdigit():
                good_text.append(str(int(temp_list[3])))
                describe = temp_list[2]
                if len(temp_list) > 4:
                    describe = [*describe, *temp_list[4:]]
            elif len(temp_list) > 3 and len(temp_list[2]) > 3 and temp_list[2][2:].isdigit():
                if len(temp_list[3]) > 3 and temp_list[3][2:].isdigit():
                    good_text.append(temp_list[2] + '/' + temp_list[3])
                else:
                    good_text.append(str(int(temp_list[2][2:])))
                continue
            elif len(temp_list) > 2 and re.search(r'\d', temp_list[2]):
                digit_list = re.findall(r"\d+\.?\d*", temp_list[2])
                good_text.append(digit_list[0])
                if len(temp_list) > 3:
                    describe = [*describe, *temp_list[3:]]
            elif len(temp_list) > 2:
                describe.append(temp_list[2])
                if len(temp_list) > 3:
                    describe = temp_list[3:]
        elif 'http' in temp_text:
            temp_text = temp_text.replace('\xa0', '').strip()
            print('http', temp_text)
            url = temp_text
        elif temp_text.count('€') == 2 or temp_text.count('¥') == 2:
            temp_text = temp_text.replace('\xa0', '').strip()
            print('¥', temp_text)
            tpud = temp_text
        elif '【店铺链接】' in temp_text:
            temp_text = temp_text.replace('\xa0', '').strip()
            print('【店铺链接】', temp_text)
            url += temp_text
        elif temp_text in self.title_list:
            print(2, temp_text)
            temp_text = temp_text.replace('\xa0', '')
            title = temp_text
        elif len(good_text) == 1:
            temp_text = temp_text.replace('\xa0', ' ').replace('.', ' ').replace(
                '¥', '').replace('|', '')
            temp_list = temp_text.split()
            print(3, temp_list)
            if not len(temp_list):
                continue
            elif len(temp_list) > 1 and temp_list[1].isdigit():
                good_text.append(temp_list[0])
                good_text.append(str(int(temp_list[1])))
                describe = temp_list[2:]
            else:
                describe.append(temp_text)
        elif temp_text.count('¥') == 1:
            temp_text = temp_text.replace('¥', '').replace('\xa0', '').replace(
                '|', '').strip()
            digit_list = re.findall(r"\d+\.?\d*", temp_text)
            print('$', digit_list)
            if len(digit_list):
                good_text.append(digit_list[0])
        else:
            temp_text = temp_text.replace('\xa0', '')
            print(4, temp_text)
            describe.append(temp_text)
    if len(good_text):
        text.append(' '.join([*good_text, *[url, tpud]]))
    text.append(' ')
    self.goods[index] = text
    print(len(text))
def load_rank_index(self, index: int, day_index: int):
    ''' load rank '''
    changeHeaders({'Referer': self.AV_URL})
    url = self.RANKING_URL % (index, day_index)
    text = basic_req(url, 3)
    rank_str = re.findall('window.__INITIAL_STATE__=(.*?);', text)
    if not len(rank_str):
        if can_retry(url):
            self.load_rank_index(index, day_index)
        return False
    rank_map = json.loads(rank_str[0])
    rank_list = rank_map['rankList']
    now_av_id = []
    wait_check_public = []
    rank_map = {}
    for ii, rank in enumerate(rank_list):
        av_id = int(rank['aid'])
        need_params = [
            'pts', 'author', 'mid', 'play', 'video_review', 'coins',
            'duration', 'title'
        ]
        temp_rank_list = [
            ii, *[rank[ii] for ii in need_params], index, day_index
        ]
        now_av_id.append(av_id)
        if not self.check_type(av_id):
            continue
        self.check_rank_rose(av_id, temp_rank_list)
        if self.add_av(av_id, ii, temp_rank_list[1]):
            rank_map[av_id] = temp_rank_list

    ''' check assign av rank '''
    for ii in self.assign_ids:
        if not ii in self.public:
            wait_check_public.append(ii)
        if not ii in self.last_view and not ii in self.rank_map:
            self.rank_map[ii] = []
    have_assign = len([0 for ii in self.assign_ids if ii in now_av_id]) > 0

    ''' check tid type '''
    threading_public = []
    for ii in rank_map.keys():
        work = threading.Thread(target=self.check_type_req, args=(ii,))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    for ii, jj in rank_map.items():
        if self.check_type(ii) != True:
            continue
        if not ii in self.public:
            wait_check_public.append(ii)
        self.last_check[ii] = int(time.time())
        self.rank_map[ii] = jj

    ''' load public basic data '''
    threading_public = []
    for ii in wait_check_public:
        work = threading.Thread(target=self.public_data, args=(ii, 0,))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()

    ''' begin monitor '''
    threading_list = []
    for ii, jj in self.public.items():
        if not ii in self.public_list and jj[0] + one_day > int(time.time()):
            work = threading.Thread(target=self.public_monitor, args=(ii, 0,))
            threading_list.append(work)
    for work in threading_list:
        work.start()
    return have_assign
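# A hedged sketch, not part of the original code: load_rank_index, get_check and the
# comment checker all build a list of threading.Thread objects, start them, then join
# them. The helper below captures that fan-out/join step; run_threads and its parameter
# names are illustrative assumptions rather than existing project code.
import threading

def run_threads(target, args_list):
    """ start one thread per argument tuple and wait for all of them to finish """
    workers = [threading.Thread(target=target, args=args) for args in args_list]
    for work in workers:
        work.start()
    for work in workers:
        work.join()

# Illustrative usage mirroring the tid check in load_rank_index:
# run_threads(self.check_type_req, [(av_id,) for av_id in rank_map.keys()])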