def prepare_js(self):
    ''' prepare js '''
    pre_text = basic_req(self.JD_URL, 3)
    INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t', pre_text)[0].split('"')[1]
    origin_js = basic_req(INDEX_JS_URL, 3)

    ''' decode js '''
    decode_js = codecs.unicode_escape_decode(origin_js)[0]

    ''' replace the obfuscation params array with its literal values '''
    replace_list_str = decode_js.split(';')[2]
    empty_index = replace_list_str.index(' ') + 1
    begin_index = replace_list_str.index('=[') + 2
    end_index = replace_list_str.index(']')
    replace_list = replace_list_str[begin_index:end_index].split(',')
    rp = replace_list_str[empty_index:begin_index - 2]
    for ii, jj in enumerate(replace_list):
        decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
    self.slat = replace_list[46].replace('"', '')
    echo(2, 'salt', self.slat)

    ''' write the decoded js to a local file '''
    with open(decoder_js_path, 'w') as f:
        f.write(';\n'.join(decode_js.split(';')))

    ''' delete the function about ajax '''
    del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
    del_begin_index = decode_js.index(del_str[0])
    result_js = decode_js[:del_begin_index] + \
        decode_js[del_begin_index + len(del_str[0]):]
    self.result_js = result_js
    self.js_compile = execjs.compile(open(hotel_js_path).read())
    echo(1, 'Load hotel index js success!!!')
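# A minimal, self-contained sketch of the substitution step above: this kind of
# obfuscated js keeps its string literals in one array and references them as
# `_0x1a2b[i]`; replacing each reference with its literal restores readable code.
# The variable names and sample snippet here are hypothetical, not taken from
# the real index.js.
def substitute_param_array(js: str) -> str:
    import re
    decl = re.search(r'var (\w+)=\[(.*?)\];', js)
    name, literals = decl.group(1), decl.group(2).split(',')
    for idx, literal in enumerate(literals):
        js = js.replace('{}[{}]'.format(name, idx), literal)
    return js

# substitute_param_array('var _0x1a2b=["ajax","salt"];f(_0x1a2b[0],_0x1a2b[1]);')
# -> 'var _0x1a2b=["ajax","salt"];f("ajax","salt");'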
def test_change_youdaoyun(self, article_id, body, article_name):
    """ change youdaoyun article demo
        @param 'buildmd/data/cookie': cookie in youdaoyun web
        @param article_id: change article No.
        @param body: change article body
        @param article_name: change article name
    """
    url = 'https://note.youdao.com/yws/api/personal/sync?method=push&keyfrom=web&cstk=E3CF_lx8'
    headers = {
        'Cookie': '',
        'Content-Type': get_content_type(),
        'Accept': get_accept('xhr'),
        'Origin': 'https://note.youdao.com',
        'Referer': 'https://note.youdao.com/web'
    }
    if not os.path.exists('%scookie' % data_dir):
        print('Youdao Note cookie not exist!!!')
        return
    with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    headers['Cookie'] = cookie[:-1]
    headers['Host'] = url.split('/')[2]
    file_list_url = 'https://note.youdao.com/yws/api/personal/file?method=listRecent&offset=0&limit=30&keyfrom=web&cstk=E3CF_lx8'
    file_data = {'cstk': 'E3CF_lx8'}
    ca = basic_req(file_list_url, 11, data=file_data, header=headers)
    if not len(ca):
        print('List Error')
        return
    change_data_origin = ca[article_id]['fileEntry']
    body_string = [
        '<?xml version="1.0"?><note xmlns="http://note.youdao.com" schema-version="1.0.3" file-version="0"><head/><body><para><coId>12-1550424181958</coId><text>',
        body,
        '</text><inline-styles/><styles/></para></body></note>'
    ]
    change_data = {
        'name': article_name,
        'fileId': change_data_origin['id'],
        'parentId': change_data_origin['parentId'],
        'domain': change_data_origin['domain'],
        'rootVersion': -1,
        'sessionId': '',
        'modifyTime': int(round(time.time())),
        'bodyString': ''.join(body_string),
        'transactionId': change_data_origin['id'],
        'transactionTime': int(round(time.time())),
        'orgEditorType': change_data_origin['orgEditorType'],
        'tags': change_data_origin['tags'],
        'cstk': 'E3CF_lx8'
    }
    print(change_data)
    cb = basic_req(url, 12, data=change_data, header=headers)
    return cb
def get_request(self, url: str, types: int, functs, header: dict = None):
    ''' retry wrapper: re-request while `functs(req)` reports a bad response '''
    if header:
        req = basic_req(url, types, header=header)
    else:
        req = basic_req(url, types)
    if functs(req):
        if can_retry(url):
            return self.get_request(url, types, functs, header)
        return
    return req
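# Hedged usage sketch of get_request above: `functs` is a predicate that
# returns True when the response is bad and should be retried (that is what
# the body implies). The url and validator here are hypothetical examples.
def fetch_playlist_json(spider):
    return spider.get_request(
        'https://example.com/api/playlist', 1,
        functs=lambda req: req is None or 'data' not in req,
        header={'Accept': 'application/json'})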
def get_goods_id_first(self, origin_url, index):
    """ get goods id first """
    origin_url = origin_url.replace('https', 'http')
    # first_result = proxy_req(origin_url, 0)
    first_result = basic_req(origin_url, 0, header=self.headers)
    if not first_result or len(first_result.find_all('script')) < 2:
        if can_retry(origin_url):
            self.get_goods_id_first(origin_url, index)
        return
    wait = first_result.find_all('script')[1].text
    if '"title":"' not in wait:
        return
    title = re.findall('"title":".*","', wait)[0].split('","')[0].split('":"')[1]
    if title in self.title2map:
        self.goods_map[index] = self.title2map[title]
        self.url2goods[origin_url] = self.title2map[title]
        print(self.title2map[title])
    else:
        print(title)
def _getroom_id(self, next_to=True, proxy=True):
    ''' get av room id '''
    url = self.ROOM_INIT_URL % self._av_id
    html = proxy_req(url, 0) if proxy else basic_req(url, 0)
    head = html.find_all('head')
    if not len(head) or len(head[0].find_all('script')) < 4 \
            or '{' not in head[0].find_all('script')[3].text:
        if can_retry(url):
            self._getroom_id(proxy=proxy)
        else:
            self._getroom_id(proxy=False)
        next_to = False  # parse failed; skip the parse block after retrying
    if next_to:
        script_list = head[0].find_all('script')[3].text
        script_begin = script_list.index('{')
        script_end = script_list.index(';')
        script_data = script_list[script_begin:script_end]
        json_data = json.loads(script_data)
        if self._p == -1 or len(json_data['videoData']['pages']) < self._p:
            self._room_id = json_data['videoData']['cid']
        else:
            self._room_id = json_data['videoData']['pages'][self._p - 1]['cid']
        print('Room_id:', self._room_id)
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    hrefs = []
    if index:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=PcVKXJKRIc7s8AXB05e4Dw&sa=N&ved=0ahUKEwjSo5nBvojgAhVONrwKHcHpBfcQ8tMDCFE&biw=1627&bih=427&dpr=2&start=' + \
            str(index * 10)
    else:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=O8VKXJ7nFoP_8QX1oK_gDA&start=0&sa=N&ved=0ahUKEwje8JTAvojgAhWDf7wKHXXQC8w4ChDy0wMISQ&biw=1627&bih=427&dpr=2'
    news_lists = basic_req(url, 0)
    href_lists = news_lists.find_all('a', class_=['RTNUJf', 'l lLrAF'])
    summarization_lists = news_lists.find_all('div', class_='gG0TJc')
    if not len(href_lists) or not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for href in href_lists:
        hrefs.append(href['href'])
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace(
            '\xa0', '').replace('\t', '').replace('...', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text)
    self.summarizations[int(index)] = texts
    self.hrefs[int(index)] = hrefs
def update_article(self, article_id: str, article_body: str):
    p = self.share2article[article_id][-2].split("/")[-1]
    article_info = self.list_recent[p]
    data = {
        "fileId": p,
        "parentId": article_info["parentId"],
        "domain": article_info["domain"],
        "rootVersion": -1,
        "sessionId": "",
        "modifyTime": int(time_stamp()),
        "bodyString": article_body,
        "transactionId": p,
        "transactionTime": int(time_stamp()),
        "orgEditorType": article_info["orgEditorType"],
        "tags": article_info["tags"],
        "cstk": self.cstk,
    }
    url = self.SYNC_URL % ("push", self.cstk)
    req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
    if req is None or list(req.keys()) != [
        "entry",
        "meta",
        "effectedShareEntries",
        "forcePullVersion",
        "effected",
    ]:
        # `req` is already parsed JSON here, so log it directly
        echo("0|error", "Update article_id {} Error".format(article_id),
             req if req is not None else "")
        return False
    echo("1|warning", "Update article_id {} Success!!!".format(article_id))
    return True
def req_ip66():
    ''' 66ip.cn js decoder '''
    header['Cookie'] = generate_cookie()
    req_text = basic_req(IP66_URL, 3, header=header)
    echo(2, req_text)
    return req_text
def generate_cookie():
    ''' eval 66ip.cn, tested on 19.5.7 '''
    req = basic_req(IP66_URL, 2, header=header)
    basic_cookie = req.cookies.get_dict()

    ''' !important: \b in py -> \x80 '''
    req_text = r'{}'.format(req.text)

    ''' get the script that would be eval-ed '''
    script_text = re.findall('<script>(.*?)</script>', req_text)[0]
    script_text = script_text.replace(
        '{eval(', '{aaa=').replace(');break', ';break')
    script_eval = r'{}'.format(js2py.eval_js(script_text + 'aaa'))
    echo(0, script_eval)
    try:
        ''' replace document & window '''
        params = re.findall(
            r'(__jsl_clearance=.*?)\'\+\(function\(\){(.*?join\(\'\'\))}\)\(\)',
            script_eval)
        wait_eval = params[0][1].replace("document.createElement('div')", "{}")
        wait_replace = re.findall(
            r'=(.{1,5}\.firstChild\.href;)', wait_eval)[0]
        wait_eval = wait_eval.replace(wait_replace, '"http://www.66ip.cn/";')

        ''' eval & encode cookie '''
        other_param = js2py.eval_js(
            'function ddd() {window={};' + wait_eval + '}ddd()')
        cookie = '{}; {}{}'.format(encoder_cookie(
            basic_cookie), params[0][0], other_param)
        echo(1, 'cookie', cookie)
        return cookie
    except:
        return generate_cookie()
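# A minimal sketch of what an `encoder_cookie`-style helper does (the real
# implementation lives elsewhere in this repo, so this is an assumption):
# flatten a cookie dict into the `key=value; key=value` form expected in a
# Cookie request header.
def encode_cookie_dict(cookies: dict) -> str:
    return '; '.join('{}={}'.format(k, v) for k, v in cookies.items())

# encode_cookie_dict({'__jsluid_h': 'abc', 'PHPSESSID': 'xyz'})
# -> '__jsluid_h=abc; PHPSESSID=xyz'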
def load_comment_v2(self, movie_id: int, start: int):
    ''' load comment by proxy '''
    url = self.COMMENT_PROXY_URL % (movie_id, start)
    self.generate_cookie()
    comment_json = basic_req(url, 1)
    if comment_json is None or 'comments' not in comment_json:
        if comment_json is not None and 'code' in comment_json:
            if comment_json['code'] == 5000:
                self.finish_list[(movie_id, start)] = 0
                self.checkpoint()
            else:  # e.g. code == 112: proxy banned or rate limited
                self.proxy_can_use = False
                echo(2, url, 'Failed')
                self.again_list.append([movie_id, start])
        else:
            self.again_list.append([movie_id, start])
            echo(0, url, 'Failed')
        return
    comment_html = comment_json['comments']
    comment = {(movie_id, ii['author']['id']): [
        ii['author']['name'], ii['author']['id'], ii['created_at'],
        ii['content'], '', ii['rating']['value']] for ii in comment_html}
    user_list = {ii['author']['id'] for ii in comment_html}
    self.user_info = {*self.user_info, *user_list}
    self.comment = {**self.comment, **comment}
    if len(user_list) == 100:
        self.more_user.append([movie_id, start + 100])
    self.finish_list[(movie_id, start)] = 0
    self.finish_list[(movie_id, start + 20)] = 0
    self.finish_list[(movie_id, start + 40)] = 0
    self.finish_list[(movie_id, start + 60)] = 0
    self.finish_list[(movie_id, start + 80)] = 0
    self.checkpoint()
def search_goods_once(self, goods_name, index):
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/items/search.json?auctionTag=&perPageSize=50&shopTag=&_tb_token_=',
        cookie[1][:-1],
        '&pvid=', cookie[2][:-1],
        '&t=', str(int(round(time.time() * 1000))),
        '&_t=', str(int(round(time.time() * 1000))),
        '&q=', goods_name
    ]
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': cookie[0][:-1],
        'Content-Type': get_content_type(),
        'Accept': get_accept('xhr'),
    }
    ca = basic_req(''.join(url_list), 2, header=headers)
    if ca.status_code != 200 or 'data' not in ca.json():
        if can_retry(''.join(url_list)):
            self.search_goods_once(goods_name, index)
        return
    page_list = ca.json()['data']['pageList']
    title = ['||'.join([str(ii['auctionId']), goods_name, str(ii['zkPrice'])])
             for ii in page_list][0]
    self.goods_name[index] = title
    print(title)
def bulk_import_alimama_once(self, index, group_id):
    """ bulk import alimama """
    url = 'http://pub.alimama.com/favorites/item/batchAdd.json'
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    # `headers` was undefined in the original; rebuilt here from the cookie
    # file, mirroring search_goods_once
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': cookie[0][:-1],
        'Content-Type': get_content_type(),
        'Accept': get_accept('xhr'),
    }
    goods_len = len(self.goods_candidate)
    begin_id = index * 200
    end_id = min(goods_len, (index + 1) * 200)
    goods_ids = self.goods_candidate[begin_id:end_id]
    update_data = {
        'groupId': group_id,
        'itemListStr': ','.join(goods_ids),
        't': str(int(round(time.time() * 1000))),
        '_tb_token_': cookie[1][:-1],
        'pvid': cookie[2][:-1]
    }
    print(update_data)
    cb = basic_req(url, 12, data=update_data, header=headers)
    if cb.status_code == 200 and cb.json()['info']['message'] != 'nologin':
        print(cb.json()['data'])
def get_cid(self, bv_id: str):
    playlist_url = self.PLAYLIST_URL % bv_id
    headers = {"Accept": "*/*", "Referer": self.ROOM_INIT_URL % bv_id}
    req = basic_req(playlist_url, 1, header=headers)
    if req is None or list(req.keys()) != self.JSON_KEYS:
        return
    cid = [ii["cid"] for ii in req["data"]]
    return cid
def get_download(self, types: str):
    url = "https://www.proxy-list.download/api/v0/get?l=en&t=" + types
    tt = basic_req(url, 1)
    if tt is None:
        return []
    tt_list = tt[0]["LISTA"]
    echo(1, "Get download", types, len(tt_list))
    return ["{}:{}".format(ii["IP"], ii["PORT"]) for ii in tt_list]
def get_free_proxy(self, url: str):
    req = basic_req(url, 2)
    if req is None:
        return []
    tt = req.text
    t_list = re.findall(r"<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>", tt)
    echo(1, "Get Free proxy List", url, len(t_list))
    return ["{}:{}".format(ii, jj) for ii, jj in t_list]
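# A quick self-check of the proxy-table regex above on a hypothetical row
# (the HTML snippet is made up, not captured from any real proxy-list page):
def _test_free_proxy_regex():
    import re
    sample = '<tr><td>1.2.3.4</td><td>8080</td></tr>'
    pattern = r'<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>'
    assert re.findall(pattern, sample) == [('1.2.3.4', '8080')]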
def get_request_v3(self, url, types):
    result = basic_req(url, 0)
    if result is None or not result or not len(result.find_all('p', class_='content')):
        if can_retry(url):
            return self.get_request_v3(url, types)
        return
    return result
def get_request(self, url, types):
    result = basic_req(url, 1)
    if not result:
        if can_retry(url):
            return self.get_request(url, types)
        return
    return result
def get_ynote_file(self, offset: int = 0):
    url = self.LISTRECENT_URL % (offset, self.cstk)
    data = {"cstk": self.cstk}
    req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
    if req is None or not isinstance(req, list):
        return None
    list_recent = {ii["fileEntry"]["id"]: ii["fileEntry"] for ii in req}
    self.list_recent = {**self.list_recent, **list_recent}
    echo(1, "Load ynote file {} items.".format(len(self.list_recent)))
    return req
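# Inferred from the literal URLs in test_change_youdaoyun above; the real
# LISTRECENT_URL / SYNC_URL constants live elsewhere in the repo, so treat
# these templates as an assumption:
LISTRECENT_URL = ("https://note.youdao.com/yws/api/personal/file"
                  "?method=listRecent&offset=%d&limit=30&keyfrom=web&cstk=%s")
SYNC_URL = "https://note.youdao.com/yws/api/personal/sync?method=%s&keyfrom=web&cstk=%s"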
def basic_view(self, url: str, times: int, types: int):
    ''' press view, no data input '''
    url = self.AV_URL
    if types == 1:
        html = proxy_req(url, 0)
    else:
        html = basic_req(url, 0)
    if html is False and times < 5:
        self.basic_view(url, times + 1, types)
def get_share_info(self, share_id: str):
    changeJsonTimeout(4)
    url = self.GET_SHARE_URL % share_id
    headers = self.get_tb_headers(self.Y_URL)
    req = basic_req(url, 1, header=headers)
    if req is None:
        return
    info = req["entry"]
    self.share2article[share_id] = (info["name"].replace('.note', ''),
                                    info["id"], info["lastUpdateTime"])
    return req
def user_action(self, hotel_id: int = 4889292):
    url = '{}hotel/{}.html'.format(HOTELS_URL, hotel_id)
    text = basic_req(url, 3)
    page_id = int(re.findall(r'id="page_id" value="(\d*?)" />', text)[0])
    correlation_id = re.findall(r'relationId" value="(\d*?)"/>', text)[0]
    e = self.login_cookie()['_bfa'].split('.')
    common = [
        page_id, e[1] + '.' + e[2], int(e[6]), int(e[7]), correlation_id,
        "M:70,181023_hod_fxtj:B;", '', '2.6.9', "vq5tkk-ufpyck-qsxbg3",
        "", "", "", "", "", "online"
    ]
    _queue = [{
        'action': 'click',
        'xpath': "HTML/BODY[@id='mainbody']/FORM[@id='aspnetForm']/DIV[3][@id='base_bd']/DIV[4]/DIV[@id='divDetailMain']/DIV[9][@id='id_room_select_box']/DIV[2]/DIV/DIV/A[@id='changeBtn'][@x='{}'][@y='{}'][@rx='{}'][@ry='{}']".format(
            random.randint(50, 80), random.randint(650, 750),
            random.randint(20, 40), random.randint(5, 20)),
        'ts': int(time.time() * 1000),
    }]
    ee = [[2, "useraction"], common, _queue]
    eee = json.dumps(ee, separators=(',', ':'))
    print(eee)
    compress = execjs.compile(open(compress_path).read())
    eeee = compress.call('compress', eee)
    echo(2, eeee)
    cookie = {'uid': 'Yn17vOkRm2gW+jCNwT8jPg=='}
    header = {
        'Referer': 'https://hotels.ctrip.com/hotel/4889292.html',
        'Cookie': self.encoder_cookie(cookie),
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3777.0 Safari/537.36',
    }
    url = 'https://s.c-ctrip.com/bf.gif?ac=a&d={}&jv=1.0.0'.format(eeee)
    req = basic_req(url, 2, header=header)
    echo(0, req.cookies.get_dict())
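# Note on the json.dumps call above: separators=(',', ':') produces the
# compact encoding (no spaces after ',' or ':'), which must match what the
# beacon's js compressor saw, e.g.:
# json.dumps([[2, "useraction"]], separators=(',', ':')) -> '[[2,"useraction"]]'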
def decoder_tpwd(self, tpwd: str):
    """ decode the tpwd from taokouling """
    url = self.DECODER_TPWD_URL % (self.api_key, tpwd)
    req = basic_req(url, 1)
    if req is None or isinstance(req, str) or 'ret' not in req:
        return {}
    return req
def basic_press(self, url, times, types):
    """ press view, no data input """
    url = url + str(int(round(time.time() * 1000)))
    if types == 1:
        html = proxy_req(url, 1)
    else:
        html = basic_req(url, 1)
    if html is False and times < 5:
        self.basic_press(url, times + 1, types)
def get_api(self):
    API_KEY = "xxx"
    url = "http://api.scraperapi.com/?api_key={}&url=http://httpbin.org/ip".format(
        API_KEY)
    t_list = []
    for ii in range(38):
        tt = basic_req(url, 1)
        if tt is None:
            continue
        t_list.append(tt["origin"])
    echo(1, "Get scraperapi", len(t_list))
    return t_list
def get_goods_second(self, url, index):
    second_result = basic_req(url, 0, header=self.headers)
    # second_result = proxy_req(url, 0)
    if not second_result or not len(second_result.find_all('input')):
        if can_retry(url):
            self.get_goods_second(url, index)
        return
    goods_id = second_result.find_all('input')[6]['value']
    print(goods_id)
    self.goods_map[index] = goods_id
def request_text(self, url):
    ''' request text '''
    req = basic_req(url, 2)
    if req is None:
        echo(0, url)
        if can_retry(url):
            return self.request_text(url)
        return ''
    echo(1, url)
    return req.text
def load_city_list(self):
    ''' load city list '''
    text = basic_req(self.MDD_URL, 3)
    city_list = re.findall(
        '/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(</a>|<span)',
        text)
    id2map = {int(ii[0]): ii[1].strip() for ii in city_list if ii[0].isdigit()}
    self.city_list = id2map.keys()
    self.id2map = id2map
def judge_url(self, urls: str, index: int, times: int, ss_test: bool = False):
    """ use /api/playlist to judge http; use /discover/playlist to judge https
        1. don't set timeout = 5
        2. response.result.tracks.size() != 1
    """
    http_type = urls[4] == "s"
    proxies = {type_map[http_type]: urls}
    test_url = (type_map[http_type] +
                "://music.163.com/api/playlist/detail?id=432853362")
    ss_url = "https://www.google.com/?gws_rd=ssl"
    try:
        data = basic_req(test_url, 1, proxies)
        result = data["result"]
        tracks = result["tracks"]
        if len(tracks) == 10:
            if times < 0:
                self.judge_url(urls, index, times + 1)
            else:
                echo("1|debug", urls, proxies, "Proxies can use.")
                self.canuse_proxies.append(urls)
                self.can_use_ip[index] = [urls, int(http_type)]
                if ss_test:
                    data = basic_req(ss_url, 0)
                    if len(str(data)) > 5000:
                        self.can_use_ip[index] = [urls, int(http_type) + 2]
        else:
            echo("0|debug", urls, proxies, "Tracks len error ^--<^>--^ ")
            self.cannot_use_ip[index] = urls
    except:
        echo("0|debug", urls, proxies, "return error [][][][][][]")
        if index not in self.can_use_ip:
            self.cannot_use_ip[index] = urls
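# judge_url above relies on a module-level `type_map`; a minimal sketch of the
# assumed mapping from the `urls[4] == "s"` scheme check to a requests-style
# proxies key (this definition is inferred, not copied from the repo):
type_map = {False: "http", True: "https"}

# {type_map["https://1.2.3.4:8080"[4] == "s"]: "https://1.2.3.4:8080"}
# -> {"https": "https://1.2.3.4:8080"}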
def load_av_lists(self):
    url = self.MEMBER_SUBMIT_URL % self.assign_up_mid
    json_req = basic_req(url, 1)
    if json_req is None or 'data' not in json_req or 'vlist' not in json_req['data']:
        if can_retry(url):
            self.load_av_lists()
        return
    av_id_map = {ii['aid']: ii for ii in json_req['data']['vlist']}
    if self.basic_av_id not in av_id_map:
        if can_retry(url):
            self.load_av_lists()
        return
    self.av_id_map = av_id_map