Example #1
    def load_spot_once(self, pn=1, city_id=10186):
        ''' load spot once '''
        data = {
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
            'iMddid': city_id,
            'iTagId': 0,
            'iPage': pn,
        }
        data = self.load_sn(data)
        print(data)
        req = get_request_proxy(self.AJAX_ROUTER_URL, 11, data=data)
        if req is None or 'data' not in req or 'list' not in req['data']:
            if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
                self.load_spot_once(pn, city_id)
            return
        spot_list = req['data']['list']
        spot_pn = req['data']['page']
        spot_tmp = re.findall('<h3>.*?(.*?)</h3>', spot_list)
        try:
            total_pn = int(re.findall('共<span>(.*?)</span>', spot_pn)[0])
        except Exception as e:
            total_pn = 1
            echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)

        if city_id not in self.spot_result:
            self.spot_result[city_id] = spot_tmp
        else:
            self.spot_result[city_id] += spot_tmp
        self.spot_pn[city_id] = total_pn
Example #2
    def load_gather(self):
        """
        load gather proxy pool text
        If failured, you should reactive the cookie.
        """
        headers = {
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'Host': 'www.gatherproxy.com',
            'Origin': 'http://www.gatherproxy.com',
            'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding": "",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        url = 'http://www.gatherproxy.com/subscribe/infos'
        sid_url_req = requests.get(url, headers=headers, verify=False)
        sid_url_html = BeautifulSoup(sid_url_req.text, 'html.parser')
        sid_url = sid_url_html.find_all('div', class_='wrapper')[1].find_all('a')[0]['href']
        if len(sid_url.split('sid=')) < 2:
            echo(0, 'cookie error')
            self.get_cookie()
            self.load_gather()
            return
        sid = sid_url.split('sid=')[1]
        sid_url = 'http://www.gatherproxy.com' + sid_url

        data = {'ID': sid, 'C': '', 'P': '', 'T': '', 'U': '0'}
        gatherproxy = requests.post(sid_url, headers=headers, data=data, verify=False)
        with codecs.open(data_dir + 'gatherproxy', 'w', encoding='utf-8') as f:
            f.write(gatherproxy.text)
Example #3
 def js_compile_sn(self, prepare_map):
     ''' js compile sn '''
     wait_js = '<script>' + self.result_js + '</script>'
     sn = self.js_compile.call('analysis_js', wait_js, self.slat,
                               prepare_map)
     echo(2, '_sn', sn)
     return sn
Example #4
    def prepare_js(self):
        ''' prepare js '''
        pre_text = basic_req(self.JD_URL, 3)
        INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t',
                                  pre_text)[0].split('"')[1]
        origin_js = basic_req(INDEX_JS_URL, 3)
        ''' decoder js '''
        decode_js = codecs.unicode_escape_decode(origin_js)[0]
        ''' params replace '''
        replace_list_str = decode_js.split(';')[2]
        empty_index = replace_list_str.index(' ') + 1
        begin_index = replace_list_str.index('=[') + 2
        end_index = replace_list_str.index(']')
        replace_list = replace_list_str[begin_index:end_index].split(',')
        rp = replace_list_str[empty_index:begin_index - 2]
        for ii, jj in enumerate(replace_list):
            decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
        self.slat = replace_list[46].replace('"', '')
        echo(2, 'salt', self.slat)
        ''' load to local '''
        with open(decoder_js_path, 'w') as f:
            f.write(';\n'.join(decode_js.split(';')))
        ''' del function about ajax '''
        del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
        del_begin_index = decode_js.index(del_str[0])

        result_js = decode_js[:del_begin_index] + \
            decode_js[del_begin_index + len(del_str[0]):]
        self.result_js = result_js
        self.js_compile = execjs.compile(open(hotel_js_path).read())
        echo(1, 'Load hotel index js success!!!')
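The ''' params replace ''' block above inlines a JS obfuscation array: the third ';'-separated statement declares something like `var _x=[...]`, and every `_x[i]` reference is then replaced with the i-th literal. A toy sketch of the same slicing, on made-up JS rather than real decoded output:

    # toy input mimicking the decoded js: statement [2] holds the array
    decode_js = 'var a=1;var b=2;var _x=["foo","bar"];f(_x[0],_x[1])'
    replace_list_str = decode_js.split(';')[2]          # 'var _x=["foo","bar"]'
    empty_index = replace_list_str.index(' ') + 1       # start of the array name
    begin_index = replace_list_str.index('=[') + 2      # first element
    end_index = replace_list_str.index(']')
    replace_list = replace_list_str[begin_index:end_index].split(',')
    rp = replace_list_str[empty_index:begin_index - 2]  # '_x'
    for ii, jj in enumerate(replace_list):
        decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
    print(decode_js)  # ...;f("foo","bar")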
Example #5
    def dbcanuseproxy(self):
        """
        test db have or not this data
        """

        results = self.selectproxy([ii[0] for ii in self.canuseip.values()])
        ss_len = len([1 for ii in self.canuseip.values() if ii[1] > 1])
        echo(2, "SS proxies %d"%ss_len)

        insertlist = []
        updatelist = []
        ipmap = {}
        if results is not False:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.canuseip.values():
                http_type = ip_now[1]
                ip_now = ip_now[0]
                if ip_now in ipmap:
                    if ipmap[ip_now][1]:
                        updatelist.append(
                            (ipmap[ip_now][0], ip_now, http_type, 0))
                else:
                    insertlist.append((ip_now, http_type))
            if len(insertlist):
                self.insertproxy(insertlist)
            if len(updatelist):
                self.updateproxy(updatelist, 0)
        self.canuseip = {}
Example #6
    def initproxy(self):
        """
        init proxy list
        """

        results = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if results != 0:

            for index in results:
                if index[1] == 1:
                    self.proxylists.append(index[0])
                elif index[1] == 2:
                    self.proxylist.append(index[0])
                    self.proxylist_ss.append(index[0])
                elif index[1] == 3:
                    self.proxylists.append(index[0])
                    self.proxylists_ss.append(index[0])
                else:
                    self.proxylist.append(index[0])
            echo(2, len(self.proxylist), 'http proxies can be used.')
            echo(2, len(self.proxylists), 'https proxies can be used.')
            echo(2, len(self.proxylist_ss), 'ss http proxies can be used.')
            echo(2, len(self.proxylists_ss), 'ss https proxies can be used.')
        else:
            echo(0, 'Please check the db configuration!!! The proxy pool cannot be used!!!')
Example #7
 def v(eval_func,type):
     u.echo(f,f'({type}) eval:',end='',flush=True)
     t_SAC, b_SAC = eval_func(evidence,tac_posteriors)
     u.echo(f,f' {t_SAC:.2f} sec'
              f'\n  {1000*t_SAC/bsize:.0f} ms per example, used batch size {b_SAC}'
              f'\n  {t_SAC/t_AC:.2f} {type}/ac ')
     return t_SAC, b_SAC
Example #8
    def parse_detail(self):
        ''' parse hotel detail '''

        version = begin_time()
        text = self.get_hotel_detail()
        html = BeautifulSoup(text['html'], 'html.parser')
        trs = html.findAll('tr')[2:]
        hotel_detail = []

        for tr in trs:
            room_name = re.findall('baseroomname="(.*?)"', str(tr))
            if not len(room_name):
                room_name = re.findall('rel="nofollow">\n(.*?)\n', str(tr))
            room_name = room_name[0].strip() if len(
                room_name) else hotel_detail[-1][0]
            price = re.findall(r'</dfn>(\d{4,5}?)</span>', str(tr))
            if not len(price):
                continue
            else:
                price = price[0]
            price_type = re.findall('room_type_name">(.*?)</span>', str(tr))[0]
            if 'em' in price_type:
                price_type = ','.join([
                    *re.findall('(.*?)<em', price_type),
                    *re.findall('（(.*?)）', price_type)
                ])
            hotel_detail.append([room_name, price_type, price])
        output_dir = '{}hotelDetail.txt'.format(data_dir)
        with open(output_dir, 'w') as f:
            f.write('\n'.join([','.join(ii) for ii in hotel_detail]))
        echo(
            1, 'Load {} price\nOutput path: {}\nSpend time: {:.2f}s'.format(
                len(hotel_detail), output_dir, end_time(version, 0)))
        return hotel_detail
Example #9
 def get_other_proxies(self, url):
     ''' get other proxies '''
     text = self.request_text(url)
     pages = re.findall(r'<h3[\s\S]*?<a.*?(http.*?\.html).*?</a>', '' if text is None else text)
     if not len(pages):
         echo(0, 'Please do not frequently request {}!!!'.format(url))
     else:
         proxies = [re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', self.request_text(ii)) for ii in pages]
         self.waitjudge = [*self.waitjudge, *sum(proxies, [])]
Example #10
 def insertproxy(self, insertlist):
     """
     insert data to db
     """
     results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1])
     if results:
         echo(2, 'Insert ' + str(len(insertlist)) + ' items Success!')
Example #11
 def load_proxies_test(self):
     ''' load mode & test proxies '''
     start = time.time()
     self.load_proxies_list()
     proxies_len = len(self.waitjudge)
     self.threadjude()
     canuse_len = len(self.canuse_proxies)
     echo(1, '\nTotal Proxies num: {}\nCan use num: {}\nTime spend: {:.2f}s\n'.format(proxies_len, canuse_len, time.time() - start))
     with open('{}canuse_proxies.txt'.format(data_dir), 'w') as f:
         f.write('\n'.join(self.canuse_proxies))
Example #12
    def get_request_proxy(self, url:str, types:int, data=None, test_func=None, header=None):
        """
        use proxy to send requests, and record the proxy cann't use
        @types S0XY: X=0.->get;   =1.->post;
                     Y=0.->html;  =1.->json; =2.->basic
                     S=0.->basic ;=1.->ss

        support failured retry && failured auto record
        """

        httptype = url[4] == 's'
        ss_type = types // 1000
        types %= 1000
        if ss_type:
            proxylist = self.proxylists_ss if httptype else self.proxylist_ss
        else:
            proxylist = self.proxylists if httptype else self.proxylist

        if not len(proxylist):
            if self.Db.db:
                echo(0, 'Proxy pool empty!!! Please check the db conn & db dataset!!!')
            proxies, proxies_url = {}, None
        else:
            index = random.randint(0, len(proxylist) - 1)
            proxies_url = proxylist[index]
            proxies = {type_map[httptype]: proxies_url}

        try:
            result = basic_req(url, types, proxies, data, header)
            if test_func is not None:
                if not test_func(result):
                    if self.check_retry(url):
                        return self.get_request_proxy(
                            url, types + 1000 * ss_type, data, test_func)
                    else:
                        self.failuredtime[url] = 0
                        return
                else:
                    return result
            else:
                return result

        except Exception:
            self.cannotuseip[random.randint(0, MAXN)] = proxies_url

            if proxies_url in proxylist:
                proxylist.remove(proxies_url)

            if not len(self.cannotuseip.keys()) % 10:
                self.cleancannotuse()

            if self.check_retry(url):
                return self.get_request_proxy(url, types + 1000 * ss_type, data, test_func)
            else:
                return
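The bit layout described in the docstring can be checked by hand; a minimal sketch with a hypothetical types value:

    # hypothetical call: types=1011 means S=1 (ss pool), X=1 (post), Y=1 (json)
    types = 1011
    ss_type = types // 1000   # 1 -> choose proxylists_ss / proxylist_ss
    types %= 1000             # 11 -> passed on to basic_req (post + json)
    assert (ss_type, types) == (1, 11)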
Example #13
    def updateproxy(self, updatelist, types):
        """
        update data to db
        """

        results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1])
        typemap = {0: 'can use ', 1: 'can not use '}
        if results:
            echo(2, 'Update', typemap[types], str(len(updatelist)), 'items Success!')
Example #14
 def request_text(self, url):
     ''' requests text '''
     req = basic_req(url, 2)
     if req is None:
         echo(0, url)
         if can_retry(url):
             return self.request_text(url)
         else:
             return ''
     else:
         echo(1, url)
         return req.text
Example #15
 def get_hotel_detail(self):
     ''' get hotel detail '''
     params = {
         **self.generate_other_params(),
         'callback': self.generate_callback(16),
         'eleven': self.generate_eleven(),
         '_': int(time.time() * 1000)
     }
     params_list = [
         '{}={}'.format(ii, (jj if jj is not None else ''))
         for ii, jj in params.items()
     ]
     url = '{}?{}'.format(HOTEL_ROOMLIST_FOR_DETAIL_URL,
                          '&'.join(params_list))
     echo(2, 'XHR url', url)
     text = basic_req(url, 1)
     return text
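The manual '&'.join above builds the query string without percent-escaping. For reference, a sketch of the closest stdlib equivalent (urllib.parse.urlencode, with a hypothetical params dict and endpoint); note that urlencode escapes values, which may or may not be acceptable for the `eleven` token:

    from urllib.parse import urlencode

    params = {'callback': 'cb0123456789abcdef', 'eleven': None, '_': 1555555555000}
    query = urlencode({k: ('' if v is None else v) for k, v in params.items()})
    url = 'https://example.com/roomlist?{}'.format(query)  # hypothetical endpoint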
Example #16
    def sixsixip(self, area, page):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                echo(2, str(index) + ' ' + str(pageindex))
                work = threading.Thread(
                    target=self.sixsixthread, args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)
Example #17
 def load_proxies_list(self, types=2):
     ''' load proxies '''
     SITES = ['http://www.proxyserverlist24.top/', 'http://www.live-socks.net/']
     self.waitjudge = []
     for site in SITES:
         self.get_other_proxies(site)
     if os.path.exists('{}gatherproxy'.format(data_dir)):
         self.gatherproxy(3)
     waitjudge = list(set(self.waitjudge))
     waitjudge_http = ['http://' + ii for ii in waitjudge]
     waitjudge_https = ['https://' + ii for ii in waitjudge]
     if not types:
         self.waitjudge = waitjudge_http
     elif types == 1:
         self.waitjudge = waitjudge_https
     else:
         self.waitjudge = (waitjudge_http + waitjudge_https)
     echo(1, '-_-_-_-_-_-_-', len(waitjudge), 'Proxies wait to judge -_-_-_-_-_-_-')
Example #18
    def judgeurl(self, urls, index, times, ss_test=False):
        """
        use /api/playlist to judge http; use /discover/playlist judge https
        1. don't timeout = 5
        2. response.result.tracks.size() != 1
        """

        http_type = urls[4] == 's'
        proxies = {type_map[http_type]: urls}

        test_url = type_map[http_type] + '://music.163.com/api/playlist/detail?id=432853362'
        ss_url = 'https://www.google.com/?gws_rd=ssl'
        try:
            data = basic_req(test_url, 1, proxies)
            result = data['result']
            tracks = result['tracks']
            if len(tracks) == 56:
                if times < 0:
                    self.judgeurl(urls, index, times + 1)
                else:
                    echo(1, urls, proxies, 'Proxies can use.')
                    self.canuse_proxies.append(urls)
                    self.canuseip[index] = [urls, int(http_type)]
                    if ss_test:
                        data = basic_req(ss_url, 0)
                        if len(str(data)) > 5000:
                            self.canuseip[index] = [urls, int(http_type) + 2]
            else:
                echo(0, urls, proxies, 'Tracks len error ^--<^>--^ ')
                self.cannotuseip[index] = urls
        except Exception:
            echo(0, urls, proxies, 'return error [][][][][][]')
            if index not in self.canuseip:
                self.cannotuseip[index] = urls
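Both judgeurl and get_request_proxy sniff the scheme with index 4 of the url: the character right after 'http' is 's' exactly for https. A minimal illustration with a hypothetical proxy string and an assumed shape for the type_map global:

    urls = 'http://1.2.3.4:8080'               # hypothetical proxy
    http_type = urls[4] == 's'                 # False -> plain http
    type_map = {False: 'http', True: 'https'}  # assumed shape of the global
    proxies = {type_map[http_type]: urls}      # {'http': 'http://1.2.3.4:8080'}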
Example #19
    def get_cookie(self):
        """
        make cookie login
        PS: Though cookie expired time is more than 1 year,
            but It will be break when the connect close.
            So you need reactive the cookie by this function.
        """
        headers = {
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'Host': 'www.gatherproxy.com',
            'Origin': 'http://www.gatherproxy.com',
            'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding": "",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        login_url = 'http://www.gatherproxy.com/subscribe/login'

        cookie_html = basic_req(login_url, 0, header=headers)
        verify_text = cookie_html.find_all('div', class_='label')[2].span.text
        verify_list = verify_text.replace('= ','').strip().split()
        num_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4,
                   'Five': 5, 'Fine': 5, 'Six': 6, 'Seven': 7, 'Eight': 8,
                   'Nine': 9, 'Ten': 10}
        verify_num = [verify_list[0], verify_list[2]]
        for index, num in enumerate(verify_num):
            if num.isdigit():
                verify_num[index] = int(num)
            elif num in num_map:
                verify_num[index] = num_map[num]
            else:
                echo(0, 'Error', index)
                # return False
        verify_code = 0
        error = True

        operation = verify_list[1]
        if operation in ('+', 'plus', 'add'):
            verify_code = verify_num[0] + verify_num[1]
            error = False
        if operation in ('-', 'minus'):
            verify_code = verify_num[0] - verify_num[1]
            error = False
        if operation in ('X', 'multiplication', 'multiplied'):
            verify_code = verify_num[0] * verify_num[1]
            error = False
        if error:
            echo(0, 'Error', operation)
        if not os.path.exists('%spassage' % data_dir):
            echo(0, 'gather passage does not exist!!!')
            return
        with codecs.open('%spassage' % data_dir, 'r', encoding='utf-8') as f:
            passage = [index[:-1] for index in f.readlines()]
        data = {'Username': passage[0], 'Password': passage[1], 'Captcha': str(verify_code)}
        time.sleep(2.163)
        r = requests.session()
        r.cookies = cj.LWPCookieJar()
        login_req = r.post(login_url, headers=headers, data=data, verify=False)
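The captcha is a word-arithmetic phrase such as 'Three + Seven = '. A compact sketch of the same parse-and-solve flow on that hypothetical sample:

    num_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5,
               'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9, 'Ten': 10}
    ops = {'+': lambda a, b: a + b, 'plus': lambda a, b: a + b,
           '-': lambda a, b: a - b, 'minus': lambda a, b: a - b,
           'X': lambda a, b: a * b, 'multiplication': lambda a, b: a * b}
    verify_text = 'Three + Seven = '          # hypothetical captcha text
    left, op, right = verify_text.replace('= ', '').strip().split()
    to_int = lambda w: int(w) if w.isdigit() else num_map[w]
    print(ops[op](to_int(left), to_int(right)))  # -> 10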
Example #20
    def xiciproxy(self, page):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        The first proxy I use, but now it can not use it mostly.
        """

        if not str(page).isdigit():
            echo(0, "Please input num!")
            return []

        version = begin_time()
        url = 'http://www.xicidaili.com/nn/%d'
        for index in range(1, page + 1):
            html = basic_req(url % index, 0)
            tem = html.find_all('tr')
            for tr_index in range(1, len(tem)):
                tds = tem[tr_index].find_all('td')
                ip = tds[5].text.lower()
                self.waitjudge.append(
                    ip + '://' + tds[1].text + ':' + tds[2].text)
        self.threadjude()
        end_time(version)
Example #21
 def gatherproxy(self, types):
     """
     :100: very nice website
     first of all you should download proxy ip txt from:
     http://www.gatherproxy.com/zh/proxylist/country/?c=China
     """
     if not os.path.exists('{}gatherproxy'.format(data_dir)):
         echo(0, 'Gather file not exist!!!')
         return
     with codecs.open('{}gatherproxy'.format(data_dir), 'r', encoding='utf-8') as f:
         file_d = [ii.strip() for ii in f.readlines()]
     waitjudge_http = ['http://' + ii for ii in file_d]
     waitjudge_https = ['https://' + ii for ii in file_d]
     if not types:
         self.waitjudge += waitjudge_http
     elif types == 1:
         self.waitjudge += waitjudge_https
     elif types == 2:
         self.waitjudge += (waitjudge_http + waitjudge_https)
     else:
         self.waitjudge += file_d
     echo(2, 'load gather over!')
Example #22
    def load_spot(self, batch_size=50):
        ''' load spot '''
        version = begin_time()
        self.load_city_list()
        # self.city_list = [10186]
        city_threading = [
            threading.Thread(target=self.load_spot_once, args=(
                1,
                ii,
            )) for ii in self.city_list
        ]
        shuffle_batch_run_thread(city_threading, 150)

        spot_continue = []
        for ii, jj in self.spot_pn.items():
            spot_continue += [
                threading.Thread(target=self.load_spot_once, args=(
                    pn,
                    ii,
                )) for pn in range(2, jj + 1)
            ]

        shuffle_batch_run_thread(spot_continue, 150)
        output = [
            '{},{}'.format(self.id2map[ii], ','.join(jj))
            for ii, jj in self.spot_result.items()
        ]
        output_path = '{}spot.txt'.format(data_dir)
        with open(output_path, 'w') as f:
            f.write('\n'.join(output))
        city_num = len(self.city_list)
        spot_num = sum([len(ii) for ii in self.spot_result.values()])
        echo(
            1,
            'City num: {}\nSpot num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'
            .format(city_num, spot_num, output_path, end_time(version, 0)))
Example #23
def eval_all(sizes,output,testing):
    circuit_type = 'TAC' if testing else 'AC'
    fname = paths.exp / u.time_stamp(f'eval_rect_{output}_{testing}','txt')
    f     = open(fname,'w+')
    u.echo(f,f'\n===Rectangle: evaluating {circuit_type} for {output}===')
    u.echo(f,'output logged into logs/exp/')
    start_time = time.time()
    for size in sizes: 
        eval(f,size,output,testing)
    all_time = time.time() - start_time
    u.echo(f,f'\nTotal Time: {all_time:.3f} sec') 
    f.close()
Example #24
def test_tw_reduction(ssize):

    saved = u.verbose
    u.verbose = False
    
    # vcount: number of network vars
    # scount: max number of values per var
    # pcount: max number of parents per var
    
    counts = (75,2,4), (100,3,5) # vcount, scount, pcount
    fperct = (1/4,1/2,2/3,4/5)   # percentage of functional vars
    
    fname = f'TW{ssize}_C{counts}_P{fperct}'
    fname = paths.exp / u.time_stamp(fname,'txt')
    f     = open(fname,'w+')
    
    u.echo(f,'\n===Reduction in TreeWidth')
    u.echo(f,f'sample size {ssize}')
    u.echo(f,'output logged into logs/exp/')
    
    start_time = time.perf_counter()
    
    for vcount, scount, pcount in counts:
        back = vcount - 1
        for functional_fraction in fperct: 
            fcount = int(vcount * functional_fraction)
            w1_sample = []
            w2_sample = []
            for _ in range(ssize):
                bn, _, _ = rbn.get(vcount,scount,pcount,fcount,back,testing=False)
                bn1      = bn.copy_for_inference()
                bn2, _, (w1,w2) = decouple.get(bn1,[],False,'minfill',None)
                w1_sample.append(w1)
                w2_sample.append(w2)
                
            reduction = [w1-w2 for w1,w2 in zip(w1_sample,w2_sample)]

            rd_mean, rd_stdev = s.mean(reduction), s.stdev(reduction)
            w1_mean, w1_stdev = s.mean(w1_sample), s.stdev(w1_sample)
            w2_mean, w2_stdev = s.mean(w2_sample), s.stdev(w2_sample)
            
            u.echo(f,f'\n== vcount {vcount}, scount {scount}, pcount {pcount}, fcount {functional_fraction:.2f}, ')
            u.echo(f,f'before mean {w1_mean:.1f} stdev {w1_stdev:.1f}')
            u.echo(f,f'after  mean {w2_mean:.1f} stdev {w2_stdev:.1f}')
            u.echo(f,f'reduce mean {rd_mean:.1f} stdev {rd_stdev:.1f}')
    
    all_time = time.perf_counter() - start_time
    u.echo(f,f'\n===Total Time: {all_time:.3f} sec') 
    f.close()
    u.verbose = saved
Example #25
    def generate_eleven(self):
        ################################################################
        #
        #   [generate eleven] version 19.4.21 (Test ✔️) written by gunjianpan
        #
        #   1. randomly generate a 15-character param `callback`;
        #   2. use callback to request OCEANBALL -> get the origin js;
        #   3. eval once -> (match the array, then chr() it) -> decoded js;
        #   4. replace document and window (you can also use execjs & jsdom);
        #   5. warning: you should replace `this` with some params,
        #      otherwise you will get `老板给小三买了包, 却没有给你钱买房`
        #      ("the boss bought his mistress a bag but gave you no money
        #       for a house", a decoy string);
        #   6. finish, return, and join the params;
        #
        ################################################################

        callback = self.generate_callback(15)
        now_time = int(time.time() * 1000)
        url = '{}?callback={}&_={}'.format(OCEANBALL_URL, callback, now_time)
        referer_url = HOTEL_DETAIL_URL % self.default_hotel_id
        changeHeaders({'Referer': referer_url})
        oceanball_js = basic_req(url, 3)
        array = re.findall(r'\(\[(.*)\],', oceanball_js)[0].split(',')
        array = [int(ii) for ii in array]
        offset = int(re.findall(r'item-(\d*?)\)', oceanball_js)[0])
        ''' String.fromCharCode '''
        oe = ''.join([chr(ii - offset) for ii in array])
        ''' replace window[callback] callback function '''
        replace_str = re.findall(r'{}\(new.*\)\);'.format(callback), oe)[0]
        eleven_params = re.findall(
            r'{}\(new.*\+ (.*?) \+.*\)\);'.format(callback), oe)[0]
        replaced_str = 'return {};'.format(eleven_params)
        oe = oe.replace(replace_str, replaced_str)
        oe = oe.replace('\'', '"').replace('\r', '')
        oe = oe.replace(';!', 'let aaa = ', 1)

        replace = '''
        function(){let href='https://hotels.ctrip.com/hotel/4889292.html';
            a={'documentElement': {'attributes':{}}};
            b={};
            function c(){};
            userAgent ='Chrome/73.0.3682.0';
            geolocation = 0;
        '''
        ''' replace document & windown & navigator '''
        oe = oe.replace('document.body.innerHTML.length',
                        '888').replace('document.body.innerHTML', '""')
        oe = oe.replace('document.createElement("div")', '{}')
        oe = oe.replace('window.HTMLSpanElement',
                        'c').replace('document.createElement("span")', '1')
        oe = oe.replace('window.location.href',
                        'href').replace('location.href', 'href')
        oe = oe.replace('navigator.', '')
        oe = oe.replace('new Image().', '')
        oe = oe.replace('document.all', '0').replace('document.referrer', '""')
        oe = oe.replace('this || ', '')
        oe = oe.replace('window["document"]', 'a')

        oe = oe.replace('document', 'a').replace('window', 'b')
        oe = oe.replace('function(){', replace, 1)
        ''' eval script '''
        eleven = js2py.eval_js(oe)
        echo(1, 'eleven', eleven)
        return eleven
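Step 3 of the comment block (the String.fromCharCode decode) is small enough to illustrate with made-up numbers: the oceanball js carries an int array plus an offset `item-<n>`, and chr(ii - offset) recovers the source text.

    array = [128, 107, 124, 42]   # made-up values, as matched by r'\(\[(.*)\],'
    offset = 10                   # made-up, as matched by r'item-(\d*?)\)'
    print(''.join(chr(ii - offset) for ii in array))  # -> 'var '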
Example #26
def __posterior_time(f,bn,inputs,output,bsize,min_ac,max_ac,counter):
    s_time = time.perf_counter()
    AC = tac.TAC(bn,inputs,output)
    t = time.perf_counter()-s_time
    
    if AC.size < min_ac*1000000 or AC.size > max_ac*1000000: 
        return None
    
    u.echo(f,f'\n== {counter} ==\nTensor AC:',end='')
    u.echo(f,f' {t:.1f} sec')
    u.echo(f,f'  size {AC.size:,}, max binary rank {AC.binary_rank:0.1f}')
    
    # get evidence
    cards          = tuple(bn.node(input).card for input in inputs)
    evidence       = data.evd_random(bsize,cards)
    
    # evaluate AC as tf graph with batch
    u.echo(f,f'(tf full) eval:',end='',flush=True)
    tac_posteriors, t_AC, b_AC = AC.evaluate(evidence,report_time=True)    
    u.echo(f,f' {t_AC:.2f} sec'
             f'\n  {1000*t_AC/bsize:.0f} ms per example, used batch size {b_AC}'
             f'\n  {1000*t_AC/bsize/(AC.size/1000000):.0f} ms per 1M nodes (one example)')
   
    # check classical AC and numpy
    AC_size  = AC.size
    AC_brank = AC.binary_rank
    ops_graph = AC.ops_graph
    del AC # no longer needed

    u.echo(f,'\nScalar AC:',end='')
    s_time = time.perf_counter()
    SAC = verify.AC.ScalarAC(ops_graph)
    t = time.perf_counter()-s_time
    u.echo(f,f' {t:.1f} sec')
    u.echo(f,f'  size {SAC.size:,}')
    u.echo(f,f'  {SAC.size/AC_size:.2f} scalar ac/tensor ac')

    def v(eval_func,type):
        u.echo(f,f'({type}) eval:',end='',flush=True)
        t_SAC, b_SAC = eval_func(evidence,tac_posteriors)
        u.echo(f,f' {t_SAC:.2f} sec'
                 f'\n  {1000*t_SAC/bsize:.0f} ms per example, used batch size {b_SAC}'
                 f'\n  {t_SAC/t_AC:.2f} {type}/ac ')
        return t_SAC, b_SAC

    t_numpy, b_numpy = v(SAC.verify_numpy,'numpy batch')
#    t_tf, b_tf    = v(SAC.verify_tf,'tf batch')
    t_tf, b_tf    = 0, 0
#    t_array, b_array = v(SAC.verify_array,'array')
    t_array, b_array = 0, 0
    
    return (AC_size, AC_brank, SAC.size, t_AC, t_numpy, t_tf, t_array, b_AC, b_numpy, b_tf, b_array)
Example #27
def test_eval_time(ssize,bsize,min_ac,max_ac,vc,sc,pc): 
    saved = u.verbose
    u.verbose = False
       
    fcount = vc // 2 # number of vars with functional cpt
    back   = vc - 1          
    
    fname = (f'RBN_S{ssize}_B{bsize}_'
             f'C{min_ac}_{max_ac}_BN_'
             f'{vc}_{sc}_{pc}_{fcount}_{back}')
    fname = paths.exp / u.time_stamp(fname,'txt')
    f     = open(fname,'w+')
    u.echo(f,f'\n===Evaluation time for random bayesian networks===\n')
    u.echo(f,f'{vc} vars, {sc} values, {pc} parents, '
             f'{fcount} functional vars (no roots), {back} back'
             f'\n{ssize} circuits, '
             f'size {min_ac}-{max_ac}M'
             f'\n{bsize} examples')
    u.echo(f,'output logged into logs/exp/')
    start_time = time.perf_counter()
    
    # stats
    s_AC, r_AC, s_SAC            = [], [], []
    t_AC, t_numpy, t_tf, t_array = [], [], [], []
    b_AC, b_numpy, b_tf, b_array = [], [], [], []
    def process(result):
        s, r, s2, tac, tnumpy, ttf, tarray, bac, bnumpy, btf, barray = result
        
        s_AC.append(s)
        r_AC.append(r)
        s_SAC.append(s2)
        
        t_AC.append(tac)
        t_numpy.append(tnumpy)
        t_tf.append(ttf)
        t_array.append(tarray)
        
        b_AC.append(bac)
        b_numpy.append(bnumpy)
        b_tf.append(btf)
        b_array.append(barray)
    
    success = 0
    while success < ssize:
        bn, inputs, outputs = rbn.get(vc,sc,pc,fcount,back,testing=False)
        i = np.random.choice(inputs)
        o = np.random.choice(outputs)

        result = __posterior_time(f,bn,inputs,o,bsize,min_ac,max_ac,success) # causal
        if result:
            success += 1
            process(result)
        if success < ssize: # guard so the second circuit doesn't overshoot ssize
            result = __posterior_time(f,bn,outputs,i,bsize,min_ac,max_ac,success) # evidential
            if result:
                success += 1
                process(result)
    assert len(s_AC) == ssize
            
    # summary stats
    # eval time for ac per one million nodes and one example
    ac_per_mill = [1000*t/bsize//(s/1000000) for t,s in zip(t_AC,s_AC)]
    # size of largest tensor in ac (2** max binary rank)
    ac_max_rank = r_AC
    # comparing tensor and scalar ac size
    sac_ac      = [s1/s2 for s1,s2 in zip(s_SAC,s_AC)]
    # comparing ac eval time with others
    numpy_ac    = [t1/t2 for t1,t2 in zip(t_numpy,t_AC)]
    tf_ac       = [t1/t2 for t1,t2 in zip(t_tf,t_AC)]
    array_ac    = [t1/t2 for t1,t2 in zip(t_array,t_AC)]
    
    u.echo(f,f'\n==\nsummary stats ({ssize} circuits, {bsize} examples, '
             f'size {min_ac}-{max_ac}M)')
    u.echo(f,f'  ac  size: mean {int(s.mean(s_AC)):,}, stdev {int(s.stdev(s_AC)):,}, '
             f'min {min(s_AC):,}, max {max(s_AC):,}')
    u.echo(f,f'  ac brank: mean {s.mean(ac_max_rank):.1f}, stdev {s.stdev(ac_max_rank):.1f}')
    u.echo(f,f'  sac/ac size: mean {s.mean(sac_ac):.2f}, stdev {s.stdev(sac_ac):.2f}')
    
    # used batch size may be different from evidence size due to memory limitations
    u.echo(f,f'\nused batch size')
    u.echo(f,f'  ac   : mean {s.mean(b_AC):.1f}, stddev {s.stdev(b_AC):.1f}')
    u.echo(f,f'  numpy: mean {s.mean(b_numpy):.1f}, stddev {s.stdev(b_numpy):.1f}')
    u.echo(f,f'  tf   : mean {s.mean(b_tf):.1f}, stddev {s.stdev(b_tf):.1f}')
    u.echo(f,f'  array: mean {s.mean(b_array):.1f}, stddev {s.stdev(b_array):.1f}')
    
    u.echo(f,f'\neval time')
    u.echo(f,f'  ac / 1M : mean {s.mean(ac_per_mill):,}, stdev {s.stdev(ac_per_mill):.1f}')
    u.echo(f,f'  numpy/ac: mean {s.mean(numpy_ac):.1f}, stdev {s.stdev(numpy_ac):.1f}')
    u.echo(f,f'  tf/ac   : mean {s.mean(tf_ac):.1f}, stdev {s.stdev(tf_ac):.1f}')
    u.echo(f,f'  array/ac: mean {s.mean(array_ac):.1f}, stdev {s.stdev(array_ac):.1f}')
        
    all_time = time.perf_counter() - start_time
    u.echo(f,f'\n===Total Time: {all_time:.3f} sec (includes skipped circuits)') 
    f.close()
    u.verbose = saved
Example #28
def eval(f,size,output,testing):
    circuit_type = 'TAC' if testing else 'AC'
    # get data (ground truth)
    evidence, marginals = rdata.get(size,output)
    ecount = len(marginals) # number of examples
        
    u.echo(f,f'\n==rectangle {size}x{size} images: {ecount} total')
    
    # get model
    bn, inputs = rmodel.get(size,output,testing=testing,use_bk=True,tie_parameters=False)
    
    # compile model
    s_time = time.time()
    u.echo(f,f'\ncompiling {circuit_type}:',end='')
    AC = tac.TAC(bn,inputs,output,trainable=False,profile=False)
    t = time.time()-s_time
    u.echo(f,f' {t:.1f} sec')
    u.echo(f,f'  {circuit_type} size {AC.size:,}\n  (sep) binary rank {AC.binary_rank:.1f}, rank {AC.rank}')
    
    # evaluate AC on evidence to get predictions
    u.echo(f,f'evaluating {circuit_type}:\n',end='',flush=True)
    predictions, t1, batch_size = AC.evaluate(evidence,report_time=True)
    u.echo(f,f'  batch size {batch_size}')
    u.echo(f,f'  {t1:.2f} sec, {1000*t1/ecount:.1f} ms per example')
Example #29
def train_all(size,output,tries,data_sizes,testing,use_bk,tie_parameters,batch_size):
    start_time = time.time()
    
    fname = paths.exp / u.time_stamp(f'train_rect_{size}_{output}_{tries}_{testing}_{use_bk}_{tie_parameters}','txt')
    f     = open(fname,'w+')
    
    u.echo(f,f'\nrectangle {size} x {size}, output {output}, data_sizes {data_sizes}, testing {testing}, use_bk {use_bk}, tie {tie_parameters}\n')
    u.echo(f,f'fixed batch size {batch_size}')
    u.echo(f,'output logged into logs/exp/')
        
    def get_data(data_size):
        # full data
        t_evidence, t_labels = rdata.get(size,output,noisy_image_count=size,noise_count=size)
        v_evidence, v_labels = rdata.get(size,output,noisy_image_count=2*size,noise_count=2*size)
        # random subset
        t_percentage = data_size / len(t_labels)
        v_percentage = max(1000,data_size)/len(v_labels) # no less than 1000
        t_evidence, t_labels = data.random_subset(t_evidence,t_labels,t_percentage)
        v_evidence, v_labels = data.random_subset(v_evidence,v_labels,v_percentage)
        return t_evidence, t_labels, v_evidence, v_labels
    
    # get model
    net, inputs = rmodel.get(size,output,testing,use_bk,tie_parameters)
    # compile model into circuit
    circuit = tac.TAC(net,inputs,output,trainable=True,profile=False)
    u.echo(f,f'circuit size {circuit.size:,}, parameter count {circuit.parameter_count}\n')
    
    for data_size, count in zip(data_sizes,tries):
        u.echo(f,f'==data size {data_size}')
        t_evidence, t_labels, v_evidence, v_labels = get_data(data_size)
        u.echo(f,f'  train {len(t_labels)}, test {len(v_labels)}')
        u.echo(f,f'  accuracy ({count}):',end='',flush=True)
        sample = []
        for i in range(count):
            circuit.fit(t_evidence,t_labels,loss_type='CE',metric_type='CA',batch_size=batch_size)
            acc = 100*circuit.metric(v_evidence,v_labels,metric_type='CA')
            sample.append(acc)
            u.echo(f,f' {acc:.2f}',end='',flush=True)
        u.echo(f,f'\naccuracy mean {s.mean(sample):.2f}, std {s.stdev(sample):.2f}\n')
    
    all_time = time.time() - start_time
    u.echo(f,f'Total Time: {all_time:.3f} sec') 
    f.close()