def __init__(self, proxy, loop, localsession, id=0):
    self.proxy = proxy
    self.loop = loop
    self.session = CloudflareScraper(headers=genHeaders(), timeout=timeoutConfig, loop=loop)
    self.localsession = localsession
    self.id = id  # marks which vote attempt this is
    #if proxy:
    self.fingerprint = md5((proxy + 'ChNeWi').encode()).hexdigest()
Example #2
    async def tiktok_video_no_watermark(self, url: str):
        async with CloudflareScraper(
                headers={'user-agent': self._user_agent}) as session:
            page = await session.get('https://ssstiktok.io/ru')
            if page.status != 200:
                return {'url': None}
            soup = BeautifulSoup(
                await page.text(),
                features="html.parser")  # Иницилизация обработки HTML тегов
            form = soup.find(class_='pure-form pure-g hide-after-request')
            endpoint = form['data-hx-post']
            vals = form['include-vals']
            m = self._re_donor_no_wm.search(vals)
            if not m:
                return {'url': None}
            tt, ts = m.group('tt', 'ts')

            response = await session.post(f'https://ssstiktok.io{endpoint}',
                                          data={
                                              'id': url,
                                              'locale': 'ru',
                                              'tt': tt,
                                              'ts': ts
                                          })
            soup = BeautifulSoup(await response.text(), features="html.parser")

            for link in soup.find_all('a'):
                return {
                    'url': base64_decode(link['href'].split('/dl?url=').pop())
                }
        return {'url': None}
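A minimal driver for this coroutine might look like the following sketch; TikTokClient is a hypothetical name for the class that owns tiktok_video_no_watermark, _user_agent, and _re_donor_no_wm:

import asyncio

async def main():
    client = TikTokClient()  # hypothetical wrapper class from this example
    result = await client.tiktok_video_no_watermark(
        'https://www.tiktok.com/@user/video/1234567890')
    print(result['url'])  # direct download URL, or None on failure

asyncio.run(main())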
Example #3
async def test(id):

    print('Instantiating ' + id + ' exchange')

    session = CloudflareScraper(loop=asyncio.get_event_loop())

    # instantiate the exchange by id
    exchange = getattr(ccxt, id)({
        'timeout': 20000,
        'session': session,
    })

    markets = None

    try:

        # load all markets from the exchange
        markets = await exchange.load_markets()

    except ccxt.BaseError as e:

        print(type(e).__name__, str(e))
        print('Failed.')

    await exchange.close()
    await session.close()

    return markets
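A plausible way to run this helper over a few exchanges, matching the loop-based style of the example (the exchange ids are illustrative; any id listed in ccxt.exchanges works):

import asyncio

async def main():
    for exchange_id in ('binance', 'kraken'):  # illustrative ids
        markets = await test(exchange_id)
        if markets is not None:
            print(exchange_id, 'has', len(markets), 'markets')

asyncio.get_event_loop().run_until_complete(main())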
Example #4
 async def post(self, url, payload=None, headers: dict = None, loop=None):
     headers = dict(headers or {})  # copy to avoid mutating a shared default argument
     headers["XSRF-TOKEN"] = self.csrf_token
     async with CloudflareScraper(loop=loop) as session:
         async with session.post(url=url, data=payload,
                                 headers=headers) as resp:
             data = await resp.text()
     return data
Example #5
    async def tiktok_by_url(self, url: str):
        m_pc = self._re_tiktok_pc_url.search(url)
        m_mobile = self._re_tiktok_mobile_url.search(url)
        m_mobile_2 = self._re_tiktok_mobile_2_url.search(url)

        if m_pc:
            return await self.tiktok_by_id(int(m_pc.group('tiktok_id')))
        elif m_mobile:
            return await self.tiktok_by_id(int(m_mobile.group('tiktok_id')))
        elif m_mobile_2:
            async with CloudflareScraper(
                    loop=self._loop,
                    headers={
                        'authority': 'm.tiktok.com',
                        'accept': 'application/json, text/plain, */*',
                        'accept-encoding': 'gzip, deflate',
                        'accept-language': 'en-US,en;q=0.9',
                        'referrer': 'https://www.tiktok.com/',
                        'sec-fetch-dest': 'empty',
                        'sec-fetch-mode': 'cors',
                        'sec-fetch-site': 'same-site',
                        'user-agent': self._user_agent
                    }) as session:
                async with session.get(url) as response:
                    m_pc = self._re_tiktok_pc_url.search(str(response.url))
                    if m_pc:
                        return await self.tiktok_by_id(
                            int(m_pc.group('tiktok_id')))
            return False
        else:
            return False
Example #6
File: cfcrawl.py  Project: ra2003/Scrapio
 async def _create_client_session(self):
     async with self._creation_semaphore:
         try:
             from aiocfscrape import CloudflareScraper
         except ImportError:
             raise ImportError('Cfcrawler requires aiocfscrape')
         self._client = CloudflareScraper()
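The try/except around the import is the usual optional-dependency pattern: the package is only required when this code path actually runs. The same idea in isolation, as a small sketch:

def require_aiocfscrape():
    # Import aiocfscrape lazily so it stays an optional dependency;
    # raise a helpful error only when the feature is actually used.
    try:
        from aiocfscrape import CloudflareScraper
    except ImportError as exc:
        raise ImportError('Cfcrawler requires aiocfscrape '
                          '(pip install aiocfscrape)') from exc
    return CloudflareScraper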
Example #7
async def get_election_offices():
    """Starting point of the scraper program. Scrapes BASE_URL for election office
    information and both dumps results to a .json file and returns the results as json.

    @return: list of scraped results as json.
    """
    # Get list of county names from registrar to populate form
    # Define coroutine functions (context managers)
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # ClientResponse.read() is a coroutine function so it must be awaited
            text = await s.read()
        soup = bS(text, "html5lib")

        info_list = soup.findAll("area")
        counties = [info['alt'] for info in info_list]
        county_urls = [info['href'] for info in info_list]

        # Use list of counties and IDs to get county info for each county
        tasks: List[Task] = []
        num_scraped = 0
        master_list = []

        for i in range(len(counties)):
            # Create task for a future asynchronous operation and store it in task list
            tasks.append(
                asyncio.create_task(
                    scrape_one_county(session, counties[i], county_urls[i])))

        # Run the coroutines and iterate over the yielded results as they complete
        # (out-of-order). Use asyncio.gather() with a couple code modifications to
        # preserve list order
        future: Future[Tuple[str, str, str, str, str]]
        for future in asyncio.as_completed(tasks):
            # Unpack awaited result of scrape_one_county()
            (
                address,
                county_website,
                phone_number,
                email_address,
                county_name,
            ) = await future
            schema = format_data_into_schema(
                address,
                county_website,
                phone_number,
                email_address,
                county_name,
            )
            master_list.append(schema)
            num_scraped += 1
            print(f"[New York] Scraped {county_name} county: "
                  f"#{num_scraped} of {len(counties)} .... "
                  f"[{round((num_scraped / len(counties)) * 100, 2)}%]")
    master_list = sorted(master_list, key=lambda county: county['countyName'])

    with open(os.path.join(ROOT_DIR, "scrapers", "new_york", "new_york.json"),
              "w") as f:
        json.dump(master_list, f)
    return master_list
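The comment above mentions that asyncio.gather() could preserve list order; a hypothetical sketch of that swap, replacing the as_completed loop inside get_election_offices:

# gather() returns results in task-creation order, so the final sorted()
# call becomes unnecessary (at the cost of the per-county progress log).
results = await asyncio.gather(*tasks)
master_list = [format_data_into_schema(*fields) for fields in results]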
Example #8
 async def get(self, url, loop=None):
     async with CloudflareScraper(loop=loop) as session:
         async with session.get(url) as resp:
             data = await resp.text()
             # read the cookie while the response is still open
             csrf_token = resp.cookies["XSRF-TOKEN"].value
     if not self.csrf_token or self.csrf_token != csrf_token:
         self.csrf_token = csrf_token
     return data
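Examples #4 and #8 are two halves of one flow: GET a page to capture the XSRF-TOKEN cookie, then POST with that token in the headers. A hedged end-to-end sketch, where ApiClient is a placeholder for the class that owns both methods and the URLs are illustrative:

import asyncio

async def main():
    client = ApiClient()  # hypothetical owner of get()/post()/csrf_token
    await client.get('https://example.com/login')         # stores the CSRF token
    data = await client.post('https://example.com/api',   # token sent as a header
                             payload={'key': 'value'})
    print(data)

asyncio.run(main())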
Example #9
    async def request(self,
                      url: str,
                      kwargs: dict = {},
                      return_bytes=False,
                      payload=None) -> dict:
        async with CloudflareScraper(
                loop=self._loop,
                headers={
                    'authority': 'm.tiktok.com',
                    'accept': 'application/json, text/plain, */*',
                    'accept-encoding': 'gzip, deflate',
                    'accept-language': 'en-US,en;q=0.9',
                    'referrer': 'https://m.tiktok.com/',
                    'sec-fetch-dest': 'empty',
                    'sec-fetch-mode': 'cors',
                    'sec-fetch-site': 'same-site',
                    'user-agent': self._user_agent,
                    'cookie': ';'.join([
                        f'{key}={value}'
                        for key, value in self.browser.cookies.items()
                    ])
                }) as session:
            url = await self._browser.signature(url, kwargs)
            if payload is not None:
                async with session.post(url, json=payload) as response:
                    return await response.text()

            async with session.get(url) as response:
                if return_bytes:
                    return response.content
                try:
                    _json = await response.json(content_type=None)
                    code = _json.get('code', -1)
                    if code != '10000':
                        return _json
                    return await self.captcha(_json, url, kwargs, return_bytes)

                except Exception as e:
                    logging.error(e, exc_info=True)
                    print(
                        f'Failed on {url}; Converting to json error; Text: {await response.text()}'
                    )
                    raise Exception('Invalid Response!!!')
Example #10
 async def coin_name(self, symbol: str) -> str:
     try:
         async with CloudflareScraper() as session:
             async with session.get(
                     'https://liqui.io/Market/Currencies/') as resp:
                 currencies = await resp.json()
     except Exception as e:
         raise LiquiPairNamesException(e)
     coin_name = next(
         (i['Name'] for i in currencies if i['Symbol'] == symbol), None)
     if not coin_name:
         raise LiquiPairNamesException(f'cannot find coin {symbol!r}')
     return coin_name
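Usage might look like this; LiquiApi is a placeholder for the class exposing coin_name(), and the symbol casing follows whatever the Liqui API returned:

import asyncio

async def main():
    api = LiquiApi()  # hypothetical wrapper class from this example
    print(await api.coin_name('BTC'))  # e.g. 'Bitcoin'

asyncio.run(main())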
Example #11
async def get_election_offices():
    """Starting point of the scraper program. Scrapes BASE_URL for election office
    information and both dumps results to a .json file and returns the results as json.

    @return: list of scraped results as json.
    """
    # Define coroutine functions (context managers)
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # ClientResponse.read() is a coroutine function so it must be awaited
            text = await s.read()
        soup = bS(text.decode("utf-8"), "html.parser")

        test_county_data = get_county_codes_and_names(soup)
        county_data = sorted(test_county_data, key=lambda k: k["countyName"])
        num_scraped = 0
        master_list = []

        # Create list that will store asyncio tasks
        tasks: List[Task] = []
        for county in county_data:
            code = county["countyCode"]
            name = county["countyName"]
            # Create task for a future asynchronous operation and store it in task list
            tasks.append(asyncio.create_task(scrape_one_county(session, code, name)))

        # Run the coroutines and iterate over the yielded results as they complete
        # (out-of-order). Use asyncio.gather() with a couple code modifications to
        # preserve list order
        future: Future[Tuple[str, str, str, str]]
        for future in asyncio.as_completed(tasks):
            # Unpack awaited result of scrape_one_county()
            cleaned_string, protected_email, _, county_name = await future
            schema = format_data_into_schema(
                cleaned_string, protected_email, county_name
            )
            master_list.append(schema)
            num_scraped += 1
            print(
                f"[Florida] Scraped {county_name} county: "
                f"#{num_scraped} of {len(county_data)} .... "
                f"[{round((num_scraped / len(county_data)) * 100, 2)}%]"
            )

    with open(os.path.join(ROOT_DIR, "scrapers", "florida", "florida.json"), "w") as f:
        json.dump(master_list, f)
    return master_list
Example #12
async def cs_page(url):
    async with CloudflareScraper() as session:
        async with session.get(url) as resp:
            return await resp.text()
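Because cs_page() is a module-level coroutine, it can be driven directly:

import asyncio

html = asyncio.run(cs_page('https://example.com'))  # illustrative URL
print(html[:200])  # first 200 characters of the fetched page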
Example #13
async def read_logs():
    values = ('user', 'password', 'serverid', 'loc', 'folder', 'admin_file',
              'admin_line', 'chat_file', 'chat_line', 'kill_file', 'kill_line',
              'login_file', 'login_line', 'violations_file', 'violations_line')
    print(
        'scumlogs v1.0, scum server logs downloader from gportal\nby https://GAMEBotLand.com'
    )
    try:
        load_configini()
    except Exception:
        global configini
        configini = {}
    for value in values:
        if value not in configini:
            configini[value] = ''
    if configini['folder'] != '':
        if configini['folder'][-1:] != '/' and configini['folder'][-1:] != '\\':
            configini['folder'] = configini['folder'] + '/'
    save_configini()

    if configini['loc'] == 'com':
        loc = 'com'
    else:
        loc = 'us'
    URL_LOGIN = '******'.format(
        configini['loc'])
    URL_LOGS = 'https://www.g-portal.{}/en/scum/logs/{}'.format(
        configini['loc'], configini['serverid'])

    async with CloudflareScraper() as session:
        try:
            log('connecting g-portal...')
            payload = {
                '_method': 'POST',
                'login': configini['user'],
                'password': configini['password'],
                'rememberme': '1'
            }
            async with session.post(URL_LOGIN, data=payload) as raw_response:
                response = await raw_response.text()
            async with session.get(URL_LOGS) as raw_response:
                response = await raw_response.text()
            html = BeautifulSoup(response, 'html.parser')
            select = html.find('div', {'class': 'wrapper logs'})
            loglist = select['data-logs']
            logs = json.loads(loglist)

            for i in range(len(logs)):
                getid = logs["file_" + str(i + 1)]
                id = (getid[int(getid.find('Logs')) + 5:])
                type = id.split('_')[0]

                if configini[type + '_file'] != '':
                    if id < configini[type + '_file']:
                        continue
                payload = {
                    '_method': 'POST',
                    'load': 'true',
                    'ExtConfig[config]': getid
                }
                async with session.post(URL_LOGS,
                                        data=payload) as raw_response:
                    response = await raw_response.text()
                content = json.loads(response)
                lines = content["ExtConfig"]["content"].splitlines()
                filename = configini['folder'] + id
                file = open(filename, "a+", encoding='utf-8')
                found = False
                writing = False
                for line in lines:
                    if id == configini[type + '_file'] and not found:
                        if line == configini[type + '_line']:
                            found = True
                            continue
                    else:
                        file.write(line + '\n')
                        writing = True
                if writing:
                    if found:
                        log('updating {}'.format(id))
                    else:
                        log('creating {}'.format(id))
                file.close()
                configini[type + '_file'] = id
                configini[type + '_line'] = lines[-1]

            save_configini()
        except Exception:
            log('error connecting, check connectivity and scumlogs.ini')
            help()
        await session.close()
Example #14
 async def __get_js(self):
     async with CloudflareScraper(loop=self._loop, headers={}) as session:
         async with session.get(
                 'https://sf-tb-sg.ibytedtos.com/obj/rc-web-sdk-sg/acrawler.js'
         ) as response:
             return await response.text()
Example #15
async def url_2_image(url: str):
    async with CloudflareScraper() as session:
        async with session.get(url) as response:
            return await response.read()
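A small driver that writes the downloaded bytes to disk (the URL and file name are illustrative):

import asyncio

async def main():
    data = await url_2_image('https://example.com/image.png')
    with open('image.png', 'wb') as f:  # arbitrary local file name
        f.write(data)

asyncio.run(main())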
Example #16
                future.add_done_callback(functools.partial(printer))
                task = self.PostFingerprint()
                future = asyncio.ensure_future(task, loop=self.loop)
                future.add_done_callback(functools.partial(printer))
                return ('%d %s %s' % (self.id, self.proxy, result))
            if ('refresh' in result):  # session expired, among other causes
                print('%d %s %s %s' %
                      (self.id, self.proxy, result, 'retrying the whole voting flow'))
                await self.Vote()  # try again!
            #if('An entry' in result):  # this IP has already voted
            return ('%d %s %s' % (self.id, self.proxy, result))  # end of voting
        except RetryExhausted:
            return ('%d %s %s' % (self.id, self.proxy, 'consecutive retry limit exceeded'))
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return ('%d %s %s' % (self.id, self.proxy, 'proxy probably dead, giving up'))

    def Launch(self):
        vote = self.Vote()
        vote_future = asyncio.ensure_future(vote, loop=self.loop)
        vote_future.add_done_callback(functools.partial(printer))


##        res=await vote_future
##        if(res==300):
##            vote_future.add_done_callback(functools.partial(self.Launch))

if __name__ == '__main__':
    voter = Voter('192.168.1.1:9999', asyncio.get_event_loop(),
                  CloudflareScraper())
    print(voter)
Example #17
                future = asyncio.ensure_future(task, loop=self.loop)
                future.add_done_callback(functools.partial(printer))
                task = self.PostFingerprint()
                future = asyncio.ensure_future(task, loop=self.loop)
                future.add_done_callback(functools.partial(doNothing))
                return ('%d %s %s' % (self.id, self.proxy, result))
            if ('refresh' in result):  # session expired, among other causes
                print('%d %s %s %s' % (self.id, self.proxy, result, 'retrying the whole voting flow'))
                if random.random() < 0.6:
                    await self.Vote()  # try again!
                else:
                    return ('%d %s %s %s' % (self.id, self.proxy, result, 'giving up'))  # end of voting
            #if('An entry' in result):  # this IP has already voted
            return ('%d %s %s' % (self.id, self.proxy, result))  # end of voting
        except RetryExhausted:
            return ('%d %s %s' % (self.id, self.proxy, 'consecutive retry limit exceeded'))
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return ('%d %s %s' % (self.id, self.proxy, 'proxy probably dead, giving up'))

    def Launch(self):
        vote = self.Vote()
        vote_future = asyncio.ensure_future(vote, loop=self.loop)
        vote_future.add_done_callback(functools.partial(printer))
##        res = await vote_future
##        if (res == 300):
##            vote_future.add_done_callback(functools.partial(self.Launch))

if __name__ == '__main__':
    voter = Voter('192.168.1.1:9999', asyncio.get_event_loop(), CloudflareScraper())
    print(voter)
Example #18
class Voter:
    #session=CloudflareScraper(headers=headers,timeout=timeoutConfig)
    # localsession = a CloudflareScraper session supplied from outside that uses no proxy
    # proxy = a proxy IP supplied from outside
    #global_retry=10
    # If the time since entering ISML exceeds global_timeout
    # and the cumulative number of connection failures exceeds global_retry,
    # give up on this vote.
    # This strategy does not seem useful; it is unused for now.

    def __init__(self, proxy, loop, localsession, id=0):
        self.proxy = proxy
        self.loop = loop
        self.session = CloudflareScraper(headers=genHeaders(), timeout=timeoutConfig, loop=loop)
        self.localsession = localsession
        self.id = id  # marks which vote attempt this is
        #if proxy:
        self.fingerprint = md5((proxy + 'ChNeWi').encode()).hexdigest()
        #else:  # for testing only!
        #    self.fingerprint = md5(('Hecate2' + str(time.time())).encode()).hexdigest()
        #    self.voting_token = sha256(('Hecate2' + str(time.time())).encode()).hexdigest()
        #self.localsession=localsession

    # NOTE: no `self` parameter: retry() is invoked as a decorator factory
    # inside the class body, where a `self` parameter would silently swallow
    # the first exception class passed to it.
    def retry(*exceptions, retries=5, cooldown=0):  #, verbose=True):
        """Decorate an async function to execute it a few times before giving up.
        Hopes that the problem is resolved by the other side shortly.

        Args:
            exceptions (Tuple[Exception]): The exceptions expected during function execution.
            retries (int): Number of retries of function execution.
            cooldown (int): Seconds to wait before retry.
            verbose (bool): Specifies if we should log unsuccessful attempts.
        """

        def wrap(func):
            @wraps(func)
            async def inner(*args, **kwargs):
                # the nested function closes over the enclosing scope
                # (exceptions, retries, cooldown)
                retries_count = 0

                while True:
                    try:
                        result = await func(*args, **kwargs)
                    except exceptions as err:   # exceptions comes from retry()
                        #self.global_retry -= 1
                        retries_count += 1

                        if retries_count >= retries:
                            message = '%d consecutive errors, giving up' % (retries)
                            #print(self.id,self.proxy,message)
                            print(message)
                            #verbose and log.exception(message)
                            #verbose and print(message)
                            #raise RetryExhaustedError(
                            #    func.__qualname__, args, kwargs) from err
                            #raise RetryExhaustedError
                            #return err
                            #return 'have not decided what to return yet'
                            raise RetryExhausted
                        else:
                            #message = "Exception:{} during\n{} execution. " \
                            #          "{} of {} retries attempted"\
                            #          .format(err, func, retries_count, retries)
                            message = 'Error: {}. Retrying {}/{}'\
                                      .format(err, retries_count, retries)
                            #print(self.id,self.proxy,message)
                            print(message)
                            #verbose and log.warning(message)
                            #verbose and print(message)
                            await asyncio.sleep(cooldown)
                    else:
                        return result
            return inner
        return wrap

    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _get(self, url, timeout=timeoutConfig):
        # auto-detect whether the response is text or an image; return
        # utf-8 text, or bytes for an image
        async with self.session.get(url, proxy=self.proxy, timeout=timeout) as response:
            #return await response.text()
            body = await response.read()
            #print(response.content_type)    #'text/html' 'image/png'
            #print(body)
            if (response.status < 400):
                if 'text' in response.content_type:
                    #text=body.decode(encoding='utf-8')
                    text = await response.text(errors='ignore')
                    #f=open('./tmp.txt','a',encoding='utf-8')
                    #f.write(text)
                    #f.close()
                    return text
                #if 'image' in response.content_type:
                else:
                    #fb=open('./tmp.png','wb')
                    #fb.write(body)
                    #fb.close()
                    return body
            #if (response.status==503):
                #pass
                # handle the Cloudflare firewall
            else:
                response.raise_for_status()
                print('GET failed too many times in a row!')

    # GET directly, without a proxy.
    # Don't use this against ISML for now: if the firewall is off at first and
    # then suddenly comes up, things may break, because the same local session
    # issues hundreds or thousands of GETs and can hit different firewall states.
    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _localget(self, url, timeout=captchaTimeoutConfig):
        # GET using the local machine's IP!
        # auto-detect whether the response is text or an image; return
        # utf-8 text, or bytes for an image
        async with self.localsession.get(url, timeout=timeout) as response:
            #return await response.text()
            body = await response.read()
            #print(response.content_type)    #'text/html' 'image/png'
            #print(body)
            if (response.status < 400):
                if 'text' in response.content_type:
                    text = body.decode(encoding='utf-8')
                    #f=open('./tmp.txt','a',encoding='utf-8')
                    #f.write(text)
                    #f.close()
                    return text
                #if 'image' in response.content_type:
                else:
                    #fb=open('./tmp.png','wb')
                    #fb.write(body)
                    #fb.close()
                    return body
            #if (response.status==503):
                #pass
                # handle the Cloudflare firewall
            else:
                response.raise_for_status()
                print('local GET failed too many times in a row!')

    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _post(self, url, data, timeout=timeoutConfig):
        async with self.session.post(url, data=data, proxy=self.proxy, timeout=timeout) as response:
            text = await response.text()
            if (response.status < 400):
                return text
            else:
                response.raise_for_status()
                print('POST failed too many times in a row!')

    # POST from the local machine to the captcha server, without a proxy.
    # For captcha POSTs only! Only text may be returned! Binary content will fail!
    @retry(aiohttp.ClientError, asyncio.TimeoutError)
    async def _localpost(self, url, data, timeout=timeoutConfig):
        async with self.localsession.post(url, data=data, timeout=timeout) as response:
            text = await response.text()
            if (response.status < 400 and text != '!'):
                return text
            else:
                response.raise_for_status()
                print('local POST failed too many times in a row! The captcha server may have a serious problem!')

    async def EnterISML(self):
        text = await self._get('http://www.internationalsaimoe.com/voting')
        #text=await self._get('https://www.internationalsaimoe.com/voting?lang=zh-hans')
        voting_token = re.search(repattern, text)
        if voting_token:
            self.html = text
            self.voting_token = voting_token.group(1)
            self.startTime = time.time()
            print(self.id, self.proxy, 'entered ISML successfully')
        else:
            print(self.id, self.proxy, 'voting_token not found')
            raise NoVotingToken

    # posting the fingerprint and solving the captcha can run concurrently
    async def PostFingerprint(self):  # make sure self.fingerprint was generated in __init__!
        await self._post("https://www.internationalsaimoe.com/security", data={"secure": self.fingerprint})
        print(self.id, self.proxy, 'fingerprint posted successfully')
        return ('%d %s fingerprint posted successfully' % (self.id, self.proxy))

    async def AIDeCaptcha(self):
        # Solve the captcha: download it (possibly several times), preprocess it,
        # then hand it to the server for final recognition.
        tries = 0
        while 1:  # while tries < retry limit:
        # currently captcha retries have no upper limit!
            tries += 1
            raw_img = await self._get(
                'https://www.internationalsaimoe.com/captcha/%s/%s' % (self.voting_token, int(time.time() * 1000)),
                timeout=captchaTimeoutConfig)
            img = Image.open(BytesIO(raw_img))
            img = 255 - np.array(img.convert('L'))  # convert to grayscale
            if (judge(img)):
                del img
                print(self.id, self.proxy, 'captcha fetch #%d is recognizable' % (tries))
                captcha = await self._localpost(next(csGen), raw_img)
                self.captcha = captcha
                return captcha

    async def DeCaptcha(self):
        # Solve the captcha by handing it straight to a captcha-solving service.
        #captcha=await self._get()
        #self.captcha='captcha result'
        print('The captcha-service version is not finished yet!')
        await asyncio.sleep(0)
        raise
        
    async def Submit(self):  # submit the vote
        postdata = selector(self.html, self.voting_token, self.captcha)
        sleepTime = 120 - (time.time() - self.startTime)  # 120 seconds minus the time already spent
        if (sleepTime > 0):  # the 120-second mark has not been reached yet
            print(self.id, self.proxy, 'waiting %d seconds' % (sleepTime))
            await asyncio.sleep(sleepTime)  # sit and wait until the 120-second mark
        print(self.id, self.proxy, 'starting Submit')
        result = await self._post("https://www.internationalsaimoe.com/voting/submit", data=postdata)
        return result

    async def SaveHTML(self):  # save the vote receipt
        text = await self._get('https://www.internationalsaimoe.com/voting')
        #text=await self._get('https://www.internationalsaimoe.com/voting?lang=zh-hans')
        try:
            f = open('./HTML/%s.html' % (self.captcha), 'w', encoding=('utf-8'))
            f.write(text)
            f.close()
            #print(self.id, self.proxy, 'receipt saved successfully')
            return ('%d %s receipt saved successfully' % (self.id, self.proxy))
        except Exception:
            return ('%d %s saving the receipt failed due to disk trouble; the disk may be overloaded!!!!!' % (self.id, self.proxy))

    #@retry(aiohttp.ClientError,asyncio.TimeoutError,retries=2)
    async def Vote(self):  # run the whole voting flow! Preferably started via Launch()
        try:
            await self.EnterISML()
        except NoVotingToken:
            return ('%d %s %s' % (self.id, self.proxy, 'voting_token not found'))
        except RetryExhausted:
            return ('%d %s %s' % (self.id, self.proxy, 'consecutive retry limit exceeded'))
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return ('%d %s %s' % (self.id, self.proxy, 'proxy probably dead, giving up'))
        try:
##            # post the fingerprint and leave it running; captcha recognition
##            # and fingerprint posting would run concurrently
##            task=self.PostFingerprint()
##            future=asyncio.ensure_future(task,loop=self.loop)
##            # start the captcha recognition task
##            await self.AIDeCaptcha()
##            # wait for the fingerprint task (usually finished long ago)
##            await future
            await self.PostFingerprint()
            await self.AIDeCaptcha()
            # wait until the 120-second mark, then submit
            result = await self.Submit()
            # handle captcha errors (retry)
            if ('Invalid' in result):  # wrong captcha
                result = await self.AIDeCaptcha()
                await self.Submit()
            if ('Invalid' in result):  # wrong captcha
                result = await self.AIDeCaptcha()
                await self.Submit()
            # save the vote receipt
            if ('successful' in result):
                task = self.SaveHTML()
                future = asyncio.ensure_future(task, loop=self.loop)
                future.add_done_callback(functools.partial(printer))
                task = self.PostFingerprint()
                future = asyncio.ensure_future(task, loop=self.loop)
                future.add_done_callback(functools.partial(doNothing))
                return ('%d %s %s' % (self.id, self.proxy, result))
            if ('refresh' in result):  # session expired, among other causes
                print('%d %s %s %s' % (self.id, self.proxy, result, 'retrying the whole voting flow'))
                if random.random() < 0.6:
                    await self.Vote()  # try again!
                else:
                    return ('%d %s %s %s' % (self.id, self.proxy, result, 'giving up'))  # end of voting
            #if('An entry' in result):  # this IP has already voted
            return ('%d %s %s' % (self.id, self.proxy, result))  # end of voting
        except RetryExhausted:
            return ('%d %s %s' % (self.id, self.proxy, 'consecutive retry limit exceeded'))
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return ('%d %s %s' % (self.id, self.proxy, 'proxy probably dead, giving up'))

    def Launch(self):
        vote = self.Vote()
        vote_future = asyncio.ensure_future(vote, loop=self.loop)
        vote_future.add_done_callback(functools.partial(printer))
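The retry decorator above is generic enough to reuse outside the class; a minimal standalone sketch of the same pattern, simplified to re-raise the last error once retries are exhausted instead of raising the custom RetryExhausted:

import asyncio
from functools import wraps

def retry(*exceptions, retries=5, cooldown=0):
    """Retry an async function on the given exceptions, up to `retries` times."""
    def wrap(func):
        @wraps(func)
        async def inner(*args, **kwargs):
            attempts = 0
            while True:
                try:
                    return await func(*args, **kwargs)
                except exceptions:
                    attempts += 1
                    if attempts >= retries:
                        raise  # give up and propagate the last error
                    await asyncio.sleep(cooldown)
        return inner
    return wrap

@retry(ConnectionError, retries=3, cooldown=1)
async def flaky_fetch():
    ...  # any coroutine that may raise ConnectionError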
Example #19
async def get_election_offices():
    """Starting point of the scraper program. Scrapes BASE_URL for election office
    information and both dumps results to a .json file and returns the results as json.

    @return: list of scraped results as json.
    """
    # Get list of county names from registrar to populate form
    # Define coroutine functions (context managers)
    async with CloudflareScraper() as session:
        async with session.get(REGISTRAR_URL) as s:
            # ClientResponse.read() is a coroutine function so it must be awaited
            text = await s.read()
        soup = bS(text, "html5lib")

        county_option_list = soup.findAll(
            attrs={"name": "idTown"})[0].findAll("option")

        id_list = [
            county_option["value"] for county_option in county_option_list
        ]
        county_list = [
            county_option.string for county_option in county_option_list
        ]

        # Use list of counties and IDs to get county info for each county
        tasks: List[Task] = []
        num_scraped = 0
        master_list = []

        for i in range(len(id_list)):
            county_id = id_list[i]
            county_name = county_list[i]

            # Create task for a future asynchronous operation and store it in task list
            tasks.append(
                asyncio.create_task(
                    scrape_one_county(session, county_id, county_name)))

        # Run the coroutines and iterate over the yielded results as they complete
        # (out-of-order). Use asyncio.gather() with a couple code modifications to
        # preserve list order
        future: Future[Tuple[str, str, str, str, str, str]]
        for future in asyncio.as_completed(tasks):
            # Unpack awaited result of scrape_one_county()
            (
                registrar_name,
                phys_address,
                mail_address,
                phone_number,
                email_address,
                county_name,
            ) = await future
            schema = format_data_into_schema(
                registrar_name,
                phys_address,
                mail_address,
                phone_number,
                email_address,
                county_name,
            )
            master_list.append(schema)
            num_scraped += 1
            print(f"[Georgia] Scraped {county_name} county: "
                  f"#{num_scraped} of {len(county_list)} .... "
                  f"[{round((num_scraped / len(county_list)) * 100, 2)}%]")

    with open(os.path.join(ROOT_DIR, "scrapers", "georgia", "georgia.json"),
              "w") as f:
        json.dump(master_list, f)
    return master_list
Example #20
async def read_logs():
    result_chat_lines = []
    result_kill_lines = []
    values = ('user', 'password', 'serverid', 'loc', 'folder', 'admin_file',
              'admin_line', 'chat_file', 'chat_line', 'kill_file', 'kill_line',
              'login_file', 'login_line', 'violations_file', 'violations_line')

    try:
        load_configini()
    except Exception:
        global configini
        configini = {}
    for value in values:
        if value not in configini:
            configini[value] = ''
    if configini['folder'] != '':
        if configini['folder'][-1:] != '/' and configini['folder'][-1:] != '\\':
            configini['folder'] = configini['folder'] + '/'
    save_configini()

    URL_LOGIN = '******'.format(
        configini['loc'])
    URL_LOGS = 'https://www.g-portal.{}/en/scum/logs/{}'.format(
        configini['loc'], configini['serverid'])

    async with CloudflareScraper() as session:
        try:
            log('connecting g-portal...')
            payload = {
                '_method': 'POST',
                'login': configini['user'],
                'password': configini['password'],
                'rememberme': '1'
            }

            async with session.post(URL_LOGIN, data=payload) as raw_response:
                response = await raw_response.text()
            async with session.get(URL_LOGS) as raw_response:
                response = await raw_response.text()

            html = BeautifulSoup(response, 'html.parser')
            select = html.find('div', {'class': 'wrapper logs'})
            loglist = select['data-logs']
            logs = json.loads(loglist)

            for i in range(len(logs)):
                getid = logs["file_" + str(i + 1)]
                id = (getid[int(getid.find('Logs')) + 5:])
                type = id.split('_')[0]

                if type in ['chat', 'kill']:
                    if configini[type + '_file'] != '':
                        if id < configini[type + '_file']:
                            continue

                    payload = {
                        '_method': 'POST',
                        'load': 'true',
                        'ExtConfig[config]': getid
                    }
                    async with session.post(URL_LOGS,
                                            data=payload) as raw_response:
                        response = await raw_response.text()
                    content = json.loads(response)
                    lines = content["ExtConfig"]["content"].splitlines()

                    found = False
                    writing = False
                    for line in lines:
                        # Replace all "%" symbols with ";" because
                        # .ini files can't store the "%" symbol.
                        if "%" in line:
                            ready_line = ""
                            # continue
                            for i in line:
                                if i == "%":
                                    ready_line += ";"
                                else:
                                    ready_line += i
                            index = lines.index(line)
                            lines[index] = ready_line
                        if id == configini[type + '_file'] and not found:
                            if line == configini[type + '_line']:
                                found = True
                                continue
                            # Replace all "%" symbols with ";" here too
                            elif line.find("%") > -1:
                                ready_line = ""
                                for i in line:
                                    if i == "%":
                                        ready_line += ";"
                                    else:
                                        ready_line += i
                                if ready_line == configini[type + "_line"]:
                                    found = True
                                    continue
                        else:
                            if type == "chat":
                                result_chat_lines.append(line)
                            else:
                                result_kill_lines.append(line)
                            writing = True
                    if writing:
                        if found:
                            log('updating {}'.format(id))
                        else:
                            log('creating {}'.format(id))
                    # file.close()
                    configini[type + '_file'] = id
                    configini[type + '_line'] = lines[-1]
            save_configini()
            if not result_chat_lines and not result_kill_lines:
                return [], False
            return [result_chat_lines, result_kill_lines], True

        except Exception:
            print(traceback.format_exc())
            return [], False
        await session.close()
Example #21
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": uaGen.random,
}

##def printer(future):
##    print(future.result())

request_timeout = 50  # default timeout for a single HTTP request;
# you can temporarily override this setting at any time
captcha_timeout = 60  # default timeout for fetching a single captcha
timeoutConfig = aiohttp.ClientTimeout(total=request_timeout)
captchaTimeoutConfig = aiohttp.ClientTimeout(total=captcha_timeout)

localsession = CloudflareScraper(headers=headers,
                                 loop=worker_loop,
                                 timeout=timeoutConfig)


#async def localsession_get(url='https://coinone.co.kr/'):  # can be used to test the firewall
async def localsession_get(url="https://www.internationalsaimoe.com"):
    async with localsession.get(url) as res:
        text = await res.text()
        return ('Ignaleo: local session requested %s, status code %d' % (url, res.status))
        #print('Ignaleo: local session requested %s, status code %d' % (url, res.status))
        #return res.status


##    await asyncio.sleep(80)
##    async with localsession.post(url,data=b'test',ssl=False) as res:
##        text = await res.text()