Example No. 1
def get_articles(profile):
    session = requests_html.HTMLSession()
    medium_link = f'https://medium.com/feed/@{profile}'.replace('@@', '@')
    r = session.get(medium_link)
    for i in r.html.find('channel')[0].find('item'):
        link = i.find('link')[0].html.replace('<link/>', '')
        pub_date = i.find('pubDate')[0].text
        categories = i.find('category')
        yield link, get_date_formatted(pub_date), categories
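A minimal consumption sketch for the generator above, assuming `requests_html` is imported and `get_date_formatted` is defined elsewhere in the module; the profile handle is purely illustrative:

# Hypothetical call: iterate the feed items yielded by get_articles.
for link, published, categories in get_articles("some-profile"):
    print(published, link, [c.text for c in categories])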
Example No. 2
def getPricesOnline():
    for key in stocks:
        url = 'https://in.finance.yahoo.com/quote/' + key
        session = requests_html.HTMLSession()
        r = session.get(url)
        content = BeautifulSoup(r.content, 'lxml')
        price = str(content).split('data-reactid="32"')[4].split(
            '</span>')[0].replace('>', '')
        price = float(price.replace(',', ''))
        stocks[key] = price
Example No. 3
    def __init__(self):
        self.db_init = DBInit()
        self.edb_dao = EDBDao(self.db_init.session)

        self.session = requests_html.HTMLSession()
        self.session.keep_alive = False
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36',
        }
Example No. 4
    def __init__(self):
        """
        Crawls.
        """
        self.http_session = requests_html.HTMLSession()
        logger.debug("Sending GET request to start_url...")
        self.start_response = self.http_session.get(url=config.start_url)
        logger.debug("Received %s response.", self.start_response.status_code)

        self.request_count = 1
Example No. 5
 def __init__(self, docentes_dataset, basename='scholar',
              delay_bounds_secs=(7, 27), short_fraction=4,
              **kwargs):
     super().__init__(basename+'.csv', None, **kwargs)
     self.basename = basename
     self.docentes_dataset = docentes_dataset
     self.delay_bounds_secs = delay_bounds_secs
     self.short_fraction = short_fraction
     self.session = requests_html.HTMLSession()
     self.delay_pending = False
Example No. 6
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    session = requests_html.HTMLSession()
    for member in data["query"]["categorymembers"]:
        word = member["title"]
        date = member["timestamp"]
        if _skip_word(word) or _skip_date(date, config.cut_off_date):
            continue
        request = session.get(_PAGE_TEMPLATE.format(word=word), timeout=10)
        for word, pron in config.extract_word_pron(word, request, config):
            yield word, pron
Example No. 7
def getkvalue(keywords):
    s = requests_html.HTMLSession()
    url = "http://search.anccnet.com/searchResult2.aspx?keyword={}".format(keywords)
    header = {
        "referer": "http://www.anccnet.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    rsp = s.get(url, headers=header)
    k = rsp.html.find('#yanzhengLabel')[0].attrs['data-site-key']
    s.close()
    return k
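A simple invocation sketch; the keyword is illustrative and the lookup only works while the site above is reachable:

# Hypothetical call: prints the scraped data-site-key value.
print(getkvalue("barcode"))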
Example No. 8
    def __init__(self):
        db_init = DBInit()
        self.msf_dao = MSFDao(db_init.session)

        self.session = requests_html.HTMLSession()
        self.session.keep_alive = False
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36',
        }
        logging.basicConfig(level=logging.INFO)
Example No. 9
 def __init__(self):
     """Initialize the class, session, etc."""
     self.logged_in = False
     self.session = requests_html.HTMLSession(mock_browser=False)
     self.session.headers['User-Agent'] = constants.REQ_USER_AGENT
     self.ibank_url = constants.BNI_IBANK_URL
     self.ib_account_name = ''
     self.first_url = ''
     self.last_req = None
     self.session_ttl = constants.SESSION_TTL
     self.session_time = None
Example No. 10
 def start_requests(self):
     urls = []
     cur_url = "http://club.xywy.com/keshi/{}.html"
     sess = requests_html.HTMLSession()
     for i in range(1, 65):
         response = sess.get(cur_url.format(i))
         tmp_urls = response.html.xpath(
             "//ul[@class='club_Date clearfix']/li/a/@href")
         urls += tmp_urls
     for url in urls:
         yield scrapy.Request(url, callback=self.parse)
Example No. 11
 def __init__(self):
     self.session = requests_html.HTMLSession()
     self.token = None
     self.datalist = list()
     self.headers_data = {
         'Host': 'webshare.cz',
         'Referer': Webshare.webshare_url + '/',
         'Origin': Webshare.webshare_url,
         'X-Requested-With': 'XMLHttpRequest',
         'Accept': 'text/xml; charset=UTF-8'
     }
Example No. 12
def get_sse_one_page(page_num: str = '1', page_size: str = '15') -> pd.DataFrame:
    ss = requests_html.HTMLSession()
    ss.headers[
        'Referer'] = 'http://www.sse.com.cn/disclosure/credibility/supervision/inquiries/'
    url = f'http://query.sse.com.cn/commonSoaQuery.do?siteId=28&sqlId=BS_KCB_GGLL&channelId=10743%2C10744%2C10012&extGGDL=&order=createTime|desc%2Cstockcode|asc&isPagination=true&pageHelp.pageSize={page_size}&pageHelp.pageNo={page_num}'
    res = ss.get(url)
    df = pd.DataFrame(res.json()['result'])
    df = df[[
        'stockcode', 'extGSJC', 'cmsOpDate', 'extWTFL', 'docTitle', 'docURL'
    ]]
    df['docURL'] = df['docURL'].apply(lambda x: x.split('/')[-1])
    return df
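A possible call, assuming the SSE query endpoint is reachable; the arguments mirror the string defaults above:

# Fetch the first page of 15 inquiry records and preview the DataFrame.
df = get_sse_one_page(page_num='1', page_size='15')
print(df.head())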
Example No. 13
def requests(url_list, progressbar):

    html = []

    with requests_html.HTMLSession() as session:
        for url in url_list:
            r = session.get(url)
            html.append(r.text)
            progressbar.UpdateBar(
                round(100 / len(url_list) * (url_list.index(url) + 1)))

    return html
Example No. 14
def htmlRequest(url: str) -> bytes:
    nvlog.debug("Starting requests session...")
    session = requests_html.HTMLSession()
    nvlog.info(f"Making GET request: url={url}")
    response = session.get(url)
    nvlog.info(f"Response received ({response.status_code})")
    nvlog.debug("Rendering dynamic content...")
    response.html.render()
    nvlog.debug("Rendered dynamic content")
    session.close()
    nvlog.debug("Session closed")
    return response.html.raw_html
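Note that `response.html.render()` relies on pyppeteer, which downloads a Chromium build the first time it runs. A hedged usage sketch with a placeholder URL:

# Hypothetical call: htmlRequest returns the rendered page as raw bytes.
raw = htmlRequest("https://example.com")
print(len(raw), "bytes of rendered HTML")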
Example No. 15
 def index_request(self):
     session = requests_html.HTMLSession()
     r = session.get(
         'https://cl.cfbf.xyz/thread0806.php?fid=20&search=&page=2')
     novel_list = r.html.find(
         '#ajaxtable > tbody:nth-child(2) > tr > td.tal > h3')
     novel_list = novel_list[5:-1]
     for x in novel_list:
         title = x.text
         for reallink in x.links:
             url = 'https://cl.cfbf.xyz/' + reallink
         self.content_request(title, url)
Example No. 16
    def login(self, username, password):
        """
        Creates an authenticated session.
        """

        self._html_session = requests_html.HTMLSession()
        self._html_session.get(url='https://api.librus.pl/OAuth/Authorization?client_id=46&response_type=code&scope=mydata')
        response = self._html_session.post(url='https://api.librus.pl/OAuth/Authorization?client_id=46',
                                           data={'action': 'login', 'login': username, 'pass': password})
        if not response.json().get('status') == 'ok' or not response.json().get('goTo'):
            raise RuntimeError("Login failed")
        self._html_session.get(url=urljoin(response.url, response.json()['goTo']))
Example No. 17
def main():
    with requests_html.HTMLSession() as sess:
        res = sess.get(SCRAPE_URL)

    state_links = get_state_links(res.html)
    states = list(state_links.keys())

    for state in states:
        global_storage[state] = list()

    for package in bunch(states, 5):
        aggregate_zip_codes(state_links, package)
Example No. 18
def regular_season(requests_date):
    with requests_html.HTMLSession() as session:
        session.headers = headers
        session.mount('http://', HTTPAdapter(max_retries=5))
        session.mount('https://', HTTPAdapter(max_retries=5))
        # Year/month list of the regular season to scrape
        url_format = 'http://nba.win0168.com/jsData/matchResult/%s/l1_1_20%s_10.js?version=2018112112' % (
            requests_date, requests_date[:2])
        r = session.get(url_format, timeout=6)
        # Format the year/month data
        year_month = map(lambda x: x.split(','),
                         r.html.search('ymList = [[{}]];')[0].split('],['))
        workbook = xw.Workbook('NBA.xlsx')
        worksheet = workbook.add_worksheet()
        worksheet.set_column('A:A', 15)
        worksheet.set_column('H:H', 15)
        worksheet.set_column('I:I', 15)
        worksheet.set_column('J:J', 15)
        worksheet.write(0, 0, "Event")
        worksheet.write(0, 1, "Time")
        worksheet.write(0, 2, "Team 1")
        worksheet.write(0, 3, "Team 2")
        worksheet.write(0, 4, "1st quarter")
        worksheet.write(0, 5, "2nd quarter")
        worksheet.write(0, 6, "3rd quarter")
        worksheet.write(0, 7, "4th quarter")
        worksheet.write(0, 8, "1st quarter")
        worksheet.write(0, 9, "2nd quarter")
        worksheet.write(0, 10, "3rd quarter")
        worksheet.write(0, 11, "4th quarter")
        worksheet.write(0, 12, "Half")
        worksheet.write(0, 13, "Full")

        starkey1 = 0
        for ym in year_month:
            # if ym == ['2018', '12']:
            #     return 0
            # Create a new Excel sheet, format: year - month.xlsx
            url = 'http://nba.win0168.com/jsData/matchResult/%s/l1_1_%s_%s.js?version=2018112112' % (
                requests_date, ym[0], ym[1])
            r = session.get(url, timeout=6)
            # Game IDs for this year/month
            play_id = map(lambda x: x.split(',')[0],
                          r.html.search('arrData = [[{}]];')[0].split('],['))
            play_id = list(play_id)
            # No data after the current game; truncate the list
            # if ym == ['2018', '11']:
            #     play_id = play_id[:play_id.index('325827')]
            # Create the workbook entries
            starkey1 = create_execl(play_id, worksheet, session, starkey1)
            # Close and save
            print('Done', ym[0], ym[1])
        workbook.close()
Example No. 19
def favicon(website):
    session = requests_html.HTMLSession()

    try:
        r = session.get(website,
                        headers={'user-agent': USER_AGENT},
                        timeout=TIMEOUT)
    except requests.exceptions.RequestException:
        r = None

    try:
        if r:
            favicon = r.html.find('link[rel="shortcut icon"]', first=True)
            if favicon is not None and "href" in favicon.attrs:
                return urllib.parse.urljoin(website, favicon.attrs['href'])

            favicon = r.html.find('link[rel="icon"]', first=True)
            if favicon is not None and "href" in favicon.attrs:
                return urllib.parse.urljoin(website, favicon.attrs['href'])
    except Exception:
        # LATER: logging
        r = None

    favicon = urllib.parse.urljoin(website, "favicon.ico")

    try:
        r = requests.get(favicon,
                         headers={'user-agent': USER_AGENT},
                         timeout=TIMEOUT)
    except requests.exceptions.RequestException:
        return "-"

    if r.status_code == 200 and 'content-type' in r.headers and r.headers[
            'content-type'].startswith("image/"):
        return favicon

    parts = urllib.parse.urlparse(website)
    if parts.path != '/':
        favicon = urllib.parse.urlunparse(
            (parts.scheme, parts.netloc, "/favicon.ico", "", "", ""))
        try:
            r = requests.get(favicon,
                             headers={'user-agent': USER_AGENT},
                             timeout=TIMEOUT)
        except requests.exceptions.RequestException:
            return "-"

        if r.status_code == 200 and r.headers['content-type'].startswith(
                "image/"):
            return favicon

    return "-"
Example No. 20
def login(username, password):
    user_data = {
        'account_name': username,
        'user_pwd': password,
        'remember_me': 'N'
    }
    session = requests_html.HTMLSession()
    login_res = session.post("https://www.mosoteach.cn/web/index.php?c=passport&m=account_login", data=user_data)
    result_code = login_res.json()['result_code']
    if result_code == 1007 or result_code == 1001:
        return None
    else:
        return login_res.cookies
Example No. 21
def _get_rendered_html_handler(url):
    '''
    get the rendered html version of the site -
    includes the modifications JS makes dynamically
    '''
    try:
        session = requests_html.HTMLSession()
        raw_html = session.get(url)
        raw_html.html.render()
        return BeautifulSoup(raw_html.html.html, 'html.parser')

    except requests.exceptions.ConnectionError:
        raise ValueError("Can't open playlist URL. Please check the URL again")
Example No. 22
    def content_request(self, title, url):
        session = requests_html.HTMLSession()
        r = session.get(url)

        content = r.html.find('#main > div:nth-child(4)')

        content = 'url:' + url + '\n' + content[0].text
        fileName = 'D:/cl/' + title + ".txt"
        if not os.path.exists('D:/cl'):
            os.mkdir('D:/cl')
        print("Saving novel file: " + fileName)
        with open(fileName, "w", encoding="utf-8") as f:
            f.write(content)
Example No. 23
def get_page(url: str) -> Response:
    headers: dict = {"User-Agent": requests_html.user_agent()}

    with requests_html.HTMLSession() as s:
        resp: Response = s.get(url, headers=headers)

        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(e)
            return None

        return resp
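A possible call; `get_page` returns `None` on an HTTP error, so the result should be checked before use:

# Illustrative usage with a placeholder URL.
resp = get_page("https://example.com")
if resp is not None:
    title = resp.html.find("title", first=True)
    print(title.text if title else "no <title> found")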
Example No. 24
 def __init__(self, encoding: str = None, headers=None, cookies: str = None):
     self._session = s = requests_html.HTMLSession()
     s.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
     s.mount('https://', requests.adapters.HTTPAdapter(max_retries=5))
     self._encoding = encoding
     if headers:
         self._session.headers.update(headers)
     if cookies:
         cs = {}
         for item in cookies.split(';'):
             key, value = item.split('=', 1)
             cs[key.strip()] = value.strip()
         self._session.cookies = requests.cookies.cookiejar_from_dict(cs)
Example No. 25
def redfin(url=None):
    if url is None:
        url = 'https://www.redfin.com/stingray/api/gis-csv?al=1&market=dc&max_price=500000&min_stories=1&num_homes=350&ord=redfin-recommended-asc&page_number=1&region_id=20065&region_type=6&sf=1,2,3,5,6,7&status=9&uipt=1,2,3,4,5,6&v=8'
    #url = 'https://www.redfin.com/city/20065/MD/Upper-Marlboro/filter/max-price=500k'
    with requests_html.HTMLSession() as session:
        r = session.get(url)
        #r.html.render()
        df = pd.read_csv(url)
        df.columns = df.columns.str.strip().str.lower().str.replace(
            ' ', '_').str.replace('(', '').str.replace(')', '')

        #data = pd.read_html(r.html.html)
    return df
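A brief usage sketch under the assumption that the default Redfin CSV endpoint is still reachable; pandas reads the CSV directly from the URL:

# Hypothetical call: returns a DataFrame with normalized column names.
listings = redfin()
print(listings.columns.tolist())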
Example No. 26
 def get_full_article(self):
     """Get the full article from the url
     """
     art_sess = requests_html.HTMLSession()
     art_res = art_sess.get(self.url)
     try:
         self.date = art_res.html.find('time', first=True).attrs['datetime']
         full_text = []
         for el in art_res.html.find('.article-main-body',
                                     first=True).find('p'):
             full_text.append(el.text)
         self.text = '\n'.join(full_text)
     except KeyError as e:
         print(f'KeyError: {e}')
Example No. 27
    def __init__(self, mlock, iqueue, oqueue, max_retry=3):
        super(GetWordProcess, self).__init__()
        self.lock = mlock
        self.iqueue = iqueue
        self.oqueue = oqueue
        self.lhp = LexicoHTMLParser()
        self.words_to_get = list()
        self.sess = requests_html.HTMLSession()
        self.max_retry = max_retry
        self.max_sleep = 3
        self.t_sleep = 20

        self.words_failed = dict()
        self.words_collected = list()
Example No. 28
async def lucario(ctx):
    """posts a random (safe) Lucario image from e621"""
    response = requests_html.HTMLSession().get(
        "https://e621.net/posts.json?"
        "tags=lucario+rating:safe+score:%3E=50+-type:mp4+-type:swf+-type:webm+-type:zip+order:random&limit=1"
    )
    json = response.json()
    post = next(iter(json["posts"]))
    id = post["id"]
    url = post["file"]["url"]
    embed = discord.Embed(title="a wild lucario appeared!")
    embed.set_image(url=post["file"]["url"])
    embed.set_author(name=post['id'], icon_url="https://e621.net/favicon.ico")
    await ctx.send(embed=embed)
Example No. 29
 def handle(self, *args, **options):
     session = requests_html.HTMLSession()
     r = session.get(CCEW_DATA_URL)
     for d in r.html.find("blockquote.download"):
         for p in d.find("p"):
             if "Charity register extract" in p.text:
                 links = p.absolute_links
                 for link in links:
                     f, created = CcewDataFile.objects.update_or_create(
                         title=d.find("h4", first=True).text,
                         defaults=dict(url=link, description=p.text),
                     )
                     print("{} ({})".format(
                         f, "created" if created else "updated"))
Example No. 30
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    session = requests_html.HTMLSession()
    for member in data["query"]["categorymembers"]:
        word = member["title"]
        date = member["timestamp"]
        if _skip_word(word, config.no_skip_spaces_word) or _skip_date(
            date, config.cut_off_date
        ):
            continue
        request = session.get(_PAGE_TEMPLATE.format(word=word), timeout=10)
        for word, pron in config.extract_word_pron(word, request, config):
            # Pronunciation processing is done in NFD-space;
            # we convert back to NFC afterwards.
            yield word, unicodedata.normalize("NFC", pron)