Example No. 1
def get_player_stats():
    player_stats = []
    composition_stats = []
    player_hero_stats = []
    session = HTMLSession()
    res = session.get(player_stats_url())
    category = 'Allmaps'
    map_name = None
    # Walk the sibling divs: plain text divs set the current category, map wrappers set
    # the current map name, and stat tables are parsed against both.
    for div in res.html.find('.match-div > div'):
        if 'map-wrapper' in div.attrs.get('class', []):
            map_name = div.find(
                '.label-info', first=True).text.lower().replace(' ', '_')
        elif 'side-by-side-stats' in div.attrs.get('class', []):
            composition_stat, hero_stat = parse_overall_hero_stat_div(
                div, category=category, map_name=map_name)
            composition_stats += composition_stat
            player_hero_stats += hero_stat
            player_stats += parse_overall_stat_div(
                div, category=category, map_name=map_name)
        else:
            category = div.text
    write_json('stats/composition_stats.json', composition_stats)
    write_json('stats/player_hero_stats.json', player_hero_stats)
    write_json('stats/player_stats.json', player_stats)
Example No. 2
def get_media(user):
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    headers = {
        'User-Agent': ua,
    }

    proxies = {
      'http': 'http://1.20.102.177:30106',
      'https': 'https://1.20.102.177:30106',
    }   
    url = 'https://www.instagram.com/' + user
    session = HTMLSession()
    req = session.get(url, headers=headers, proxies=proxies)
    
    media = []
    scripts = req.html.xpath('//script[@type]')    
    for s in scripts:
        content = s.text
        if "csrf_token" in content:
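            # Instagram embeds the profile data as JSON assigned to window._sharedData
            # in this inline script; drop the trailing ";" and parse the JSON payload.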
            content = content[:-1].split("window._sharedData = ")[1]      
            data = json.loads(content)     
            recent_media = data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["edges"]
            for r in recent_media:
                media.append({
                    "username": data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["username"],
                    "image": r["node"]["thumbnail_src"],
                    "timestamp": r["node"]["taken_at_timestamp"],
                    'permalink': r["node"]["display_url"],
                    'caption': r["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"],
                    'shortcode': r["node"]["shortcode"]
                })
    return media
Example No. 3
def main(dir: str):

    files = [x for x in os.listdir(dir) if x.lower()[-4:] == "json"]
    session = HTMLSession()
    os.makedirs(f"{dir}_solutions", exist_ok=True)
    for crossword_file in files:

        with open(os.path.join(dir, crossword_file)) as f:
            crossword = json.load(f)

        timestamp = crossword["dateSolutionAvailable"]
        year, month, day = parse_timestamp(timestamp)
        number = crossword["number"]
        crossword_type = crossword["crosswordType"]

        url = f"https://www.theguardian.com/crosswords/{year}/{month}/{day}/annotated-solutions-for-{crossword_type}-{number}"
        print(crossword["solutionAvailable"], url)
        result = session.get(url)
        if result.status_code >= 300:
            continue
        html = result.html
        relevant_divs = html.find("div.content__main-column.content__main-column--article.js-content-main-column")
        if len(relevant_divs) != 1:
            print(relevant_divs)

        solutions = [x.text for x in relevant_divs[0].find("p") if x.text]

        parsed = parse_solutions(solutions)
        save_name = os.path.join("crosswords/prize_solutions", f"{number}_solution.json")

        with open(save_name, "w+") as file:
            json.dump(parsed, file, indent=4)
Example No. 4
def show_datetime_for(name, url):
    session = HTMLSession()
    r = session.get(url)
    # '03:37:58'
    time = r.html.find('#ct', first=True).text
    # 'PDT'
    timezone = r.html.find('#cta', first=True).text
    # 'Saturday, 16 June 2018'
    date = r.html.find('#ctdat', first=True).text

    print(f'{name:12}: {time} {date} {timezone}')
Example No. 5
class MensaBase(object):

    def __init__(self, endpoints, location):
        """Constructor."""
        self.location = location
        # dict of language specific endpoints
        # { Language : url-string }
        self.endpoints = endpoints

        adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1))
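        # Cache responses for one day so repeated menu requests are served from the
        # local HTTP cache instead of hitting the endpoint again.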
        self.session = HTMLSession()
        self.session.mount('https://', adapter)

    def retrieve(self, datum=None, language=None, meals=None, emojize=None) -> Plan:
        # overwrite this
        # TODO how to make design more pythonic?
        # In Java terms: abstract class -> two implementation classes
        pass

    # Helper method to make a language-specific request
    def do_request(self, language=Language.DE):
        resp = self.session.get(self.endpoints[language.name])
        code = resp.status_code
        if code != 200:
            logger.warning(f'Non-200 status: {code}')
        logger.debug(f'Status Code: {code}')
        return resp.html

    @staticmethod
    def _normalize_key(k: str) -> str:
        return None if not k else k.strip().lower().replace(' ', '_')

    @staticmethod
    def _strip_additives(text: str) -> str:
        return re.sub(r'\((\s*(\d+)?[a-z]?[,.]?\s*)+\)', '', text)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        return re.sub(r'\s{2,}', ' ', text)

    @staticmethod
    def _normalize_orthography(text: str) -> str:
        return re.sub(r'\s,', ',', text)

    @staticmethod
    def _clean_text(text: str) -> str:
        return MensaBase._normalize_orthography(MensaBase._normalize_whitespace(MensaBase._strip_additives(text.strip())))

    @staticmethod
    def _text_replace(text: str) -> str:
        return re.sub('Züricher', "Zürcher", text)
Example No. 6
def fetch_ads(url='http://tankeogteknikk.no/qmedia/oslo.php'):
    """Crawl tankeogteknikk web site and fetch current ads"""
    try:
        r = HTMLSession().get(url)
    except RequestError:
        logger.exception('failed to fetch ads')
        return []

    r.raise_for_status()  # raise exception if 404 or other non ok http status
    subs = r.html.find('table.sub')
    ads = [_parse_sub_advert(sub) for sub in subs]
    for ad in ads:
        # use absolute and quoted urls
        ad['image'] = parse.quote(parse.urljoin(url, ad['image']), safe='/:')
    return ads
Example No. 7
    def handle(cls, *args, **kwargs):
        session = HTMLSession()

        for operator in Operator.objects.filter(service__current=True, twitter='').exclude(url='').distinct():
            try:
                r = session.get(operator.url, timeout=10)
            except RequestException:
                operator.url = ''
                operator.save()
                continue
            for link in r.html.links:
                twitter = cls.get_from_link(link)
                if twitter:
                    operator.twitter = twitter
                    operator.save()
                    break
Example No. 8
def get_teams_and_matches():
    session = HTMLSession()
    res = session.get(owl_index_url())
    res.html.render(timeout=60)
    match_rows = res.html.find(
        '.tab-pane#past')[0].find('table')[0].find('.past-matches-row')
    updated = True
    # TODO get match data in future
    # for row in match_rows:
    #     if parse_match_row(row):
    #         updated = True
    if updated:
        teams = {td.text: td.absolute_links.pop()
                 for td in res.html.find('td.team')}
        write_json('stats/team_hero_stats.json',
                   [parse_team(team_name, team_url) for team_name, team_url in teams.items()])
Example No. 9
def fetch_hpps_streamflow(dst_dir, url=None):
    """
    Fetch streamflow data from chmi fetch_hpps_data
    """
    session = HTMLSession()
    n_charts = 0

    datatype_prefix = 'streamflow'
    agency = 'chmi'

    pagesize = 50
    n_pages = 20

    for page in range(0, n_pages):
        subpage_url = "http://hydro.chmi.cz/hpps/hpps_oplist.php?startpos={0}&recnum={1}".format(page*pagesize, pagesize)
        print("----------------------------------------------------")
        print(subpage_url)
        print("----------------------------------------------------")
        r = session.get(subpage_url)
 
        for lnk in r.html.absolute_links:
            if 'prfdyn' in lnk:
                print(lnk)
                
                station_seq = lnk.split('=')[-1]
                print(station_seq)

                data_dir = dst_dir / datatype_prefix / agency / station_seq
                if not os.path.exists(data_dir):
                    os.makedirs(data_dir)
                utc_timestamp_text = datetime.utcnow().strftime('%Y-%m-%dT%H0000z.html')

                html_filename = "prfdata_" + station_seq + "_" + utc_timestamp_text
                html_path = data_dir / html_filename

                # save the HTML with seven-day table
                lnk_table = lnk.replace('prfdyn', 'prfdata')
                print(lnk_table)
                html_response = get(lnk_table)
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
Example No. 10
 def __init__(self):
     self.__page = 1
     self.__url = "http://www.mm131.com/qingchun/list_1_{}.html"
     self.__session = HTMLSession()
     self.__headers = {
         'Referer':'http://www.mm131.com/qingchun/',
         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
     }
     self.__imagePath = r'D:/Photo/MM'
     self.__confirmPath()
Example No. 11
    def __init__(self, endpoints, location):
        """Constructor."""
        self.location = location
        # dict of language specific endpoints
        # { Language : url-string }
        self.endpoints = endpoints

        adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1))
        self.session = HTMLSession()
        self.session.mount('https://', adapter)
Example No. 12
class MM(object):
    def __init__(self):
        self.__page = 1
        self.__url = "http://www.mm131.com/qingchun/list_1_{}.html"
        self.__session = HTMLSession()
        self.__headers = {
            'Referer':'http://www.mm131.com/qingchun/',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        self.__imagePath = r'D:/Photo/MM'
        self.__confirmPath()

    def __confirmPath(self):
        if not os.path.exists(self.__imagePath):
            os.makedirs(self.__imagePath)
            
    def download(self,link,fileName):
        try:
            with open(self.__imagePath+'/'+fileName+'.jpg','wb') as f:
                f.write(self.__session.request('get',link,headers = self.__headers,allow_redirects=False).content)
        except Exception as e:
            print(str(e))

    def parseData(self):
        start = time.time()
        while self.__page < 12:
            if self.__page == 1:
                self.__url = "http://www.mm131.com/qingchun/"
            else:
                self.__url = 'http://www.mm131.com/qingchun/list_1_{}.html'.format(self.__page)
            r = self.__session.get(self.__url)
            main = r.html.find(".main",first=True)
            dl = main.find('dl')[0]
            dds = dl.find('dd')
            for dd in dds[:-1]:
                attr = dd.find('img')[0].attrs
                imageLink = attr['src']
                title = attr['alt']
                self.download(imageLink,title)
            self.__page += 1
        end = time.time() - start
        print("Crawl time:", end)
Example No. 13
def get_hero_stats():
    hero_stats = []
    session = HTMLSession()
    res = session.post(hero_stats_url(), data={
                       'event[]': 86, 'teamcompTypes': 1})
    player_heros = []
    team_heros = []

    # ['gameNumber', 'roundtype', 'player', 'team', 'hero',
    # 'timePlayed', 'matchID', 'playerPic', 'playerName', 'teamPic',
    # 'nameCSFriendly', 'map', 'teamName']
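    # The page builds its stats inline via JS calls like heroStatsArr.concat([...]);
    # search_all captures each bracketed JSON argument so it can be parsed directly.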
    for result in res.html.search_all("heroStatsArr.concat({})"):
        player_heros += json.loads(result[0])

    # keys = ['gameNumber', 'roundtype', 'team', 'tcString',
    # 'gameWasPlayed', 'map', 'maptype', 'timePlayed', 'matchID']
    for result in res.html.search_all("teamcompsArr.concat({})"):
        team_heros += json.loads(result[0])
    write_json('stats/player_heros.json', player_heros)
    write_json('stats/team_heros.json', team_heros)
Example No. 14
    def __init__(self,url):
        # Override the parent class's __init__ method
        super(XHSpider, self).__init__()
        self.url = url

        self.session = HTMLSession()
        self.headers = {
            'Host':'news.daxues.cn',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        self.path = "D:/Photo/"
        self.check_file_path(self.path)
Example No. 15
def parse_match_row(row):
    match_path = os.path.join(
        'stats', 'matches', row.attrs['matchid'] + '.json')
    if os.path.exists(match_path):
        return False
    match = {}
    session = HTMLSession()
    match_res = session.get(match_url(row.attrs['matchid']))
    render_result = match_res.html.render(timeout=600)
    print(render_result)
    team_names = [{'name': team_name_div.text,
                   'id': team_name_div.links.pop().split('id=')[-1]} for team_name_div in match_res.html.find('.names-and-score', first=True).find('div')[1::2]]
    maps = []
    for map_div in match_res.html.find('.map-wrapper'):
        map_data = {'name': map_div.find(
            '.mapname', first=True).text, 'teams': []}
        mapping = {'name': 3, 'score': 4,
                   'progress': 5, 'fights': 6, 'kills': 7}
        for i in range(1, 3):
            team_data = {}
            for key, index in mapping.items():
                team_data[key] = map_div.find('div')[index].text.split('\n')[i]
            map_data['teams'].append(team_data)
        maps.append(map_data)
    stat_divs = match_res.html.find('.side-by-side-stats')
    overall_stats = parse_stat_div(stat_divs.pop(0))
    for i, map_stat_div in enumerate(stat_divs):
        maps[i]['stats'] = parse_stat_div(map_stat_div)

    hero_stats = parse_hero_stat_div(match_res.html.find(
        '#allMapsAllRoundsAllTeams', first=True))
    hero_stats_by_team = []
    # TODO FIX the script problem
    # for team in team_names:
    #     hero_stats_by_team.append(parse_hero_stat_div(match_res.html.find(
    #         '#allMapsAllRoundsTeam' + team['id'], first=True)))
    write_json(match_path, {'maps': maps, 'stats': overall_stats, 'hero_stats': hero_stats,
                            'hero_stats_by_team': hero_stats_by_team,
                            'teams': team_names, 'date': row.find('td')[0].text})
    return True
Example No. 16
def main(crossword_types: List[str]):

    session = HTMLSession()
    for crossword_type in crossword_types:

        if crossword_type not in CROSSWORDS.keys():
            raise ValueError(f"crossword type must be one of {CROSSWORDS.keys()}")
        start, end = CROSSWORDS[crossword_type]

        os.makedirs(f"crosswords/{crossword_type}", exist_ok=True)
        for crossword_no in reversed(range(start, end)):

            try:
                url = "https://www.theguardian.com/crosswords/" + crossword_type + "/" + str(crossword_no)
                result = session.get(url)
                if result.status_code >= 300:
                    continue
                html = result.html
                try:
                    relevant_divs = html.find("div.js-crossword")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]
                except:
                    relevant_divs = html.find("div.js-crossword.has-grouped-clues")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]

                clues_json = json.loads(clues)
                save_name = clues_json["id"] + ".json"

                with open(save_name, "w+") as file:
                    json.dump(clues_json, file, indent=4)
            except IndexError:
                print("couldn't find crossword no:{}".format(crossword_no))
                with open("crosswords/" + crossword_type + "/missing_ids.txt", "a+") as file:
                    file.write(str(crossword_no) + "\n")
Example No. 17
def get_event_player_rank():
    session = HTMLSession()
    res = session.get(player_rank_url())
    table = res.html.find('table.ranking-table', first=True)
    player_ranks = []
    hero_ranks = []
    for tr in table.find('tr')[2:]:

        overall_rank = int(tr.find('td.rank', first=True).text)
        overall_rating = int(tr.find('.rating-number', first=True).text)
        team_name = tr.find('.small-team-logo',
                            first=True).attrs['title'].split(': ')[-1]
        stars = int(tr.find(
            '.star-rating', first=True).attrs['class'][-1].replace('star', '').split('-')[0])
        info_div, heros_div = tr.find('.team-info-td > div')
        name = info_div.find('a', first=True).text
        time, sos_rank, win_percent = [div.text.split(
            ': ')[-1] for div in info_div.find('.secondary-stats')]
        rank_data = {'overall_rank': overall_rank, 'overall_rating': overall_rating, 'team_name': team_name,
                     'stars': stars, 'name': name, 'time': time, 'sos_rank': int(sos_rank),
                     'win_percent': win_percent, 'hero_ranks': []}
        for span in heros_div.find('span.secondary-ranking'):
            hero_name = span.attrs['title'].split(' Rank:')[0].lower()
            hero_rank_by_total, hero_rating, hero_time, hero_win_percent = [
                text.split(': ')[-1] for text in span.attrs['title'].split('\n')]
            hero_rank, total_count = hero_rank_by_total.split('/')
            hero_rank_data = {'hero_name': hero_name, 'hero_rank_by_total': hero_rank_by_total,
                              'hero_rating': int(hero_rating), 'hero_time': hero_time,
                              'hero_win_percent': hero_win_percent, 'hero_rank': int(hero_rank),
                              'total_count': int(total_count), 'name': name, 'overall_rank': overall_rank,
                              'overall_rating': overall_rating, 'team_name': team_name, 'stars': stars, }
            hero_ranks.append(hero_rank_data)
            rank_data['hero_ranks'].append(hero_rank_data)
        player_ranks.append(rank_data)
    write_json('stats/player_ranks.json', player_ranks)
    write_json('stats/hero_ranks.json', hero_ranks)
Example No. 18
def get(spell_id):
    session = HTMLSession()
    r = session.get("https://cn.wowhead.com/spell={}".format(spell_id))
    en_url = r.html.find('link[hreflang="en"]', first=True).attrs['href']
    zh_url = r.url
    return Spell(spell_id, parse(en_url), parse(zh_url))
Example No. 19
import os

from requests_html import HTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
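# requests_file's FileAdapter lets the session answer file:// URLs, so the tests below
# run against the local python.html fixture without any network access.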
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)

    return session.get(url)


def test_file_get():
    r = get()
    assert r.status_code == 200


def test_css_selector():
    r = get()

    about = r.html.find('#about', first=True)

    for menu_item in (
        'About', 'Applications', 'Quotes', 'Getting Started', 'Help',
        'Python Brochure'
    ):
        assert menu_item in about.text.split('\n')
Example No. 20
def fetch_vodagov_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from voda.gov.cz
    fetch_vodagov_charts(dst_dir='/home/jiri/meteodata',
                         agency='pod',
                         base_url='http://www.pvl.cz/portal/SaP/pc/?',
                         subpages=['oid=1', 'oid=2'],
                         datatype_prefix='streamflow')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                                               http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """

    #if datatype_prefix == 'streamflow':
        #pvl_base = 'http://sap.poh.cz/portal/SaP/pc/?'
    #else:
        #pvl_base = 'http://sap.poh.cz/portal/Srazky/PC/?'

    session = HTMLSession()
    n_charts = 0

    for subpage in subpages:

        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)

        for lnk in r.html.absolute_links:
            if 'Mereni.aspx?id=' in lnk or 'mereni.aspx?id=' in lnk:

                try:

                    r_st = session.get(lnk)

                    images = r_st.html.find('img')
                    for img in images:
                        if 'src' not in img.attrs:
                            continue
                        src = img.attrs['src']
                        if ('graf' in src or 'Graf' in src) and ('miniatury' not in src) and ("&" not in src) and (".ashx" not in src):

                            if 'maska' in src:
                                continue

                            img_src_absolute = urljoin(lnk, src)

                            img_response = get(img_src_absolute)
                            if img_response.status_code == 200:

                                img_dir = os.path.join(dst_dir, datatype_prefix, agency, os.path.splitext(os.path.basename(img_src_absolute))[0])
                                if not os.path.exists(img_dir):
                                    os.makedirs(img_dir)
                                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.png')

                                img_filename = os.path.basename(img_src_absolute).replace('.png', utc_timestamp_text)

                                img_path = os.path.join(img_dir, img_filename)
                                print(img_path)
                                with open(img_path, 'wb') as f:
                                    f.write(img_response.content)

                                # also save the HTML
                                html_path = img_path.replace('.png', '.html')
                                html_response = get(lnk)
                                if html_response.status_code == 200:
                                    print(html_path)
                                    with open(html_path, 'wb') as f:
                                        f.write(html_response.content)

                            n_charts += 1

                except ValueError:
                    print('ERROR fetching ' + lnk)
    return n_charts
Example No. 21
from requests_html import HTMLSession, HTMLResponse
import urllib.request

session = HTMLSession()  # create an HTMLSession object and store it in session
urls = ['http://books.toscrape.com/catalogue/page-1.html']

for i in range(1, 2):
    urls.append(f'http://books.toscrape.com/catalogue/page-{i}.html')
for url in urls:
    response = session.get(url)
    #print(response)
    #print(response.text)

    source = response.html  #store html file in response
    #print(type(source))
    #print(source)       #gives url
    #print(source.html)

    block = source.find('ol.row', first=True)  #first=True  ==> block[0]
    #print(block)

    names = block.find('li h3 a', first=True)
    print(names.attrs['title'])
    #print(names.text)........ gives the content of <a> tag

    #names = block.find('li h3 a',first=True)
    #print(names.attrs['href'])

    titles = []
    cost = []
Example No. 22
                    format=LOG_FORMAT)
logger = logging.getLogger()

# Test the logger
# logger.info("Our first message!")

# In[3]:

# pip3 install requests-html
from requests_html import HTMLSession
import re  # import the regular expression module
import zipfile
from zipfile import ZipFile
import os

session = HTMLSession()

# input&read
logger.debug("# Read the CIK and acc_no from console")
cik = input("Please input the CIK:")
acc_no_test = input("Please input the document accession number:")
print('The CIK and Acc_no you entered are:', cik, acc_no_test)

# CIK = '51143'
# acc_no = '000005114313000007/0000051143-13-000007'
logger.debug("# Get the HTML page")
CIK = cik
acc_no = acc_no_test
html_tail = '-index.html'
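# EDGAR filing index pages live at /Archives/edgar/data/<CIK>/<accession-number>-index.html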
url_company = "http://www.sec.gov/Archives/edgar/data/" + CIK + "/" + acc_no + html_tail
Example No. 23
from requests_html import HTML, HTMLSession

session = HTMLSession()

req = session.get('https://httpbin.org')

# print(req.status_code)
# print(req.text)

html = req.html

title = html.find('title', first=True)
print(title.text)
print("==============================================")

wrappers_first = html.find('div.wrapper', first=True)
print(wrappers_first.html)

# print("==============================================")

# info = html.find('div.info')
# # print(info)

# print("==============================================")
# for inf in info:
#     info_head = inf.find('h2', first=True)
#     print(info_head.text)

# wrappers = html.find('div.wrapper')
# print(wrappers)
# print("==============================================")
Example No. 24
 def __init__(self):
     self.showing = None
     self.coming_up = None
     self.session = HTMLSession()
Example No. 25
import os
from datetime import date
from requests_html import HTMLSession
from bs4 import BeautifulSoup

if __name__ == '__main__':

    # Scrape Anaconda main page for latest version of Python, with link to list
    session = HTMLSession()
    url = 'https://docs.anaconda.com/anaconda/packages/pkg-docs/'
    r = session.get(url, verify=False)
    links = list(r.html.absolute_links)

    link = ''.join(sorted([
        i for i in links if "win-64" in i
    ])[-1])  # search for latest win-64 version package list and save link

    # Scrape website with Anaconda package list using latest list
    r = session.get(link, verify=False)

    # Get the table of packages
    bs = BeautifulSoup(r.text, "lxml")
    tr_elements = bs.find_all('table')

    # Reduce to package names, removing HTML formatting strings
    for row in tr_elements:
        elements = row.find_all('td')
        elements = [x.text.strip() for x in elements]

    # Every 4th item is package name in table
    pkgs = []
Example No. 26
The following code uses the requests_html library to search for wallpapers and download them.
Install command: pip install requests-html
'''

from requests_html import HTMLSession
import requests

# Save the image to the picture/ directory
def save_image(url, title):
    img_response = requests.get(url)
    with open('./picture/'+title+'.jpg', 'wb') as file:
        file.write(img_response.content)

# Wallpaper listing page; 1920*1080 wallpapers are selected here
url = "http://www.win4000.com/wallpaper_2358_0_10_1.html"

session = HTMLSession()
r = session.get(url)

# Find the wallpaper thumbnails on the page, open each detail page, and get the full-size image URL
items_img = r.html.find('ul.clearfix > li > a')
for img in items_img:
    img_url = img.attrs['href']
    if "/wallpaper_detail" in img_url:
        r = session.get(img_url)
        item_img = r.html.find('img.pic-large', first=True)
        url = item_img.attrs['src']
        title = item_img.attrs['title']
        print(url+title)
        save_image(url, title)
Example No. 27
class F5Downloads:
    def __init__(self, username, password, default_location='IRELAND'):
        self.username = username
        self.password = password
        self.default_location = default_location
        self._session = None
        self._version_pages = None
        self.new_files = []

    @property
    def session(self):
        if not self._session:
            self._session = HTMLSession()
            self._session.post(
                'https://api-u.f5.com/auth/pub/sso/login/user',
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                data={
                    'userid': self.username,
                    'passwd': self.password,
                })
        return self._session

    def find_links(self, page, pattern):
        return [(l.text, next(iter(l.absolute_links)))
                for l in page.html.find('a')
                if l.text and l.absolute_links and re.match(pattern, l.text)]

    def follow_specific_link(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']

        matching_links = self.find_links(page, pattern)

        # To proceed in the chain we need exactly one match
        if len(matching_links) != 1:
            logger.error(
                f'Found {len(matching_links)} matches for pattern {pattern} on {page.url}, unable to proceed'
            )
            logger.error('Files found:')
            logger.error(matching_links)
            raise Exception(f'Expected exactly one link matching {pattern}')

        name, url = matching_links[0]
        logger.debug(f'Following {name} with {url}')
        return self.get_page(url)

    def pick_latest_version(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']

        matching_links = self.find_links(page, pattern)

        if not len(matching_links):
            raise Exception(
                f'No versions matching {pattern} found on page {page}')

        versionDict = {}

        # This is an ugly one: strip the dots from each version string and compare the
        # resulting integers, then pick the largest one.
        for version, url in matching_links:
            number = version.replace('.', '')
            versionDict[number] = (version, url)

        # Pick the highest number
        version, url = versionDict[max(versionDict, key=int)]
        logger.debug(f'Picking {version} as latest version')

        return self.get_page(url)

    def follow_path(self, page, steps):

        step = steps.pop(0)
        f = step['f']
        args = step['args'] | {'page': page}

        result = f(**args)

        if not len(steps):
            return result
        elif result:
            return self.follow_path(result, steps)

    # Detect if the EULA exists and circle around it
    def get_page(self, url):
        page = self.session.get(url)
        if len(page.html.find('input#accept-eula')):
            logger.debug('EULA encountered, accepting it')
            page = self.session.get(
                url.replace('https://downloads.f5.com/esd/ecc.sv',
                            'https://downloads.f5.com/esd/eula.sv'))
        return page

    def download_files(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        download_folder = kwargs['download_folder']
        cb = kwargs['cb']

        # Create folders if needed
        pathlib.Path(download_folder).mkdir(parents=True, exist_ok=True)

        matching_links = self.find_links(page, pattern)

        for name, url in matching_links:
            md5_name, md5_url = next(
                iter(self.find_links(page, rf'^{name}.md5$')), (None, None))

            # Only download if there's a matching md5 file
            if not md5_name:
                raise Exception(f'No matching md5 file found for {name}')

            file_path = f'{download_folder}{name}'
            md5_path = f'{download_folder}{md5_name}'
            self.download_file(md5_path, md5_url)

            if self.md5_sum_ok(md5_path, file_path):
                logger.info('The newest file already exists on disk')
                return file_path
            else:
                self.download_file(file_path, url)
                logger.info(f'Validating {name} against the supplied md5')
                if self.md5_sum_ok(md5_path, f'{download_folder}{name}'):
                    logger.info('Downloaded file successfully')
                    if cb:
                        cb(file_path)
                    return (file_path)
                else:
                    raise Exception(f'Failed to download file {name}')

    def md5_sum_ok(self, md5_file, file):
        if not os.path.exists(md5_file):
            raise Exception(f'{md5_file} does not exist')
        if not os.path.exists(file):
            logger.info(f'{file} does not exist')
            return False
        with open(md5_file, 'r') as f:
            md5sum = re.sub(r' .+\n$', '', f.read())
        file_sum = self.md5(file)

        return md5sum == file_sum

    def md5(self, file_name):
        hash_md5 = hashlib.md5()
        with open(file_name, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def download_file(self, file_path, url):
        if os.path.exists(file_path):
            os.remove(file_path)
        page = self.get_page(url)
        name, download_url = next(
            iter(self.find_links(page, rf'{self.default_location}')),
            (None, None))
        if (download_url):
            logger.debug(f'Saving file as ./{file_path}')
            with self.session.get(download_url, stream=True) as r:
                r.raise_for_status()
                with open(file_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

    def download_geoipdb(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {
                    'pattern': rf'BIG-IP v{version}.x.+'
                },
            }, {
                'f': self.follow_specific_link,
                'args': {
                    'pattern': r'GeoLocationUpdates',
                }
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^ip-geolocation-.+\.zip$',
                    'download_folder': f'./downloads/GeoIP/v{version}/',
                    'cb': cb
                }
            }])

    def download_latest_version(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {
                    'pattern': rf'BIG-IP v{version}.x.+'
                },
            }, {
                'f': self.pick_latest_version,
                'args': {
                    'pattern': rf'^{version}[\.0-9]+$',
                }
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^BIGIP-{version}[\.0-9]+.+iso$',
                    'download_folder': f'./downloads/BIG-IP/v{version}/',
                    'cb': cb
                }
            }])
Example No. 28
 def __init__(self):
     self.session = HTMLSession()
     self.x_guest_token = None
     self.headers = {"User-Agent": USER_AGENT}
Example No. 29
class TwitterScrap:
    def __init__(self):
        self.session = HTMLSession()
        self.x_guest_token = None
        self.headers = {"User-Agent": USER_AGENT}

    def get_profile(self, username):
        profile = Profile()
        profile.profile_url = f"https://twitter.com/{username}/"

        self.__get_token(profile.profile_url)

        self.headers["x-guest-token"] = self.x_guest_token
        self.headers[
            "Authorization"] = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"

        self.headers["Referer"] = profile.profile_url
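        # The request below hits Twitter's GraphQL UserByScreenName endpoint,
        # authorized by the hard-coded bearer token plus the scraped guest token.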

        params = {
            'variables':
            json.dumps({
                'screen_name': username,
                'withHighlightedLabel': True
            },
                       separators=(',', ':'))
        }

        prepared_request = self.session.prepare_request(
            requests.Request(
                "GET",
                "https://api.twitter.com/graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName",
                headers=self.headers,
                params=urllib.parse.urlencode(params,
                                              quote_via=urllib.parse.quote)))
        res = self.session.send(prepared_request,
                                allow_redirects=True,
                                timeout=15)
        profile.parse_profile(res.json())
        return profile

    def get_tweets(self,
                   profile: Profile,
                   include_replies=False,
                   include_rt=False,
                   count=40):
        prepared_request = self.session.prepare_request(
            requests.Request(
                "GET",
                f"https://twitter.com/i/api/2/timeline/profile/{profile.id}.json",
                headers=self.headers,
                params=urllib.parse.urlencode(get_params(
                    profile.id, include_replies, count),
                                              quote_via=urllib.parse.quote)))
        res = self.session.send(prepared_request,
                                allow_redirects=True,
                                timeout=10)
        if res.status_code != 200:
            raise Exception(
                f"Could not get tweets, status_code {res.status_code}")

        data = res.json()
        tweets = parse_tweets(data, include_rt, profile.id)
        return tweets

    def __get_token(self, url):
        if self.x_guest_token is None:
            request = self.session.prepare_request(
                requests.Request("GET", url, headers=self.headers))
            res = self.session.send(request, allow_redirects=True, timeout=20)
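            # The guest token is embedded in the page source as
            # decodeURIComponent("gt=<digits>; Max-Age=10800; ...").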
            possible_token = re.search(
                r'decodeURIComponent\("gt=(\d+); Max-Age=10800;', res.text)
            if possible_token:
                self.x_guest_token = possible_token.group(1)
                self.session.cookies.set("gt",
                                         self.x_guest_token,
                                         domain='.twitter.com',
                                         path='/',
                                         secure=True,
                                         expires=time.time() + 10800)
            else:
                raise Exception(
                    f"Could not retrieve guest token, status_code {res.status_code}"
                )
        else:
            return
Example No. 30
import os
from functools import partial

import pytest
import psutil
from pyppeteer.browser import Browser
from pyppeteer.page import Page
from requests_html import HTMLSession, AsyncHTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)

    return session.get(url)


@pytest.fixture
def async_get(event_loop):
    """ AsyncSession cannot be created global since it will create
        a different loop from pytest-asyncio. """
    async_session = AsyncHTMLSession()
    async_session.mount('file://', FileAdapter())
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)

    return partial(async_session.get, url)
Example No. 31
def scrape(url):
    """
    Scrape comments from Amazon.com and write them (comments and ratings) into a csv file
    :param url: the link from Amazon
    :return: None
    """
    hs = HTMLSession()

    try:
        url = url.replace("dp", "product-reviews")
    except Exception as e:
        print(e)
        quit()

    r = hs.get(
        url=url,
        headers={
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding':
            'gzip, deflate, br',
            'accept-language':
            'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
            'sec-fetch-dest':
            'document',
            'sec-fetch-mode':
            'navigate',
            'sec-fetch-site':
            'none',
            'sec-fetch-user':
            '******',
            'upgrade-insecure-requests':
            '1',
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
        })

    comments = r.html.find('div.a-section.review.aok-relative')

    fw = open('reviews.csv', 'a', encoding='utf8')  # output file
    writer = csv.writer(fw, lineterminator='\n')

    for a in comments:

        comment, star = 'NA', 'NA'  # initialize comment text and star rating

        commentChunk = a.find(
            'span.a-size-base.review-text.review-text-content > span')
        if commentChunk: comment = commentChunk[0].text.strip()

        starChunk = a.find('i > span.a-icon-alt')
        if starChunk: star = starChunk[0].text.strip()

        # star = a.find('i > span.a-icon-alt')[0].text
        # comment = a.find('span.a-size-base.review-text.review-text-content > span')[0].text

        writer.writerow([comment, star])

    fw.close()
    sleep(.75)
    pagination(r)

    r.close()
Example No. 32
import re
from requests_html import HTMLSession, HTML
from datetime import datetime

session = HTMLSession()


def get_tweets(user, pages=25):
    """Gets tweets for a given user, via the Twitter frontend API."""

    url = f'https://twitter.com/i/profiles/show/{user}/timeline/tweets?include_available_features=1&include_entities=1&include_new_items_bar=true'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': f'https://twitter.com/{user}',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
        'X-Twitter-Active-User': '******',
        'X-Requested-With': 'XMLHttpRequest'
    }

    def gen_tweets(pages):
        r = session.get(url, headers=headers)

        while pages > 0:
            try:
                html = HTML(html=r.json()['items_html'],
                            url='bunk',
                            default_encoding='utf-8')
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{user}" does not exist or is private.')
Example No. 33
def new_search(request):
    search = request.POST.get('search')
    models.Search.objects.create(search=search)
    final_url = BASE_URL.format(quote_plus(search))
    final_url2 = BASE_URL2.format(requote_uri(search))
    print(final_url)
    print(final_url2)

    response = requests.get(final_url)
    headers = requests.utils.default_headers()
    headers.update({'user-agent': 'GoogleChrome'})

    response2 = requests.get(final_url2.replace(u'\ufeff', ''))

    headers = requests.utils.default_headers()
    headers.update({'user-agent': 'GoogleChrome'})


    session = HTMLSession()
    resp = session.get(final_url2)
    print(resp)
    soup4 = BeautifulSoup(resp.html.html, "lxml")
    #image = soup4.find_all(class_ = 'BaseCardstyle__ListingPosterMainWrapper-LdsSD KSRNa' )
    image2 = soup4.find_all(class_= 'img-wrapper')
    for gambar in image2:
        link_gambar = gambar.find(class_ = 'Rumah Cimanggis Pinggir Jalan Dekat Pintu Tol di Depok, Cimanggis, Depok 1')
        print(link_gambar)

    final_post = []
    final_post2 = []
    print(final_post2)


    data = response.text
    soup = BeautifulSoup(data, features='lxml')
    post_listing = soup.find_all(class_='card ListingCell-content js-MainListings-container ListingCell-wrapper')

    data2 = response2.text

    soup2 = BeautifulSoup(data2,features='lxml')
    post_listing2 = soup2.find_all(class_= 'BaseCardstyle__ListingContainer-pryVa gCOzDl')

    print('*' * 100)

    for post in post_listing:
        post_title = post.find(class_='ListingCell-KeyInfo-title').text
        post_title2 = post_title.strip()


        post_link = post.find('a').get('href')

        if post.find(class_='PriceSection-FirstPrice'):
            post_price = post.find(class_='PriceSection-FirstPrice').text
        else:
            post_price = 'N/A'

        if post.find(class_='ListingCell-image'):
            post_image_id = post.find(class_='ListingCell-image').img['data-src']
            #post_image_url = BASE_IMAGE_URL.format(post_image_id)
        else:
            post_image_id = 'https://craigslist.org/images/peace.jpg'

        final_post.append((post_title2, post_price, post_link, post_image_id))

    #part rumah123.com

    for postrumah123 in post_listing2:
        post_titlerumah123 = postrumah123.find(class_='BaseCardstyle__ListingTitleWrapper-bFjnJr hTMjgq')
        if post_titlerumah123 is not None:
            post_titlerumah123 = post_titlerumah123.text
        else:
            post_titlerumah123 = 'rumah dijual di daerah strategis'


        post_linkrumah123 = postrumah123.find('a').get('href')

        if '.com' not in post_linkrumah123:
            post_linkrumah123_revisi = urljoin(PARENT_URL2, post_linkrumah123)
        else:
            post_linkrumah123_revisi = post_linkrumah123


        if postrumah123.find(class_='listing-primary-price ListingPrice__Wrapper-FYsEL cpaEEX'):
            post_pricerumah123 = postrumah123.find(class_='listing-primary-price-item ListingPrice__ItemWrapper-egelzL fnIFZc').get_text()
        else:
            post_pricerumah123 = 'N/A'


        final_post2.append((post_titlerumah123, post_pricerumah123, post_linkrumah123_revisi,))

        #else:
         #   post_image_idrumah123 = 'https://craigslist.org/images/peace.jpg'
        #print(post_pricerumah123)
        #final_post2.append((post_titlerumah123_2, post_pricerumah123, post_linkrumah123))

    for_frontend = {
        'search': search,
        'final_post': final_post,
        'final_post2': final_post2
    }

    return render(request, 'apaan/new_search.html', for_frontend)
Example No. 34
    def search_data(self, crawl_rules, arguments):
        if crawl_rules['method'] == 'post':
            crawl_rules = self.parse_rules(crawl_rules, arguments['keyword'], 'form_data')

            crawl_rules['form_data'] = json.loads(crawl_rules['form_data'])

            res = requests.post(crawl_rules['request_url'], data=crawl_rules['form_data'])

            try:
                content = json.loads(res.text)
            except json.decoder.JSONDecodeError:
                print("Check your keyword and parameter")
                print(res.text)
                return 0

            if crawl_rules['result_list_param'] is not None:
                last_page = int(content[crawl_rules['result_list_param']][0][crawl_rules['result_total_page_param']])
                result_data = []
                for line in content[crawl_rules['result_list_param']]:
                    result_data.append(
                        str(line[crawl_rules['result_code_param']]) + " " + str(line[crawl_rules['result_name_param']]))
                for i in range(2, last_page + 1):
                    crawl_rules['form_data'][crawl_rules['result_current_page_param']] = i
                    res = requests.post(crawl_rules['request_url'], data=crawl_rules['form_data'])
                    if type(res.content) is bytes:
                        content = json.loads(res.content.decode('utf8'))

                    for line in content[crawl_rules['result_list_param']]:
                        result_data.append(
                            str(line[crawl_rules['result_code_param']]) + " " + str(
                                line[crawl_rules['result_name_param']]))
        elif crawl_rules['method'] == 'get':
            crawl_rules = self.parse_rules(crawl_rules, arguments['keyword'], 'request_url')

            sess = HTMLSession()

            res = sess.get(crawl_rules['request_url'])

            try:
                res.html.render()
            except MaxRetries:
                print("MaxRetries...")
                print('Do you want to reload?')
                ans = input('(Y/N) << ').lower()
                if ans in ['yes', 'y']:
                    self.get_get_data(crawl_rules, arguments)
                elif ans in ['no', 'n']:
                    return 0

            soup = BeautifulSoup(res.html.html, 'lxml')

            if arguments['stage'] == "select":
                result_data = ''
                if crawl_rules['label_css_path'] is not None:
                    result_data += re.sub(r"[\n]", " ", soup.select(crawl_rules['label_css_path'])[0].text) + ": "
                result_data += re.sub(r"[^\d\.%]", "", soup.select(crawl_rules['value_css_path'])[0].text)
                # result_data = soup.select(crawl_rules['value_css_path'])[0].text
            else:
                result_data = soup.select(crawl_rules['value_css_path'])[0].text

        return result_data
Example No. 35
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import csv

#urls that will be parsed
urls = [
    'https://iaclarington.com/en/price-performance/fund?fund_id=4201&series=4401',
    'https://iaclarington.com/en/price-performance/fund?fund_id=4303&series=7400',
    'https://iaclarington.com/en/price-performance/fund?fund_id=4509',
    'https://iaclarington.com/en/price-performance/fund?fund_id=4215'
]

session = HTMLSession()

# handling the csv file
csv_file = open('performance.csv', 'w', newline='')
csv_writer = csv.writer(csv_file)
csv_writer.writerow([
    "Name", "Value", "Performance Date", "Performance (1mo)",
    "Performance (3mo)"
])

#iterating through the url list
for url in urls:
    print(url)
    response = session.get(url)
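    # render() runs the page's JavaScript in headless Chromium (via pyppeteer)
    # before the resulting HTML is handed to BeautifulSoup.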
    response.html.render()

    soup = BeautifulSoup(response.html.html, 'lxml')

    main = soup.find('div', class_='main-content')
Example No. 36
import numpy as np
import os.path
import cv2
import pandas as pd
import time
import json
import sys
from requests_html import HTMLSession
s = HTMLSession()
import time

z = int(sys.argv[1])

if not os.path.exists("all_ships.zoom=%s" % z):
    os.mkdir("all_ships.zoom=%s" % z)


# A function to convert a string value to the most suitable type for that value
# - str, int or float
def mangle_type(val):
    try:
        if str(int(val)) == val.strip(): return int(val)
    except:
        pass
    try:
        if str(float(val)) == val.strip(): return float(val)
    except:
        pass
    return val

Example No. 37
def parse_team(team_name, team_url):
    session = HTMLSession()
    comps = session.get(team_url).html.find('.team-comp-wrapper > .team-comp')
    return {'id': team_url.split('id=')[-1], 'name': team_name,
            'heros': [{'win_rate': div.text,
                       'heros': [span.attrs['title'].replace('soldier76', 'soldier-76')
                                 for span in div.find('span')]}
                      for div in comps]}
Example No. 38
 def __init__(self):
     """ INITIALISATION
     """
     self.session = HTMLSession() # Our "virtual browser"
Example No. 39
def parse_single_url(url, region):
    with HTMLSession() as session:
        response = session.get(url=url)
        response = response.html

    try:
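        # The listing page embeds coordinates in an inline script as
        # "location":{"latitude":..,"longtitude":..} (the site's own spelling),
        # so they are pulled out with regular expressions.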
        result = re.findall(
            r'\"location\":{\"latitude\":\d{2}[.]\d{0,7},\"longtitude\":\d{2}[.]\d{0,7}',
            response.xpath("/html/body/script[19]/text()")[0])[0]

        latitude = re.findall(r'\"latitude\":\d{2}[.]\d{0,7}',
                              result)[0].split(":")[1]
        longtitude = re.findall(r'\"longtitude\":\d{2}[.]\d{0,7}',
                                result)[0].split(":")[1]
    except BaseException:
        latitude = "Null"
        longtitude = "Null"

    area = "No information"
    floor = "No information"

    try:
        for value in response.xpath(
                "//li[@class='card-living-content-params-list__item']"):
            text = value.text
            if "площадь" in text:
                area = text.split('площадь')[1].split()[0]
            elif "Этаж" in text:
                floor = text.split('Этажей')[1]
    except BaseException:
        pass

    try:
        who = response.xpath("//div[@class='offer-card-contacts__person _type']")[0].text + ", " + \
              response.xpath("//div[@class='offer-card-contacts__person']//span[@class='ui-kit-link__inner']")[0].text
    except BaseException:
        who = "No information"

    try:
        price = response.xpath("//div[@class='price']")[0].text.split(
            "₽")[0].replace(" ", "")
    except BaseException:
        price = "No information"

    try:
        link = \
            response.xpath("//a[@class='ui-kit-link offer-card-contacts__owner-name _type-common _color-blue']/@href")[
                0]
    except IndexError:
        host = url.split(".ru/")[0] + ".ru"
        link = host + response.xpath(
            "//a[@class='ui-kit-link offer-card-contacts__link _agency-name _type-common _color-blue']/@href"
        )[0]
    try:
        contacts = response.xpath(
            "//a[@class='offer-card-contacts-phones__phone']/@href")[0].split(
                "+")[1]
    except BaseException:
        contacts = "No information"

    info = {
        "Регион": region,
        "Ссылка на объявление": url,
        "Продажа/Аренда": "Продажа",
        "Тип": "Коттеджи",
        "Широта/Долгота": latitude + ", " + longtitude,
        "Площадь": area,
        "Этаж": floor,
        "Цена": price,
        "название агенства недвижимости": who,
        "Ссылка на агенство": link,
        "Телефон агенства": contacts,
        "Дата скачивания": time,
    }

    with locker:
        with open(write_to, mode='a') as write_file:
            fieldnames = [
                "Регион", "Ссылка на объявление", "Продажа/Аренда", "Тип",
                "Широта/Долгота", "Площадь", "Этаж", "Цена",
                "название агенства недвижимости", "Ссылка на агенство",
                "Телефон агенства", "Дата скачивания"
            ]

            writer = csv.DictWriter(write_file,
                                    delimiter=';',
                                    fieldnames=fieldnames)
            writer.writerow(info)
Example No. 40
class LostFilmParser:
    source_url = 'https://www.lostfilm.tv/'
    tv_shows_list_part_url = 'https://www.lostfilm.tv/ajaxik.php'
    part_step = 10

    def __init__(self):
        self.session = HTMLSession()
        self.news_data = self.session.get(self.source_url)

    def get_links(self):
        return self.news_data.html.links

    def get_title_en(self, href):
        try:
            result = search(r'/series/([^/]+)/', href)
            title_en = result.group(1)
            tv_show_link = self.source_url.rstrip('/') + result.group()
        except AttributeError:
            title_en = None
            tv_show_link = None
        return title_en, tv_show_link

    def get_new_shows_episodes(self):
        clear_data = []
        news_block = self.news_data.html.find('.new-movies-block', first=True)
        movies = news_block.find('a.new-movie')
        for movie in movies:
            title_en, show_link = self.get_title_en(movie.attrs['href'])
            clear_data.append(
                {
                    'title_ru': movie.attrs['title'],
                    'title_en': title_en,
                    'jpg': 'http:' + movie.find('img', first=True).attrs['src'],
                    'season': movie.find('.title', first=True).text,
                    'date': movie.find('.date', first=True).text,
                    'episode_link': self.source_url.rstrip('/') + movie.attrs['href'],
                    'tv_show_link': show_link,
                }
            )
        return clear_data

    def load_part_list(self, step):
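        # Request one chunk of the show list from ajaxik.php; 'o' acts as the offset
        # and is advanced by part_step (10) on each call.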
        url = self.source_url + 'ajaxik.php'
        request_data = self.session.post(
            url=url,
            data={'act': 'serial', 'o': step, 's': 3, 't': 0, 'type': 'search'}
            )
        return json.loads(request_data.content)['data']

    def get_tv_shows_list(self):
        """10->20->30-> пока не вернет пустой список"""
        step = 0
        shows_list = []
        request_result = self.load_part_list(step)
        while request_result:
            for result in request_result:
                shows_list.append(result)
            step += self.part_step
            sleep(1)
            request_result = self.load_part_list(step)
        return shows_list
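
# A minimal usage sketch, assuming the LostFilmParser class above is in scope
# (along with its module-level imports: HTMLSession, re.search, time.sleep, json).
if __name__ == '__main__':
    parser = LostFilmParser()

    # newest episodes scraped from the front-page "new movies" block
    for episode in parser.get_new_shows_episodes():
        print(episode['date'], episode['title_ru'], episode['episode_link'])

    # complete show list, fetched 10 entries at a time from ajaxik.php
    shows = parser.get_tv_shows_list()
    print(len(shows), 'shows found')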
Exemplo n.º 41
0
    def __init__(self):
        self.session = HTMLSession()
Exemplo n.º 42
0
class Scraper:

    # Initializes the scraper C3PO
    def __init__(self, url, budget, u_email):

        # Attributes about product
        self.url = url
        self.budget = budget

        # Setting user email
        self.u_email = u_email

        # Attributes about scraping
        self.session = HTMLSession()
        self.webpage = self.session.get(self.url).content
        self.parser = 'lxml'
        self.soup = BeautifulSoup(self.webpage, self.parser)

    # Prints the object
    def __str__(self):
        return self.soup.prettify()

    # Stores the title of the product
    def get_title(self):
        try:
            temp_title = self.soup.find('span', id='productTitle').text.strip()
            temp_list_title = []
            for x in temp_title:
                if x == '(':
                    break
                temp_list_title.append(x)
            self.product_title = ''.join(temp_list_title)
            return self.product_title
        except Exception:
            print("\n")
            print("ERROR - We weren't able to find the name of the product")
            print("\n")
            print("Exiting the script")
            exit()

    # Stores the price of the product after filtering the string and
    # converting it to an integer
    def get_price(self):
        price_raw = self.soup.find('span',
                                   id='priceblock_ourprice').text.strip()
        price_filtered = price_raw[2:len(price_raw) - 3]
        self.product_price = int(''.join(
            [x for x in price_filtered if x != ',']))
        return

    # Prints product title
    def print_title(self):
        print(self.product_title)
        return

    # Prints product price
    def print_price(self):
        print(self.product_price)
        return

    # Checks if the price of the product is below the budget
    def is_below_budget(self):
        return self.product_price <= self.budget

    # Runs the scraper
    def run(self):

        self.get_title()
        self.get_price()
        self.alert = self.is_below_budget()
        self.status = False
        if self.alert:
            self.status = self.send_email()
        return self.status

    # Sends an email when the condition is satisfied. Under testing!
    def send_email(self):

        # Attributes for email sending
        port = 587
        smtp_server = 'smtp.gmail.com'
        self.email = str(os.environ.get('DEVELOPER_MAIL'))
        self.app_pw = str(os.environ.get('DEVELOPER_PASS'))

        # Message details
        subject = f'The price of {self.get_title()} is within your budget!'

        body_start = """Hey there!\n
        The price is now within your budget. Here is the link, buy it now!\n"""
        body_mid = self.url
        body_end = '\n\nRegards\nYour friendly neighbourhood programmer'
        body = str(body_start) + str(body_mid) + str(body_end)

        message = f"Subject: {subject}\n\n{body}"

        # Establishing server
        context = ssl.create_default_context()
        self.server = smtplib.SMTP(smtp_server, port)

        # Mail sending
        self.server.ehlo()
        self.server.starttls(context=context)
        self.server.ehlo()
        self.server.login(self.email, self.app_pw)

        self.server.sendmail(self.email, self.u_email, message)

        print("Email sent successfully!")
        self.server.quit()
        return True
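
# A minimal driver sketch, assuming the Scraper class above is in scope, that
# DEVELOPER_MAIL / DEVELOPER_PASS are set in the environment, and that the URL,
# budget and recipient below are illustrative placeholders.
if __name__ == '__main__':
    c3po = Scraper(url='https://www.amazon.in/dp/XXXXXXXXXX',
                   budget=15000,
                   u_email='buyer@example.com')
    emailed = c3po.run()  # scrapes title and price, mails if within budget
    print('Alert sent' if emailed else 'Price still above budget')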
Exemplo n.º 43
0
def fetch_pmo_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from pmo (Povodi Moravy) water board
    fetch_pmo_charts(dst_dir='/home/jiri/meteodata',
                     agency='pmo',
                     base_url='http://www.pmo.cz/portal/srazky/en/',
                     subpages=['prehled_tab_1_chp.htm', 'prehled_tab_2_chp.htm', 'prehled_tab_3_chp.htm'],
                     datatype_prefix='precip')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                                               http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """

    agency = "pmo"

    session = HTMLSession()
    n_charts = 0

    for subpage in subpages:
        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)

        anchors = r.html.find('a')
        a_hrefs = [a for a in anchors if "DoMereni" in a.attrs.get("href", "")]
        for a in a_hrefs:
            id = a.attrs["href"].split("'")[1]
            url_html = '{:s}/en/mereni_{:s}.htm'.format(base_url, id)
            print(url_html)

            
            if datatype_prefix == 'precip':
                url_img = '{:s}/grafy/sr{:s}_en.gif'.format(base_url, id)
            else:
                url_img = '{:s}/grafy/{:s}.gif'.format(base_url, id)
            print(url_img)
            img_response = get(url_img)
            if img_response.status_code == 200:
                img_dir = os.path.join(dst_dir, datatype_prefix, agency, os.path.splitext(os.path.basename(url_img))[0])
                if not os.path.exists(img_dir):
                    os.makedirs(img_dir)
                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.gif')
                img_filename = os.path.basename(url_img).replace('.gif', utc_timestamp_text)

                img_path = os.path.join(img_dir, img_filename)
                print(img_path)
                with open(img_path, 'wb') as f:
                    f.write(img_response.content)
                    n_charts += 1

                # also save the HTML
                html_path = img_path.replace('.gif', '.htm')
                html_response = get(url_html)
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
    return n_charts
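
# A minimal driver sketch, mirroring the example call in the docstring above
# (destination directory, base_url and subpages taken from that example).
if __name__ == '__main__':
    n = fetch_pmo_charts(dst_dir='/home/jiri/meteodata',
                         agency='pmo',
                         base_url='http://www.pmo.cz/portal/srazky/en/',
                         subpages=['prehled_tab_1_chp.htm',
                                   'prehled_tab_2_chp.htm',
                                   'prehled_tab_3_chp.htm'],
                         datatype_prefix='precip')
    print('{} charts downloaded'.format(n))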
Exemplo n.º 44
0
from requests_html import HTMLSession
import time
import sys

session = HTMLSession()

pages = 1

nxt = "https://old.reddit.com/r/ShrugLifeSyndicate/"

follow_posts = True

while pages <= 10:
    r = session.get(nxt)

    titles = r.html.find(".title")

    for t in titles:
        title = t.text
        url = t.attrs.get("href")
        if url:
            print(title)

            if follow_posts:
                post = session.get("https://old.reddit.com/" + url)
                body = post.html.find(".usertext-body")
                # the first usertext-body is the sidebar
                if len(body) > 1:
                    print(body[1].text)

    # advance to the next listing page; old.reddit exposes it via a
    # ".next-button" link (assumed markup) -- without this the loop would
    # refetch the same page forever
    next_button = r.html.find(".next-button a", first=True)
    if next_button is None:
        break
    nxt = next_button.attrs["href"]
    pages += 1
Exemplo n.º 45
0
from requests_html import HTMLSession
session = HTMLSession()

r = session.get('https://reddit.com') 

# iterating an HTML object follows "next" links, yielding one page per iteration
# (requests_html's built-in pagination support)
for html in r.html:
    print(html)

Exemplo n.º 46
0
    def handle(self, *args, **options):
        print('Start')
        Enterprises.objects.all().delete()
        session = HTMLSession()
        resp = session.get('http://west-info.ua/katalog-predpriyatij/')
        a = 1
        start_urls = []
        while True:
            b = 1
            try:
                links = resp.html.xpath(
                    f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/a/text()'
                )
                print('Category: {}'.format(links[0]))
                while True:
                    try:
                        link = resp.html.xpath(
                            f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/ul/li[{b}]/a'
                        )
                        print('Subcategory -- {}'.format(link[0]))
                        url = str(link[0]).split("href=")[1].rstrip(
                            '\'>').lstrip("\'")
                        dom = "http://west-info.ua"
                        full_url = f"{dom}{url}"
                        cat = resp.html.xpath(
                            f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/a/text()'
                        )
                        pod = resp.html.xpath(
                            f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/ul/li[{b}]/a/text()'
                        )
                        page = 1
                        page_one = 1
                        while True:
                            try:
                                max_page = []
                                i = 20
                                while i >= 0:
                                    full_url_page = f"{full_url}?page={page_one}"
                                    session = HTMLSession()
                                    resp = session.get(full_url_page)
                                    tit = resp.html.xpath(
                                        '/html/body/main/div/section/div/ul/li[5]/a/text()'
                                    )
                                    max_page.append(tit)
                                    i -= 1
                                    page_one += 1
                                max_page = int(str(max_page[-1]).strip("[']"))
                                if page <= max_page:
                                    full_url_page = f"{full_url}?page={page}"
                                    session = HTMLSession()
                                    resp = session.get(full_url_page)
                                    print(full_url_page)
                                    try:
                                        code = resp.text
                                        company_url = re.findall(
                                            r'<a href="/company/.+', code)
                                        for i in company_url:
                                            url = i.lstrip(
                                                '<a href="').rstrip()
                                            url = url.split("/")
                                            url = f'{dom}/{url[1]}/{url[2]}/'
                                            print(url)
                                            try:
                                                session = HTMLSession()
                                                resp = session.get(url)
                                                code = resp.text
                                                company_name = re.findall(
                                                    r'<h1 class="main-ttl"><span>.+',
                                                    code)
                                                company_name = company_name[
                                                    0].split('span')
                                                company_name = company_name[1]
                                                company_name = company_name.strip(
                                                    '></')
                                                print('Name: ', company_name)
                                                city = re.findall(
                                                    r'<b>Город:</b>.+', code)
                                                city = city[0].split('<')
                                                city = city[2]
                                                city = city.split('>')
                                                city = city[1]
                                                print('City: ', city)
                                                phone = re.findall(
                                                    r'<b>Телефон</b> .+', code)
                                                phone = phone[0].split(
                                                    '<b>Телефон</b> ')
                                                phone = phone[1]
                                                phone = phone.split('</')
                                                phone = phone[0]
                                                print('Phones: ', phone)
                                                content = re.findall(
                                                    r'<p>.+', code)
                                                content = content[0]
                                                content = content.split('p>')
                                                content = content[1]
                                                content = content.split('<')
                                                content = content[0]
                                                print('Content: ', content)
                                                address = re.findall(
                                                    r'Адрес:</b> .+', code)
                                                address = address[0].split(
                                                    'Адрес:</b> ')
                                                address = address[1]
                                                address = address.split('</')
                                                address = address[0]
                                                print('Address: ', address)
                                                image = re.findall(
                                                    r'/admin/uploads/products/images/.+',
                                                    code)
                                                image = image[0]
                                                image = image.split('"')
                                                image = image[0]
                                                image = f'{dom}{image}'

                                                print('Image: ', image)
                                                subcat = Subcategory.objects.all(
                                                )
                                                firm = Enterprises()
                                                firm.name = company_name
                                                firm.content = content
                                                firm.image = image
                                                firm.phone = phone
                                                firm.address = address
                                                for i in subcat:
                                                    if i.name == pod[0]:
                                                        firm.subcategory = i
                                                # firm.category = cat
                                                firm.save()
                                            except:
                                                break
                                    except:
                                        break
                                    page += 1
                                else:
                                    break
                            except:
                                break
                        b += 1
                    except:
                        break
                a += 1
            except:
                break
        print(start_urls)
Exemplo n.º 47
0
class Switter:
    def __init__(self, *, proxies: Optional[Dict[str, str]] = None):
        self._session = HTMLSession()
        self._session.headers.update({'User-Agent': _CHROME_USER_AGENT})

        if proxies:
            self._session.proxies.update(proxies)

        self._enable_legacy_site()

    def _enable_legacy_site(self):
        self._session.cookies.set('m5', 'off')

    def _profile_html(self, screen_name: str) -> HTML:
        url = f'https://twitter.com/{screen_name}'
        response = self._session.get(url)
        response.raise_for_status()
        return response.html

    def _search_json(self,
                     query: str,
                     max_position: Optional[int] = None) -> dict:
        url = 'https://twitter.com/i/search/timeline'
        response = self._session.get(url,
                                     params={
                                         'q': query,
                                         'f': 'tweets',
                                         'max_position': max_position or -1
                                     })
        response.raise_for_status()
        return response.json()

    def profile(self, screen_name: str) -> dict:
        document = self._profile_html(screen_name)
        data = json.loads(
            html.unescape(
                document.find('input.json-data[id=init-data][type=hidden]',
                              first=True).attrs['value']))
        user = data['profile_user']
        date_format = r'%a %b %d %H:%M:%S %z %Y'

        return dict(
            id=user['id'],
            name=user['name'],
            screen_name=user['screen_name'],
            location=user['location'],
            website=user['url'],
            description=user['description'],
            created_at=datetime.datetime.strptime(user['created_at'],
                                                  date_format),
            following_count=user['friends_count'],
            followers_count=user['followers_count'],
            favorites_count=user['favourites_count'],
            tweets_count=user['statuses_count'],
            private=user['protected'],
        )

    def followers(self, screen_name: str) -> Iterable[str]:
        cursor: Optional[int] = INITIAL_CURSOR
        while cursor is not None:
            screen_names, cursor = self.followers_page(screen_name, cursor)
            yield from screen_names

    def followers_page(
            self,
            screen_name: str,
            cursor: int = INITIAL_CURSOR) -> Tuple[List[str], Optional[int]]:
        response = self._session.get(
            f'https://mobile.twitter.com/{screen_name}/followers',
            params={'cursor': cursor} if cursor != INITIAL_CURSOR else None,
        )
        response.raise_for_status()

        document = response.html

        screen_names = _parse_followers_screen_names(document)
        next_cursor = _parse_followers_cursor(document)

        return screen_names, next_cursor

    def search(self, query: str, *, limit=20) -> Iterable[dict]:
        assert limit > 0

        count = 0
        position = -1

        while True:
            data = self._search_json(query, max_position=position)
            html = HTML(html=data['items_html'])
            tweets = _extract_tweets(html)

            yield from map(_parse_tweet, tweets[:limit - count])
            count += len(tweets)

            if not data['has_more_items'] or count >= limit:
                break

            position = data['min_position']
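
# A minimal usage sketch, assuming the Switter class above is importable and the
# legacy twitter.com endpoints it targets still respond.
if __name__ == '__main__':
    sw = Switter()

    profile = sw.profile('jack')
    print(profile['name'], profile['followers_count'])

    # first page of followers plus the cursor for the next page
    names, next_cursor = sw.followers_page('jack')
    print(names, next_cursor)

    # stream at most five tweets matching the query
    for tweet in sw.search('requests_html', limit=5):
        print(tweet)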
Exemplo n.º 48
0
                                 9:streaminfo.find('target="') - 2])

display.stop()

print("\nOpening top available stream...")

streamopen = False
for s in streamurls:
    if "buffstreamz.com" in s:
        os.system("xdg-open " + s + " &> /dev/null")
        streamopen = True
    elif "ripple.is" in s:
        os.system("xdg-open " + s + " &> /dev/null")
        streamopen = True
    elif "nbastreams.xyz" in s:
        for line in HTMLSession().get(s).text.split():
            if 'src="http://crackstreams.ga/nba/' in line:
                for pline in (HTMLSession().get(
                        line[line.find('src="http://crackstreams.ga/nba/') +
                             5:-1]).text.split()):
                    if '"http://' in pline:
                        os.system("nohup mpv " + pline[1:-2] + "&> /dev/null")
                        exit()

if not streamopen:
    os.system("xdg-open " + streamurls[0] + " &> /dev/null")

if input("Does this stream work? (y/n): ").lower() == "n":
    with open("streams.txt", "w+") as f:
        for stream in streamurls:
            f.write(stream + "\n")
Exemplo n.º 49
0
class XHSpider(Process):
    def __init__(self, url):
        # invoke the parent Process class's __init__
        super(XHSpider, self).__init__()
        self.url = url

        self.session = HTMLSession()
        self.headers = {
            'Host':'news.daxues.cn',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        self.path = "D:/Photo/"
        self.check_file_path(self.path)

    def check_file_path(self, path):
        '''
        create the directory if it does not exist
        '''
        if not os.path.exists(path):
            os.makedirs(path)

    def run(self):
        self.parse_page()

    def send_request(self, url):
        '''
        send a request, retrying on errors
        '''
        # retry up to 3 times if the request fails
        i = 0
        while i < 3:
            try:
                print('requesting url: ', url)
                # the page is UTF-8 encoded
                return self.session.get(url, headers=self.headers).html
            except Exception as e:
                print('send_request error: ', str(e))
                i += 1

    def parse_page(self):
        '''
        parse the page source and extract image links with requests-html
        '''
        html = self.send_request(self.url)
        imgs = html.find('dl a.p img')
        for img in imgs:
            href = img.attrs['src']
            alt = img.attrs['alt']
            self.save_image('http://news.daxues.cn'+href, alt)


    def save_image(self, url, name):
        '''
        save image
        '''
        content = self.session.get(url, headers=self.headers).content
        with open(self.path + name + '.jpg', 'wb') as f:
            f.write(content)

    def parse(self, url):
        self.url = url
        self.parse_page()
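
# A minimal driver sketch, assuming the XHSpider class above is importable; the
# listing URL below is an illustrative placeholder for a news.daxues.cn gallery page.
if __name__ == '__main__':
    urls = ['http://news.daxues.cn/xiaohua/']  # hypothetical listing page
    spiders = [XHSpider(url) for url in urls]
    for spider in spiders:
        spider.start()  # run() executes parse_page() in a separate process
    for spider in spiders:
        spider.join()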
Exemplo n.º 50
0
    def __init__(self, **kw):
        self.session = HTMLSession()
Exemplo n.º 51
0
File: jobs.py Project: thp/urlwatch
    def retrieve(self, job_state):
        from requests_html import HTMLSession
        session = HTMLSession()
        response = session.get(self.navigate)
        return response.html.html
Exemplo n.º 52
0
from requests_html import HTMLSession
from bs4 import BeautifulSoup

session = HTMLSession()
resp = session.get("https://www.desales.edu/directory")
resp.html.render() # forces js to run

soup = BeautifulSoup(resp.html.html, "lxml")

emails = []

for td in soup.find_all('td'):
    if "@desales.edu" in td.text:
        emails.append(td.text)
    
print(emails)
Exemplo n.º 53
0
import random
from time import sleep
from pprint import pprint
from requests_html import HTMLSession

keywords = ('buy essays online', 'buy essay', 'write my essay',
            'write history essay')

session = HTMLSession()

SERP = {}

for key in keywords:
    print(f'Send request to Google: [{key}]')
    resp = session.get(f'https://www.google.com/search?q={key}&num=100&hl=en')
    links = resp.html.xpath('//div[@class="r"]/a[1]/@href')
    SERP[key] = [x.split('/')[2] for x in links if 'http' in x]
    sleep_seconds = random.randint(1, 10)
    print(f'Sleep: {sleep_seconds}')
    sleep(sleep_seconds)

pprint(SERP)

# Write your code here...

domains = [set(l) for l in SERP.values()]

all_domains = sorted(set.union(*domains))
int_domains = sorted(set.intersection(*domains))

print(f'Unique and sorted domains:\n {all_domains}')
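
# Follow-up sketch (an addition, not in the original): the intersection computed
# above gives the domains that rank for every keyword, so print it as well.
print(f'Domains ranking for all keywords:\n {int_domains}')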
Exemplo n.º 54
0
from requests_html import HTMLSession
import pandas as pd
lista = []
#url='https://www.coppel.com/refrigeradores-y-congeladores'
s = HTMLSession()


def request(url):
    res = s.get(url)
    res.html.render(sleep=1)

    print(res.status_code)
    #print(res.html.xpath('//*[@id="searchBasedNavigation_widget_6_2303"]/div[1]/div[2]', first= True))
    return res.html.xpath(
        '//*[@id="searchBasedNavigation_widget_6_2303"]/div[1]/div[2]',
        first=True)


def parse(products):
    for product in products.absolute_links:
        try:
            res = s.get(product)
            name = res.html.find('div.top.namePartPriceContainer.clearfix',
                                 first=True).text
        except Exception:
            print("Product not found!!")
            continue

        if res.html.find('div.p_oferta'):
            price = res.html.find('div.tam_normal', first=True).text.replace(
                "de contado", "").replace('&nbsp;', '')
            oferta = "Producto en oferta"
Exemplo n.º 55
0
import csv

from requests_html import HTMLSession

session = HTMLSession()

URL = "http://vbpl.vn/TW/Pages/vanban.aspx?fromyear=01/01/2011&toyear=31/12/2020&dvid=13&Page="
MAX_PAGES = 875

with open("list.csv", mode="w", encoding="utf-8-sig", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(
        ["sokyhieu", "trichyeu", "ngaybanhanh", "ngayhieuluc", "word"])

    for page in range(1, MAX_PAGES + 1):
        try:
            r = session.get(f"{URL}{page}")
        except Exception as e:
            r = None

        if r:
            items = r.html.find(".item")
            for item in items:
                try:
                    title = item.find(".title", first=True).text
                    description = item.find(".des", first=True).text
                    right_column = item.find(".green")
                    publish_date = right_column[0].text[-10:]
                    valid_date = right_column[1].text[-10:]
                    doc_file = item.find(".fileAttack a", first=True)
Exemplo n.º 56
0
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import pandas as pd
from requests_html import HTMLSession
import time
from openpyxl import load_workbook

session = HTMLSession()
lst2 = []

url2 = 'http://www.wsj.com/mdc/public/page/2_3062-shtnyse_0_9-listing.html'
r2 = session.get(url2)
#print(r2)
hd = r2.html.find('.colhead')

for x in range(len(hd)):
    if '\n' in hd[x].text:
        s = hd[x].text.split("\n")
        lst2.append(' '.join(s))
    else:
        lst2.append(hd[x].text)

time.sleep(.5)

headers = lst2
exchange = ['shtnyse_', 'shtnasdaq_', 'shtamex_']
sheet = ['NYSE', 'NASDAQ', 'AMEX']
Exemplo n.º 57
0
#!/usr/bin/env python
# coding=utf-8
# author: zengyuetian
# Fetch Lianjia district information


from requests_html import HTMLSession

if __name__ == '__main__':
    session = HTMLSession()
    # fetch the Shanghai Lianjia residential-community (xiaoqu) listing page
    r = session.get('https://sh.lianjia.com/xiaoqu/')
    # get the list of Shanghai district links
    elements = r.html.xpath('/html/body/div[3]/div[1]/dl[2]/dd/div/div/a')
    # district name lists: pinyin slugs and Chinese names
    en_names = list()
    ch_names = list()

    # each element's HTML looks like <a href="/xiaoqu/pudong/" title="上海浦东小区二手房 ">浦东</a>
    for element in elements:
        for link in element.absolute_links:  # iterate over the set of absolute links
            en_names.append(link.split('/')[-2])
            ch_names.append(element.text)

    # print the pinyin and Chinese district name lists
    for index, name in enumerate(en_names):
        print(name, ch_names[index])
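
    # A small follow-up sketch (an addition, not in the original): pair the
    # pinyin slugs with the Chinese names for convenient lookup downstream.
    districts = dict(zip(en_names, ch_names))
    print(districts.get('pudong'))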

    """
        pudong 浦东
        minhang 闵行
Exemplo n.º 58
0
    def __init__(self):
        self.session = HTMLSession()
        self.news_data = self.session.get(self.source_url)