Example No. 1
def get_live_log_map(search, url=None):
    full_search = ' '.join(search)
    if url:
        soup = get_soup(url)
    else:
        last_name = format_live_search(search)
        soup = get_soup(espn_search_url.format(search=last_name))
    name_tag = soup.find('meta', attrs={'property': 'og:title'})
    if name_tag:
        try:
            name = name_tag.get('content').replace(' Stats, News, Bio | ESPN', '')
            is_playing = soup.find('h3', class_='Card__Header__Title Card__Header__Title--no-theme', text='Current Game')
            just_played = soup.find('h3', class_='Card__Header__Title Card__Header__Title--no-theme', text='Previous Game')
            has_stats = soup.findChildren('div', class_='StatBlockInner ph2 flex-expand')
            if (is_playing or just_played) and has_stats:
                log_map = {}
                game_summary = soup.findChild('a', attrs={'title': 'Game Summary'})
                stats_table = game_summary.find_next('tbody', class_='Table__TBODY')
                stats = [row.text for row in stats_table.findChildren(lambda tag: tag.name == 'td')]
                log_map['mp'] = stats[2]
                log_map['fg_pct'] = stats[3]
                log_map['tp_pct'] = stats[4]
                log_map['ft_pct'] = stats[5]
                log_map['trb'] = int(float(stats[6]))
                log_map['ast'] = int(float(stats[7]))
                log_map['blk'] = int(float(stats[8]))
                log_map['stl'] = int(float(stats[9]))
                log_map['pf'] = int(float(stats[10]))
                log_map['tov'] = int(float(stats[11]))
                log_map['pts'] = int(float(stats[12]))
                log_map['pm'] = has_stats[-1].text
                log_map['name'] = name
                return just_played is not None, log_map
            else:
                raise NoResultsError(f"Either {name} isn't currently playing or ESPN's site is lying to me")
        except Exception as ex:
            raise ex
    else:
        results_table = soup.find('div', attrs={'id': 'my-players-table'}).find_next('table')
        col_header = results_table.findChild('tr', class_='colhead')
        if col_header:
            player_results = results_table.findChildren(
                lambda tag: tag.name == 'tr'
                and not set(tag.get('class') or []) & {'stathead', 'colhead'})  # bs4 returns class as a list
            result_map = {}
            for result in player_results:
                a = result.find_next('a')
                name = a.text.split(', ')
                name = f'{name[1]} {name[0]}'
                match = SequenceMatcher(None, full_search, name).ratio()
                result_map[a.get('href')] = match
            player_href = sorted(result_map, key=result_map.get, reverse=True)[0]
            return get_live_log_map(search, player_href)
        else:
            raise NoResultsError(f"No results for '{full_search}'")
Example No. 2
def get_bill_pages(scraper, url=None, doc_types=None):
    """Return a sequence of tuples by retrieving all the documents described in the given url (representing
        a specific GA and session).  Optionally filter the sequence to only the given document types ('house bill',
        'senate bill', etc.).  Each tuple returned will be in the form:
            (bill_id, short_name, status_url)
    """
    if url is None:
        url = legislation_url()
    s = get_soup(scraper, url)
    links = s("a", { "href": lambda x: x is not None and x.find("grplist.asp") != -1 })
    links = map(lambda x: x['href'], links)
    d = {}
    for link in links:
        types = re.findall("DocTypeID=(.+?)&",link)
        for t in types:
            d.setdefault(t,[]).append(urljoin(url,link))

    pages = []
    if not doc_types:
        doc_types = ['HB','SB'] # sane default
    for doc_type in doc_types:
        if doc_type in d:  # dict.has_key() no longer exists in Python 3
            simplified_url = min_max(d[doc_type])
            pages.extend(extract_bill_links(scraper, simplified_url))

    return pages
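A small, hedged sketch of how the returned tuples might be consumed, assuming a scraper instance compatible with get_soup above:

pages = get_bill_pages(scraper, doc_types=['HB'])  # 'HB' comes from the function's own default list
for bill_id, short_name, status_url in pages:
    print(bill_id, short_name, status_url)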
Example No. 3
def extract_vote_pdf_links(scraper, url, chamber_filter=None):
    """Given a URL to a "votehistory.asp" page, return a sequence of tuples, each of which 
       has the form (chamber,label,url)
       
       It's expected that the URLs are for PDF files.
    """
    l = []
    s = get_soup(scraper, url)
    if s.find(text="No vote detail available for the selected legislation."):
        return []
    tables = s("table")
    vote_table = tables[6]
    rows = vote_table("tr")
    rows = rows[1:] # lose header
    for row in rows:
        tds = row("td")
        if len(tds) > 1:
            c2 = tds[1]
            chamber = c2(text=True)[0]
            links = row("a")
            if links:
                link = links[0]
                href = urljoin(url, link['href'])
                label = link(text=True)[0]
                if (not chamber_filter) or chamber_filter.lower() == chamber.lower():
                    l.append((chamber, label, href))
    return l
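For completeness, a sketch of consuming the (chamber, label, url) tuples; the vote-history URL and chamber value are placeholders:

for chamber, label, pdf_url in extract_vote_pdf_links(scraper, votehistory_url, chamber_filter='House'):
    print(chamber, label, pdf_url)  # pdf_url is expected to point at a PDF, per the docstring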
Example No. 5
def get_all_nfl_teams(year):
    """
    Get all NFL teams and links to their season stats for a given year.
    """

    url = BASE_URL + '/years/{}/'.format(year)
    soup = get_soup(url)

    table = soup.find('table', attrs={'id': 'team_stats'})
    rows = table.find_all('tr')

    team_list = []

    for row in rows:
        team = row.find('td', attrs={'data-stat': 'team'})

        if not team:
            continue

        team_link = team.find('a')

        if not team_link:
            continue

        team_link = team_link.get('href')
        team_list.append(team_link)

    return team_list
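The hrefs collected above are site-relative, so a caller would typically join them back onto BASE_URL; a sketch under that assumption:

for href in get_all_nfl_teams(2019):  # year is illustrative
    team_url = BASE_URL + href  # href is a relative team-season path
    team_soup = get_soup(team_url)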
Example No. 6
def get_player_page(search=None, url=None):
    soup = get_soup(url if url else search_url.format(search=urllib.parse.quote(search)))
    log_holder = soup.find('span', text="Game Logs")
    if log_holder:
        return soup
    elif soup.findChild('div', class_='search-results'):
        nba_players = soup.find('div', attrs={"id": "players"})
        if nba_players:
            results = nba_players.findChildren('div', class_='search-item')
            if len(results) == 1:
                href = nba_players.find_next('div', class_='search-item-url').text
                return get_player_page(url=bbref_url + href)
            else:
                result_map = {}
                for result in results:
                    a = result.find_next('div', class_='search-item-name').find_next('a')
                    name = letters.sub('', a.text)
                    match = SequenceMatcher(None, search, name).ratio()
                    result_map[a.get('href')] = match
                href = sorted(result_map, key=result_map.get, reverse=True)[0]
                return get_player_page(url=bbref_url + href)
        else:
            raise NoResultsError("No NBA results for %s" % search)
    else:
        raise NoResultsError("No results for %s" % search)
Example No. 7
def fetch_mimvp():
    """
    从http://proxy.mimvp.com/free.php 抓免费代理
    """
    querys = [
        "proxy=in_tp",
        "proxy=in_hp",
        "proxy=in_tp&sort=p_transfer",
        "proxy=in_hp&sort=p_transfer",
        "proxy=in_tp&sort=p_ping",
        "proxy=in_hp&sort=p_ping",
    ]
    proxies = []
    try:
        for query in querys:
            url = "http://proxy.mimvp.com/free.php?%s" % (query)
            soup = get_soup(url)
            table = soup.find("div", attrs={"class": "free-list"}).table
            tds = table.tbody.find_all("td")
            for i in range(0, len(tds), 10):
                ip = tds[i + 1].text
                port = img2port(tds[i + 2].img["src"])
                protocal_types = tds[i + 3]["title"].split("/")
                response_time = tds[i + 7]["title"][:-1]
                transport_time = tds[i + 8]["title"][:-1]
                proxy = "%s:%s" % (ip, port)
                if port is not None:
                    proxies += _filter_proxy(float(response_time), proxy)
    except Exception as e:
        logger.warning("fail to fetch from mimvp: %s" % e)
    return proxies
Example No. 9
 def get_weather(self, province, city, spell):
     month_list = date_range(self.start_time, self.end_time)
     for month in month_list:
         url = self.history_url % (spell, month)
         print(url)
         weather_list = get_soup(url).find(name='div',
                                           id='content').find_all(name='tr')
         # remove the first element
         del (weather_list[0])
         for weather in weather_list:
             detail = weather.find_all(name='td')
             date = detail[0].find(
                 name='a').get('href').split('.')[0].split('/')[-1]
             date = get_all(date)
             state = detail[1].get_text()
             state = get_all(state)
             temperature = detail[2].get_text()
             temperature = get_all(temperature)
             wind = detail[3].get_text()
             wind = get_all(wind)
             print(province, city, date, state, temperature, wind)
             sql = 'INSERT INTO weather_list(weather_date, province, city, spell, state, temperature, wind) ' \
                   'values (%s, %s, %s, %s, %s, %s, %s)'
             params = [
                 date, province, city, spell, state, temperature, wind
             ]
             self.mysql.insert(sql=sql, params=params)
Example No. 10
def extract_bill_urls_from_group(scraper, chamber, url):
    """Given a url to a page grouping bills of a certain type in a certain session,
       return a sequence of all the URLs to the specific bill statuses from that page.
    """
    s = get_soup(scraper, url)
    bill_links = s("a", {"href": re.compile(".*BillStatus.*DocTypeID")})
    bill_links = map(lambda link: urljoin(url, link['href']), bill_links)
    return bill_links
Example No. 12
def scrape_cards_bridge_nl(url: str, browser=get_browser()) -> dict:
    winds_bridge_nl = 'NWES'
    soup = get_soup(browser=browser, url=url)
    hand_tags = soup.find_all('div',
                              class_='vierkant33procent spelverdeling_hand')
    return {
        wind: hand_tag.text.split('\n')[1:5]
        for wind, hand_tag in zip(winds_bridge_nl, hand_tags)
    }
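A sketch of the mapping this returns, assuming a deal-page URL from the same site (the URL below is a placeholder):

hands = scrape_cards_bridge_nl('https://example.org/spelverdeling.html')  # placeholder URL
for wind in 'NWES':  # same seat order as winds_bridge_nl above
    print(wind, hands[wind])  # four text lines taken from that hand's diagram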
Example No. 13
def get_recipes():
    soup = get_soup(BASE_URL + RECIPE_SUFFIX)
    tables = soup.find_all("table")    #16 tables
    attack = tables[11]
    parse_table(attack, "attack")
    magic = tables[12]
    parse_table(magic, "magic")
    action = tables[13]
    parse_table(action, "action")
Example No. 14
 def get_city(self):
     content = get_soup(self.base_url)
     province_list = content.find_all(name='table')[-1]
     province_list = province_list.find_all(name='td')
     for index, province in enumerate(province_list):
         href = province.find(name='a')
         province_name = href.get_text()
         content = get_soup(self.base_url + href.get('href'))
         content = content.find(name='div', id='content')
         city_list = content.find(name='table').find_all(name='td')
         for city in city_list:
             city_href = city.find(name='a')
             city = city_href.get_text()
             spell = city_href.get('href').split('.')[0].split('/')[-1]
             sql = 'INSERT INTO city_list(province, city, spell) values (%s, %s, %s)'
             params = [province_name, city, spell]
             print(params)
             self.mysql.insert(sql, params)
             self.get_weather(province_name, city, spell)
Example No. 15
def get_avg_log_table(search, last):
    player_soup = get_player_page(search)
    career_games = int(player_soup.find('h4', class_='poptip', attrs={'data-tip': 'Games'}).find_next('p').find_next('p').text)
    name_node = player_soup.find('h1', attrs={'itemprop': 'name'})
    name = name_node.text
    if last > career_games:
        raise ValueError(f'{name} has only played {career_games} career games')
    page_id = player_soup.find('link', attrs={'rel': 'canonical'}).get('href').split('/')[-1].split('.')[0]
    log_soup = get_soup(last_url.format(page_id=page_id, last=career_games - last + 1, career=career_games))
    table = log_soup.find('table', attrs={'id': 'pgl_basic_span'}).find('tbody')
    return name, table
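A usage sketch for the tbody returned above, assuming the usual Basketball-Reference markup where each cell carries a data-stat attribute ('pts' for points):

name, tbody = get_avg_log_table('stephen curry', 10)  # search string is illustrative
cells = [row.find('td', attrs={'data-stat': 'pts'}) for row in tbody.find_all('tr')]
points = [int(c.text) for c in cells if c and c.text.isdigit()]
if points:
    print(f"{name}: {sum(points) / len(points):.1f} ppg over the last {len(points)} games")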
Example No. 16
def get_stepbridge_tournament_overview_dataframe(stepbridge_user_url: str) -> pd.DataFrame:
    logged_in_browser = browser_login_stepbridge(util.get_browser())
    initial_soup = util.get_soup(browser=logged_in_browser,
                                 url=stepbridge_user_url)

    overview_page_urls = [stepbridge_user_url]
    overview_page_urls += get_other_page_urls_from_overview_page_stepbridge_my_results(initial_soup)

    result = get_all_tournament_overview_dataframe(browser=logged_in_browser,
                                                   tournament_result_overview_urls=overview_page_urls)
    return result
Example No. 17
def get_all_bill_urls(scraper, chamber, session, types=None):
    """Given a session number (e.g. '96' for the 2009-2010 GA session) and a chamber,
       return all bill URLs which can be identified as associated with the given session.
       At this time, Executive Orders and Joint Session Resolutions will never be returned.
    """
    session_url = BASE_LEGISLATION_URL % session[0:2]
    s = get_soup(scraper, session_url)
    groups = extract_bill_groups(s,session_url)
    special_sessions = s(text=re.compile(".*View Special Session.*"))
    if special_sessions:
        ss_url = urljoin(session_url,special_sessions[0].parent['href'])
        ss = get_soup(scraper, ss_url)
        groups.extend(extract_bill_groups(ss,ss_url))

    urls = []
    for g in groups:
        doctype = extract_doctype(g)
        if (types is None or doctype in types) and (chamber == chamber_for_doctype(doctype)):
            urls.extend(extract_bill_urls_from_group(scraper, chamber, g))

    return urls
Example No. 18
def fetch_stackoverflow():
    words = []
    for pageNo in range(1, 20):
        url = 'https://stackoverflow.com/tags?page=%d&tab=popular' % (pageNo)
        soup = get_soup(url)
        tags_list = soup.find('div', attrs={'id': 'tags_list'})
        trs = tags_list.table.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            for td in tds:
                words.append(td.a.text)
    return words
Example No. 19
def get_all_tournament_overview_dataframe(browser: mechanize.Browser,
                                          tournament_result_overview_urls: list) -> pd.DataFrame:
    result = None
    for url in tournament_result_overview_urls:
        page_soup = util.get_soup(browser=browser, url=url)
        df_tournament_results_single_page = get_tournament_overview_dataframe(page_soup)
        if result is None:
            result = df_tournament_results_single_page
        else:
            # DataFrame.append was removed in pandas 2.x; concatenate instead.
            result = pd.concat([result, df_tournament_results_single_page])
    result.reset_index(drop=True, inplace=True)
    return result
Example No. 20
def fetch_lagou():
    words = []
    url = 'https://www.lagou.com/'
    soup = get_soup(url)
    category_list = soup.find_all('div', attrs={'class': 'menu_sub dn'})
    for category in category_list:
        dls = category.find_all('dl')
        for dl in dls:
            names = dl.dd.find_all('a')
            for name in names:
                words.append(name.text)
    return words
Example No. 21
def get_all_bill_urls(scraper, chamber, session, types=None):
    """Given a session number (e.g. '96' for the 2009-2010 GA session) and a chamber,
       return all bill URLs which can be identified as associated with the given session.
       At this time, Executive Orders and Joint Session Resolutions will never be returned.
    """
    session_url = BASE_LEGISLATION_URL % session
    s = get_soup(scraper, session_url)
    groups = extract_bill_groups(s,session_url)
    special_sessions = s(text=re.compile(".*View Special Session.*"))
    if special_sessions:
        ss_url = urljoin(session_url,special_sessions[0].parent['href'])
        ss = get_soup(scraper, ss_url)
        groups.extend(extract_bill_groups(ss,ss_url))

    urls = []
    for g in groups:
        doctype = extract_doctype(g)
        if (types is None or doctype in types) and (chamber == chamber_for_doctype(doctype)):
            urls.extend(extract_bill_urls_from_group(scraper, chamber, g))
        
    return urls
Example No. 22
def fetch_zhipin():
    words = []
    url = 'http://www.zhipin.com/'
    soup = get_soup(url)
    job_menu = soup.find('div', attrs={'class': 'job-menu'})
    dls = job_menu.find_all('dl')
    for dl in dls:
        divs = dl.find_all('div', attrs={'class': 'text'})
        for div in divs:
            names = div.find_all('a')
            for name in names:
                words.append(name.text)
    return words
Example No. 23
def get_player_log_table(search):
    player_soup = get_player_page(search)
    log_holder = player_soup.find('span', text="Game Logs")
    name_node = player_soup.find('h1', attrs={'itemprop': 'name'})
    name = name_node.text
    game_log_link_list = log_holder.find_next('div').find('ul').findChildren('a')
    game_log_link = game_log_link_list.pop()
    if 'Playoffs' in game_log_link.text:
        game_log_link = game_log_link_list.pop()
    href = game_log_link.get('href')
    log_soup = get_soup(bbref_url + href)
    table = log_soup.find('table', attrs={'id': 'pgl_basic'}).find('tbody')
    return name, table
Example No. 24
def extract_bill_links(scraper, url):
    """Given a url to a page of BillStatus links (as expected from min_max),
       return a list of tuples of the form (id, title, url)
    """
    s = get_soup(scraper, url)
    links = s("a", { "href": lambda x: x is not None and x.find("BillStatus") != -1})
    l = []
    for link in links:
        text = link(text=True)[0].replace("\xa0", " ")  # normalize non-breaking spaces
        match = re.match(r"^(\S+)\s+(.+)$", text)
        if match:
            l.append((match.groups()[0], match.groups()[1], urljoin(url, link['href'])))
    return l
Example No. 25
def get_highlight_lowlight_map(highlight=True):
    top_soup = get_soup(top_url)
    table = top_soup.find('table', attrs={'id': 'stats'})
    if not table:
        return None
    else:
        rows = table.find('tbody').findChildren(
            lambda tag: tag.name == 'tr' and 'thead' not in (tag.get('class') or [])
            and tag.findChild(lambda child: child.name == 'td'
                              and child.get('data-stat') == 'mp'
                              and int(child.text.split(':')[0]) >= 25))
    if highlight:
        return index_row(rows[0])
    else:
        return index_row(rows[-1])
Example No. 27
def get_state_shapes():
    url = 'https://www.mccurley.org/svg/data/states.svg'

    soup = get_soup(url)

    state_tags = soup.find_all('g', attrs={'statename': True})

    full_coords = pd.DataFrame()

    for tag in state_tags:
        new_coords = get_coords(tag)

        full_coords = pd.concat([full_coords, new_coords])

    return full_coords
Example No. 28
def extract_versions(scraper, s):
    """Get the fulltext link from the page.
    visit it.
    get all links on that page that ref fulltext.asp
    skip the 'printer friendly' for the current page
    append '&print=true' to each of the links
    return a sequence of 2-tuples (name,link)
    """
    versions = []
    links = s("a", {"class": "legislinks", "href": re.compile(".*fulltext.asp.*")})
    if links:
        s = get_soup(scraper, urljoin(s.orig_url, links[0]['href']))
        links = s("a", {"href": re.compile(".*fulltext.asp.*"), "target": None}) # target is used for printer friendly, we'll skip that one.
        for link in links:
            versions.append((link.next, urljoin(s.orig_url,link['href'] + "&print=true")))
    return versions
Example No. 30
def get_components(url, componentType):
    components = []
    soup = get_soup(url)
    attack_table_rows = soup.find_all("table")[10].find_all("tr")
    row_iter = iter(attack_table_rows)
    next(row_iter)		#ignore header row

    for tr in row_iter:
        items_in_tr = len(tr.findChildren())
        if items_in_tr > 13 or items_in_tr < 4:	#ignore odd rows that get detected but aren't part of the chart
            continue
        components.append(parse_component_row(tr, componentType))
        
    print("found " + str(len(components)) + " " + componentType + "...")
    with open("../db/data/khbbs/components/KHBBS" + componentType + ".json", "w") as file_pointer:
        json.dump(components, file_pointer)
Example No. 31
def fetch_ip181():
    """
    http://www.ip181.com/
    """
    proxies = []
    try:
        url = "http://www.ip181.com/"
        soup = get_soup(url)
        table = soup.find("table")
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            tds = trs[i].find_all("td")
            ip = tds[0].text
            port = tds[1].text
            response_time = tds[4].text[:-2]
            proxy = "%s:%s" % (ip, port)
            proxies += _filter_proxy(float(response_time), proxy)
    except Exception as e:
        logger.warning("fail to fetch from ip181: %s" % e)
    return proxies
Example No. 32
def fetch_kxdaili(page):
    """
    从http://www.kxdaili.com抓取免费代理
    """
    proxies = []
    try:
        url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
        soup = get_soup(url)
        table_tag = soup.find("table", attrs={"class": "segment"})
        trs = table_tag.tbody.find_all("tr")
        for tr in trs:
            tds = tr.find_all("td")
            ip = tds[0].text
            port = tds[1].text
            latency = tds[4].text.split(" ")[0]
            proxy = "%s:%s" % (ip, port)
            proxies += _filter_proxy(float(latency), proxy)
    except Exception as e:
        logger.warning("fail to fetch from kxdaili: %s" % e)
    return proxies
Example No. 33
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for votes, which
       are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)
    bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url)
    actions = extract_actions(s)
    for chamber, action, date in actions:
        bill.add_action(chamber, action, date)  # kwargs are permitted if we have 'em.
    sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions])
    for sponsor_type, namelist in sponsor_dict.items():  # dict.iteritems() is Python 2 only
        for name in namelist:
            bill.add_sponsor(sponsor_type, name)
    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)
    return bill
Example No. 35
def fetch_xici():
    """
    http://www.xicidaili.com/nn/
    """
    proxies = []
    try:
        url = "http://www.xicidaili.com/wt/"
        soup = get_soup(url)
        table = soup.find("table", attrs={"id": "ip_list"})
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            tr = trs[i]
            tds = tr.find_all("td")
            ip = tds[1].text
            port = tds[2].text
            speed = tds[6].div["title"][:-1]
            latency = tds[7].div["title"][:-1]
            if float(speed) < 0.5 and float(latency) < 1.0:
                proxies.append("%s:%s" % (ip, port))
    except Exception as e:
        logger.warning("fail to fetch from xici: %s" % e)
    return proxies
Example No. 36
def fetch_ip002(page=1):
    """
    http://www.ip002.net/free.html
    """
    proxies = []
    try:
        url = "http://www.ip002.net/free_%d.html" % page
        soup = get_soup(url)
        table = soup.find(
            "table", attrs={"class": "table table-bordered table-hover"})
        trs = table.tbody.find_all("tr")
        for i in range(2, len(trs)):
            tr = trs[i]
            tds = tr.find_all("td")
            ip = tds[1].text
            port = tds[2].text
            response_time = tds[4].text.split("/")[0]
            proxy = "%s:%s" % (ip, port)
            proxies += _filter_proxy(float(response_time) / 1000.00, proxy)
    except Exception as e:
        logger.warning("failed to fetch ip002: %s" % e)
    return proxies
Example No. 37
def fetch_httpdaili():
    """
    http://www.httpdaili.com/mfdl/
    更新比较频繁
    """
    proxies = []
    try:
        url = "http://www.httpdaili.com/mfdl/"
        soup = get_soup(url)
        table = soup.find("div", attrs={"class": "kb-item-wrap11"}).table
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            try:
                tds = trs[i].find_all("td")
                ip = tds[0].text
                port = tds[1].text
                type = tds[2].text
                proxies.append("%s:%s" % (ip, port))
            except Exception:
                pass
    except Exception as e:
        logger.warning("fail to fetch from httpdaili: %s" % e)
    return proxies
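The proxy fetchers in these examples all return plain "ip:port" strings, so they combine naturally; a sketch assuming the fetch_* functions above live in the same module:

def fetch_all_free_proxies():
    proxies = []
    for fetch in (fetch_mimvp, fetch_ip181, fetch_xici, fetch_httpdaili):
        proxies += fetch()  # each fetcher already swallows its own errors and returns a list
    proxies += fetch_kxdaili(1)  # page-based fetchers take a page number
    proxies += fetch_ip002(1)
    return sorted(set(proxies))  # de-duplicate across sources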
Example No. 38
 def __init__(self, page_sub_url):
     self.soup = get_soup(page_sub_url)
Example No. 39
 def __init__(self, url):
     self.url = url
     print('article url: ' + url)
     self.soup = get_soup(url, is_sub=False)
Example No. 40
import bs4
import re
import util
import os

abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

out="out/Objetivos.html"

soup=util.get_soup("out/LFS201.html")
soup.title.string=soup.title.string+": Objetivos"

flds=soup.findAll("fieldset", attrs={'class': re.compile(r".*\bn2\b.*")})
for f in flds:
	if f.legend.get_text().strip().lower()=="objetivos de aprendizaje":
		f.legend.string=f.parent.h1.a.string
		f.div.p.extract()
	else:
		f.extract()

for h in soup.findAll("h1"):
	h.extract()

for div in soup.body.div.select(" > div"):
	div.unwrap()

h = str(soup)
with open(out, "wb") as file:
	file.write(h.encode('utf8'))
Example No. 41
 def get_max_page_index(self):
     soup = get_soup(vars.init_url[self.name])
     maxpage = soup.find('div', {'class' : 'btn-group btn-group-paging'}).findAll('a')[1]['href'].replace(vars.url_ending, '')
     self.max_page_index = int(maxpage[maxpage.index('index') + 5:]) + 1
Example No. 42
	i.attrs["class"]="item"
	i.append(a)

def get_lab(f,txt):
	a=soup.new_tag("a",  **{"href": "labs/"+f, "title":"Fichero original en: https://lms.360training.com/custom/12396/808239/"+f})
	a.string=txt
	return a

soup = util.get_tpt("LFS201","rec/lfs201.css")

fldB=None
divCp=None

hts=sorted(glob.glob('html/clean/*.html'))
for ht in hts:
	soup2 = util.get_soup(ht)
	t=soup2.title
	b=soup2.body

	if "_popup" in ht:
		n=3
	else:
		ca=int(cp.sub("\\1",ht))
		if ca>caB:
			n=1
			f=1
			caB=ca
		else:
			n=2
			f=f+1