Example #1
def get_divisions():
    """Scrape the list of divisions and major groups from the OSHA website
    Divisions are the broadest grouping of SIC codes provided by OSHA
    Major groups are the second broadest grouping of SIC codes provided by OSHA
    """

    # Read site
    soup = get_soup(config.OSHA_base_url + 'sic_manual.html')

    # Find content
    container = soup.select('div#maincontain')[0]
    master_list = container.find('div').find('ol')
    all_links = master_list.find_all('a')

    # Store cleaned descriptions from the anchor (<a>) elements
    divisions = []
    for i, a in enumerate(all_links):

        # Store the full description provided by the site and keep the associated link
        full_desc = str(a.contents[0]).strip().encode("utf-8")
        link = a.get('href').encode("utf-8")

        # Get the description of the parent group
        if (i > 0) & (clean_desc(full_desc)[1] == 'Major Group'):
            parent_desc = get_parent(divisions, i, 'Major Group', 'Division')
        else:
            parent_desc = str(None)

        # Add to running list of named tuples
        divisions.append(ind_group(full_desc, parent_desc, link))

    return divisions
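
Every example on this page calls a get_soup helper defined elsewhere in its project. A minimal sketch of such a helper, assuming it simply fetches the URL and parses the response with BeautifulSoup (the real helpers may add headers, retries, or caching):

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    """Fetch a URL and parse the HTML body into a BeautifulSoup tree."""
    response = requests.get(url)
    response.raise_for_status()  # surface HTTP errors early
    return BeautifulSoup(response.text, 'html.parser')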
Example #2
File: main.py Project: letianccc/scratch
from time import sleep

def explore_book(allbook, url, conn, cursor):
    sleep(1)
    soup = get_soup(url)
    book = get_book(soup, url)
    print(book.name, book.score, book.url)
    insert_explored_book(book, conn, cursor)

    if is_target_book(allbook, book):
        update_allbook(allbook, book, conn, cursor)

    urls = urls_for_more_books(soup)
    for next_url in urls:
        if not_explored(next_url, cursor):
            explore_book(allbook, next_url, conn, cursor)
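
explore_book recurses once per discovered page, so a long chain of "more books" links can exceed Python's default recursion limit. An equivalent iterative sketch using an explicit stack, assuming the same project helpers (get_book, insert_explored_book, and so on):

def explore_books(allbook, start_url, conn, cursor):
    # Same traversal as explore_book, but with an explicit stack
    # instead of recursion
    stack = [start_url]
    while stack:
        url = stack.pop()
        if not not_explored(url, cursor):  # skip already-visited pages
            continue
        sleep(1)  # throttle requests
        soup = get_soup(url)
        book = get_book(soup, url)
        insert_explored_book(book, conn, cursor)
        if is_target_book(allbook, book):
            update_allbook(allbook, book, conn, cursor)
        stack.extend(urls_for_more_books(soup))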
Example #3
def get_major(url_ext):
    """Scrape the list of major groups, industry groups and SIC four-digit SIC codes
    from the OSHA website
    Major groups are the second broadest grouping of SIC codes provided by OSHA
    Industry groups are the third broadest grouping (least granular) of SIC
    codes provided by OSHA
    """

    # Read site
    soup = get_soup(config.OSHA_base_url + url_ext)

    # Isolate relevant content
    container = soup.select('div#maincontain')[0]
    groups = container.find_all(['strong', 'li'])
    major_desc = str(container.find_all('h2')[0].contents[0])

    # Store cleaned descriptions, from strong and li elements
    majors = []
    for i, g in enumerate(groups):

        # Get description of SIC and industry groups
        if g.name == 'strong':
            # Get industry group descriptions
            full_desc = g.contents[0].strip().encode("utf-8")
            link = None
        elif g.name == 'li':
            # Get four-digit SIC code descriptions
            full_desc = 'SIC4 ' + str(g.contents[0]).strip() + \
                ': ' + str(g.contents[1].contents[0]).strip()
            link = g.contents[1].get('href').encode("utf-8")
        else:
            # Otherwise raise a value error
            raise ValueError('Unexpected element type: ' + g.name)

        # Get the description of the parent group
        if (i > 0) & (clean_desc(full_desc)[1] == 'SIC4'):
            parent_desc = get_parent(majors, i, 'SIC4', 'Industry Group')
        else:
            parent_desc = major_desc

        # Add to running list of named tuples
        majors.append(ind_group(full_desc, parent_desc, link))

    return majors
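
get_divisions and get_major are natural companions: the links collected by the first point at the per-group pages parsed by the second. A hypothetical driver along these lines (the field name 'link' on the ind_group named tuple, and the assumption that every collected link resolves to a page get_major can parse, are guesses):

def scrape_all_sic():
    # Hypothetical driver: follow every link collected by get_divisions
    all_groups = []
    for group in get_divisions():
        if group.link:  # assumes ind_group stores the href in a 'link' field
            all_groups.extend(get_major(group.link))
    return all_groups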
Example #4
def get_youtube_info(url):
    if 'channel' in url:
        tag = 'channel'
    elif 'playlist' in url:
        tag = 'playlist'
    elif 'user' in url:
        tag = 'user'
    else:
        return {}
    # Playlist IDs follow 'list='; channel and user IDs are the last path segment
    sep = '=' if tag == 'playlist' else '/'
    id_ = url.split(sep)[-1]
    # Build the corresponding RSS feed URL
    prefix = 'https://www.youtube.com/feeds/videos.xml?'
    postfix = '_id=' if tag != 'user' else '='
    feed_url = prefix + tag + postfix + id_
    soup = get_soup(feed_url)
    name = soup.find('title').text
    return {
        'name': name,
        'tag': tag,
        'rss': feed_url
    }
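
For a playlist URL, for example, the ID is everything after the last '=' and the feed address uses the playlist_id query parameter. A quick usage sketch (the playlist ID is a made-up placeholder):

info = get_youtube_info('https://www.youtube.com/playlist?list=PLxxxxxxxx')
# info == {'name': <the feed's <title> text>,
#          'tag': 'playlist',
#          'rss': 'https://www.youtube.com/feeds/videos.xml?playlist_id=PLxxxxxxxx'}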
Example #5
import warnings

def get_sic_sec():
    """Scrape SIC codes from SEC website
    """

    # Setup
    soup = get_soup(config.SEC_base_url)
    table = soup.find_all('table')[3]

    # Convert HTML to nested list
    data = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        # Collapse double spaces left over from the HTML formatting
        cols = [ele.text.strip().replace('  ', ' ') for ele in cols]
        if len(cols) > 1:
            data.append([ele.encode('utf-8') for ele in cols if ele])

    # Replace the scraped header row with the expected column names,
    # warning if the site's columns have changed
    if data[0] != config.SEC_expected_columns:
        warnings.warn('Column names have changed at URL ' +
                      config.SEC_base_url)
    data[0] = config.SEC_columns

    return data
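
get_sic_sec returns a header row followed by data rows whose cells are UTF-8 encoded bytes. A hypothetical helper that decodes the cells and dumps the table to a CSV file:

import csv

def write_sic_csv(path):
    # Decode each cell where needed, then write all rows at once
    rows = [[c.decode('utf-8') if isinstance(c, bytes) else c for c in row]
            for row in get_sic_sec()]
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)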
Example #6
import soup as s

url = "https://www.nytimes.com/"

if __name__ == "__main__":
    print(
        "Note:\n"
        "\tThe HTML is now quite irregular, "
        "so there is little point in checking whether this is 100% accurate.\n"
        "\tThe structure and tags will be different in a couple of years anyway.\n\n"
    )

    soup = s.get_soup(url)

    print(*s.get_all_tags(soup, 'span'), sep="\n")
    print(*s.get_all_tags(soup, 'h2'), sep="\n")
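
The soup module's get_all_tags helper is not shown here; judging from how its result is unpacked into print, it likely returns one entry per matching tag. A minimal sketch under that assumption:

def get_all_tags(soup, tag_name):
    # Assumed behavior: the stripped text of every matching element
    return [tag.get_text(strip=True) for tag in soup.find_all(tag_name)]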
Example #7
def __init__(self, url, last_date, item_name='item'):
    # Fetch and parse the feed once, up front
    self.soup = get_soup(url)
    self.item = item_name
    self.last_date = last_date
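
This constructor fragment parses a feed once and stores an item tag name and a cutoff date. A hypothetical companion method such a class might pair with it (the method name and behavior are guesses):

def items(self):
    # Hypothetical helper: every element matching the configured item tag
    return self.soup.find_all(self.item)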