Example #1
def get_divisions():
    """Scrape the list of divisions and major groups from the OSHA website
    Divisions are the broadest grouping of SIC codes provided by OSHA
    Major groups are the second broadest grouping of SIC codes provided by OSHA
    """

    # Read site
    soup = get_soup(config.OSHA_base_url + 'sic_manual.html')

    # Find content
    container = soup.select('div#maincontain')[0]
    master_list = container.find('div').find('ol')
    all_links = master_list.find_all('a')

    # Store cleaned descriptions from the anchor (<a>) elements
    divisions = []
    for i, a in enumerate(all_links):

        # Store the full description provided by the site and keep the associated link
        full_desc = str(a.contents[0]).strip().encode("utf-8")
        link = a.get('href').encode("utf-8")

        # Get the description of the parent group
        if (i > 0) & (clean_desc(full_desc)[1] == 'Major Group'):
            parent_desc = get_parent(divisions, i, 'Major Group', 'Division')
        else:
            parent_desc = str(None)

        # Add to running list of named tuples
        divisions.append(ind_group(full_desc, parent_desc, link))

    return divisions
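
Every example on this page calls a get_soup helper defined elsewhere in its project. A minimal sketch of such a helper, assuming it simply fetches the URL and parses the response with BeautifulSoup (the real helpers may add headers, retries, or caching):

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    """Fetch a URL and parse the HTML body into a BeautifulSoup tree."""
    response = requests.get(url)
    response.raise_for_status()  # surface HTTP errors early
    return BeautifulSoup(response.text, 'html.parser')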
Example #2
File: main.py Project: letianccc/scratch
from time import sleep

def explore_book(allbook, url, conn, cursor):
    sleep(1)
    soup = get_soup(url)
    book = get_book(soup, url)
    print(book.name, book.score, book.url)
    insert_explored_book(book, conn, cursor)

    if is_target_book(allbook, book):
        update_allbook(allbook, book, conn, cursor)

    urls = urls_for_more_books(soup)
    for next_url in urls:
        if not_explored(next_url, cursor):
            explore_book(allbook, next_url, conn, cursor)
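
explore_book recurses once per discovered page, so a long chain of "more books" links can exceed Python's default recursion limit. An equivalent iterative sketch using an explicit stack, assuming the same project helpers (get_book, insert_explored_book, and so on):

def explore_books(allbook, start_url, conn, cursor):
    # Same traversal as explore_book, but with an explicit stack
    # instead of recursion
    stack = [start_url]
    while stack:
        url = stack.pop()
        if not not_explored(url, cursor):  # skip already-visited pages
            continue
        sleep(1)  # throttle requests
        soup = get_soup(url)
        book = get_book(soup, url)
        insert_explored_book(book, conn, cursor)
        if is_target_book(allbook, book):
            update_allbook(allbook, book, conn, cursor)
        stack.extend(urls_for_more_books(soup))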
Example #3
def get_major(url_ext):
    """Scrape the list of major groups, industry groups and SIC four-digit SIC codes
    from the OSHA website
    Major groups are the second broadest grouping of SIC codes provided by OSHA
    Industry groups are the third broadest grouping (least granular) of SIC
    codes provided by OSHA
    """

    # Read site
    soup = get_soup(config.OSHA_base_url + url_ext)

    # Isolate relevant content
    container = soup.select('div#maincontain')[0]
    groups = container.find_all(['strong', 'li'])
    major_desc = str(container.find_all('h2')[0].contents[0])

    # Store cleaned descriptions, from strong and li elements
    majors = []
    for i, g in enumerate(groups):

        # Get description of SIC and industry groups
        if g.name == 'strong':
            # Get industry group descriptions
            full_desc = g.contents[0].strip().encode("utf-8")
            link = None
        elif g.name == 'li':
            # Get four-digit SIC code descriptions
            full_desc = 'SIC4 ' + str(g.contents[0]).strip() + \
                ': ' + str(g.contents[1].contents[0]).strip()
            link = g.contents[1].get('href').encode("utf-8")
        else:
            # Otherwise raise a value error
            raise ValueError('Unexpected element type: ' + g.name)

        # Get the description of the parent group
        if (i > 0) & (clean_desc(full_desc)[1] == 'SIC4'):
            parent_desc = get_parent(majors, i, 'SIC4', 'Industry Group')
        else:
            parent_desc = major_desc

        # Add to running list of named tuples
        majors.append(ind_group(full_desc, parent_desc, link))

    return majors
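
get_divisions and get_major are natural companions: the links collected by the first point at the per-group pages parsed by the second. A hypothetical driver along these lines (the field name 'link' on the ind_group named tuple, and the assumption that every collected link resolves to a page get_major can parse, are guesses):

def scrape_all_sic():
    # Hypothetical driver: follow every link collected by get_divisions
    all_groups = []
    for group in get_divisions():
        if group.link:  # assumes ind_group stores the href in a 'link' field
            all_groups.extend(get_major(group.link))
    return all_groups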
Example #4
def get_youtube_info(url):
    if 'channel' in url:
        tag = 'channel'
    elif 'playlist' in url:
        tag = 'playlist'
    elif 'user' in url:
        tag = 'user'
    else:
        return {}
    # Playlist IDs follow 'list='; channel and user IDs are the last path segment
    sep = '=' if tag == 'playlist' else '/'
    id_ = url.split(sep)[-1]
    # Build the corresponding RSS feed URL
    prefix = 'https://www.youtube.com/feeds/videos.xml?'
    postfix = '_id=' if tag != 'user' else '='
    feed_url = prefix + tag + postfix + id_
    soup = get_soup(feed_url)
    name = soup.find('title').text
    return {
        'name': name,
        'tag': tag,
        'rss': feed_url
    }
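
For a playlist URL, for example, the ID is everything after the last '=' and the feed address uses the playlist_id query parameter. A quick usage sketch (the playlist ID is a made-up placeholder):

info = get_youtube_info('https://www.youtube.com/playlist?list=PLxxxxxxxx')
# info == {'name': <the feed's <title> text>,
#          'tag': 'playlist',
#          'rss': 'https://www.youtube.com/feeds/videos.xml?playlist_id=PLxxxxxxxx'}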
Example #5
import warnings

def get_sic_sec():
    """Scrape SIC codes from SEC website
    """

    # Setup
    soup = get_soup(config.SEC_base_url)
    table = soup.find_all('table')[3]

    # Convert HTML to nested list
    data = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        # Collapse double spaces left over from the HTML formatting
        cols = [ele.text.strip().replace('  ', ' ') for ele in cols]
        if len(cols) > 1:
            data.append([ele.encode('utf-8') for ele in cols if ele])

    # Replace the scraped header row with the expected column names,
    # warning if the site's columns have changed
    if data[0] != config.SEC_expected_columns:
        warnings.warn('Column names have changed at URL ' +
                      config.SEC_base_url)
    data[0] = config.SEC_columns

    return data
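
get_sic_sec returns a header row followed by data rows whose cells are UTF-8 encoded bytes. A hypothetical helper that decodes the cells and dumps the table to a CSV file:

import csv

def write_sic_csv(path):
    # Decode each cell where needed, then write all rows at once
    rows = [[c.decode('utf-8') if isinstance(c, bytes) else c for c in row]
            for row in get_sic_sec()]
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)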
Example #6
import soup as s

url = "https://www.nytimes.com/"

if __name__ == "__main__":
    print(
        "Note:\n"
        "\tThe HTML is now quite irregular, "
        "so there is little point in checking whether this is 100% accurate.\n"
        "\tThe structure and tags will be different in a couple of years anyway.\n\n"
    )

    soup = s.get_soup(url)

    print(*s.get_all_tags(soup, 'span'), sep="\n")
    print(*s.get_all_tags(soup, 'h2'), sep="\n")
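
The soup module's get_all_tags helper is not shown here; judging from how its result is unpacked into print, it likely returns one entry per matching tag. A minimal sketch under that assumption:

def get_all_tags(soup, tag_name):
    # Assumed behavior: the stripped text of every matching element
    return [tag.get_text(strip=True) for tag in soup.find_all(tag_name)]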
Example #7
def __init__(self, url, last_date, item_name='item'):
    # Fetch and parse the feed once, up front
    self.soup = get_soup(url)
    self.item = item_name
    self.last_date = last_date
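
This constructor fragment parses a feed once and stores an item tag name and a cutoff date. A hypothetical companion method such a class might pair with it (the method name and behavior are guesses):

def items(self):
    # Hypothetical helper: every element matching the configured item tag
    return self.soup.find_all(self.item)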