def find_tag_list(url, soup_object, request_obj, limiting_d):
    '''
    Function finds the list of hyperlink tags from given URL and collects
    the list of URLs.

    Inputs:
        url (string): URL to find list of further URLs from
        soup_object (soup object): Soup object obtained from using
            Beautiful Soup on given URL
        request_obj (request object): Request object obtained from the
            functions given in PA to obtain request object from URL
        limiting_d (string): Limiting domain for the URLs

    Outputs:
        url_list (list): List containing all URLs that are able to be
            followed from the given URL.
    '''
    https_url = util.get_request_url(request_obj)
    ahref_tag_list = soup_object.find_all('a', href=True)
    url_list = []
    for tag in ahref_tag_list:
        this_url = tag['href']
        newest_url = util.convert_if_relative_url(https_url, this_url)
        if util.is_url_ok_to_follow(newest_url, limiting_d):
            url_list.append(newest_url)
    return url_list

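# --- Hedged usage sketch (not part of the original assignment code) ---
# Shows how find_tag_list might be driven: build the request and soup
# objects the function expects, then collect followable links.  Assumes the
# course-provided util module and bs4 are imported as elsewhere in this
# file; the starting URL and limiting domain below are made-up examples.
def example_find_tag_list(start_url="https://www.example.edu/catalog/index.html",
                          limiting_domain="example.edu"):
    request = util.get_request(start_url)
    if request is None:
        return []
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, "html5lib")
    return find_tag_list(start_url, soup, request, limiting_domain)
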
def queue_children_sites(starting_url, queue):
    '''
    Given a url and a queue, adds all children urls of the start point to
    the queue.

    Inputs:
        starting_url -- string that corresponds to a url
        queue -- queue.Queue object

    Outputs:
        None, queue is modified in place to contain all child urls
    '''
    if starting_url[4] != 's':
        # turns http into https if not already
        starting_url = starting_url[:4] + 's' + starting_url[4:]
    request = util.get_request(starting_url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    URLs = soup.find_all("a")
    URLs = [URL["href"] for URL in URLs if URL.has_attr("href")]
    children = []
    for URL in URLs:
        if util.is_absolute_url(URL):
            children.append(URL)
        else:
            URL = util.convert_if_relative_url(starting_url, URL)
            children.append(URL)
    # limiting_domain is assumed to be defined at module level
    children = [
        child for child in children
        if util.is_url_ok_to_follow(child, limiting_domain)
    ]
    for child in children:
        queue.put(child)

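# --- Hedged driver sketch (not part of the original code) ---
# A bounded breadth-first walk built on queue_children_sites.  The seed URL
# and page cap are illustrative assumptions; queue_children_sites is assumed
# to rely on a module-level limiting_domain as written above.
import queue as _queue

def example_crawl(seed="https://www.example.edu/", max_pages=50):
    to_visit = _queue.Queue()
    to_visit.put(seed)
    visited = set()
    while not to_visit.empty() and len(visited) < max_pages:
        current = to_visit.get()
        if current in visited:
            continue
        visited.add(current)
        queue_children_sites(current, to_visit)
    return visited
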
def get_restaurant_links_chicago():
    # start from searching "Restaurant", "Chicago" from yelp main page
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)
    links = []
    for url in url_list:
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        # extract href links to restaurants
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            # Hardcoded filter; skip tags without a non-empty name attribute
            if link and link[-11:] == "Restaurants":
                if tag.get("name", "") != '':
                    if link not in links:
                        links.append(link)
    return links

def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list, parsing_default_domain):
    '''
    Crawl the college catalog and adds to an index dictionary to map set of
    words with associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of page urls in line to be crawled
        pull_info_q: queue of urls whose information will be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: dictionary that maps words to course identifiers
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)
    tag_list = soup.find_all("ul", attrs={"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs={"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)

def find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain):
    '''
    Adds links to be visited to the queue 'pull_info_q' and adds links
    visited to the list 'links_visited.'

    Inputs:
        soup: soup object from the text of the HTML document
        url: starting url to begin crawling with
        post_url: this is the processed absolute url
        pull_info_q: queue of urls that is being added to for each url crawled
        links_visited: list of visited links
        limiting_domain: domain name
    '''
    tag_list = soup.find_all(
        "div", attrs={"class": "Grid Grid--SpacingResponsiveLarge"})
    link_list = tag_list[0].find_all('h3', {"class": "Card__Title"})
    for link in link_list:
        possible_link = link.findChild().get("href")
        # possible_link = parsing_default_domain + possible_link
        actual_link = util.convert_if_relative_url(post_url, possible_link)
        if actual_link is not None and actual_link not in links_visited:
            if util.is_url_ok_to_follow(actual_link, limiting_domain):
                pull_info_q.put(actual_link)
    links_visited.append(url)
    if post_url != url:
        links_visited.append(post_url)

def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    # queue module (Python 3) assumed imported at module level
    urls = queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
    else:
        return None
    while not urls.empty() and len(visited) < max_num_pages_to_visit:
        top_queue = urls.get()
        if top_queue not in visited and util.is_url_ok_to_follow(
                top_queue, limiting_domain):
            request = util.get_request(top_queue)
            if request is None:
                visited.append(top_queue)
                continue
            new_page = util.get_request_url(request)
            if new_page != top_queue:
                if new_page not in visited:
                    visited.append(new_page)
                top_queue = new_page
            data = bs4.BeautifulSoup(util.read_request(request), "html5lib")
            visited.append(top_queue)
            index = indexer(index, top_queue, data)
            for link in data.find_all('a'):
                href = link.get('href')
                if href is None:
                    continue
                href = util.remove_fragment(href)
                if not util.is_absolute_url(href):
                    url = util.convert_if_relative_url(top_queue, href)
                    urls.put(url)
    return search

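# --- Hedged usage sketch (not part of the original code) ---
# build_search_engine returns a search closure (or None when the starting
# url is not followable).  The catalog URL, domain, page cap, and query
# below are illustrative assumptions; indexer() must exist in this module.
def example_search():
    search = build_search_engine("https://www.example.edu/catalog/index.html",
                                 "example.edu", 100)
    if search is None:
        return []
    # returns (title, url) pairs whose indexed words match every query word
    return search("computer science")
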
def url_check(url, parent_url):
    '''
    Takes a url and its parent url, and does various checks on the url,
    returning the url in the correct format if it is ok to use and
    returning None if not.
    '''
    if not util.is_absolute_url(url):
        url = util.convert_if_relative_url(parent_url, url)
    url = util.remove_fragment(url)
    if url:
        return url
    else:
        return None

def get_restaurant_links_cook():
    cities = get_cities()
    city_state = get_loc_cook()
    new_city_state = []
    for ele in city_state:
        if ele[0] in cities:
            new_city_state.append(ele)
    page_suffix = [i for i in range(0, 231, 10)]
    # print(city_state)
    url_list = []
    for city, state in city_state:
        html = ("https://www.yelp.com/search?find_desc=Restaurants&find_loc="
                + city.replace(" ", "") + "%2C%20" + state)
        for suffix in page_suffix:
            html_page = html + "&start=" + str(suffix)
            url_list.append(html_page)
    r'''
    with open(r"c:\Users\35653\Desktop\CS122\project\urls.txt", "w") as write_file:
        write_file.writelines(url_list)
        write_file.close()
    '''
    # NOTE: overrides the generated url_list with a single hardcoded page
    url_list = [
        "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    ]
    links = []
    for url in url_list:
        request = util.get_request(url)
        if request:
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            tags = soup.find_all('a', href=True, target="", role="")
            # extract href links to restaurants
            for tag in tags:
                link = tag['href']
                link = util.convert_if_relative_url(url, link)
                link = util.remove_fragment(link)
                # Hardcoded filter; skip tags without a non-empty name attribute
                if link and link[-11:] == "Restaurants":
                    if tag.get("name", "") != '':
                        if link not in links:
                            links.append(link + "\n")
    return links

def calendar_scraper(url, limiting_domain):
    '''
    Extracts links from a given url.

    Inputs:
        url - (string) url from which to get links
        limiting_domain - (string) that links must match

    Outputs:
        cal, art - lists of absolute, non-repeated links (strings) that are
            ok to follow within the limiting domain
    '''
    # A. Extracting links
    req = util.get_request(url)
    url2 = util.get_request_url(req)
    soup = make_soup(req)
    if soup:
        cal = []
        cal_list = soup.find_all("div", class_="panel panel_default")
        for d in cal_list:
            d_tr = util.remove_fragment(d.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                cal.append(d_abs)
        art = []
        art_list = soup.find_all("div", class_="panel panel_default")
        for d in art_list:
            d_tr = util.remove_fragment(d.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                art.append(d_abs)
        return cal, art

def get_restaurant_links():
    '''
    Start from searching "Restaurant", "Chicago" on the yelp main page, and
    collect all restaurant links from 24 pages

    Input:
        None

    Output:
        links (list): a list of links
    '''
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)
    links = []
    count = 0
    for url in url_list:
        count += 1
        print(count)
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            if link and link[-11:] == "Restaurants":
                if tag.get("name", "") != '':
                    if link not in links:
                        links.append(link + "\n")
                        print(link)
        # pause between requests to avoid hammering the server
        i = 5 + random.random() * 5
        time.sleep(i)
    return links

def scrape():
    '''
    performs entire scraping function

    outputs:
        index: dictionary of museum/exhibit information
    '''
    index = {}
    for museum_id in scrape_dict:
        limiter = scrape_dict[museum_id]['limiter']
        pages = scrape_dict[museum_id]['page']
        exhibit_urls = []
        for page in pages:
            r = requests.get(page)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            for link in soup.find_all('a', href=True):
                u = util.convert_if_relative_url(page, link['href'])
                u = util.remove_fragment(u)
                restr = scrape_dict[museum_id]['restr']
                crawl(limiter, exhibit_urls, u, restr)
        index[museum_id] = {}
        exhibit_id = museum_id + '01'
        for link in exhibit_urls:
            r = requests.get(link)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            print(link)
            try:
                index[museum_id][exhibit_id] = {}
                scrape_dict[museum_id]['info'](soup, index[museum_id],
                                               exhibit_id)
                index[museum_id][exhibit_id]['url'] = link
                exhibit_id = '00' + str(int(exhibit_id) + 1)
            except Exception:
                print('\t^^ Scraper Failed')

    with open('../csvs/musid_name.csv', 'w') as f:
        line = 'mus_id|name' + '\n'
        f.write(line)
        for mus_id in scrape_dict:
            line = '{}|{}\n'.format(str(mus_id), scrape_dict[mus_id]['name'])
            f.write(line)
    return index

def get_next_page(soup_object, current_url):
    """
    Takes a PLOS url page of articles and gets the next page

    Inputs:
        current_url (string): url for the Soup object
        soup_object (Soup Object)

    Returns:
        next page url (string)
    """
    next_page_url = soup_object.find_all("a", id="nextPageLink")
    if next_page_url:
        next_page_url = next_page_url[0]["href"]
        next_page_url = util.convert_if_relative_url(current_url,
                                                     next_page_url)
        return next_page_url
    else:
        return None

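# --- Hedged sketch (not part of the original code) ---
# Follows "next page" links with get_next_page until pagination runs out or
# a page cap is hit.  get_soup_object is assumed to be the helper used
# elsewhere in this module; first_url and max_pages are illustrative.
def example_walk_pages(first_url, max_pages=5):
    pages = []
    current = first_url
    while current and len(pages) < max_pages:
        pages.append(current)
        soup = get_soup_object(current)
        current = get_next_page(soup, current)
    return pages
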
def clean_url(url, limiting_domain, parent_url):
    '''
    Cleans the given url, if necessary.

    Inputs:
        url: (string) A url
        limiting_domain: (string) The limiting domain of the url.
        parent_url: (string) The parent url if the given url is incomplete.

    Outputs:
        The cleaned url if it is ok to follow, and None otherwise.
    '''
    c_url = util.remove_fragment(url)
    c_url = util.convert_if_relative_url(parent_url, c_url)
    if util.is_url_ok_to_follow(c_url, limiting_domain):
        return c_url
    return None

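# --- Hedged usage sketch (not part of the original code) ---
# Cleans every href found on a page with clean_url.  The request/soup steps
# mirror the other functions in this file; the limiting domain default is
# an illustrative assumption.
def example_clean_page_links(page_url, limiting_domain="example.edu"):
    request = util.get_request(page_url)
    if request is None:
        return []
    soup = bs4.BeautifulSoup(util.read_request(request), "html5lib")
    cleaned = []
    for tag in soup.find_all('a', href=True):
        c = clean_url(tag['href'], limiting_domain, page_url)
        if c is not None and c not in cleaned:
            cleaned.append(c)
    return cleaned
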
def get_category_urls(current_category_url):
    '''
    From URL with current categories, get url for each category

    Input:
        current_category_url: url for current categories

    Return:
        list of urls with each url containing awards information for one
        category
    '''
    html = requests.get(current_category_url).text
    soup = BeautifulSoup(html, "html5lib")
    links = soup.find('div', class_="div-col columns column-width").find_all('a')
    urls = []
    for link in links:
        category_relative_url = link['href']
        category_url = util.convert_if_relative_url(current_category_url,
                                                    category_relative_url)
        urls.append(category_url)
    return urls

def get_PLOS_subject_urls(starting_url):
    """
    Parses the PLOS starting url and extracts the urls for each subject area.

    Inputs:
        starting_url (string)

    Returns:
        list of urls for each subject area in PLOS
    """
    soup = get_soup_object(starting_url)
    soup_dropdown_menu_lst = soup.find_all("ul", typeof="v:Breadcrumb")
    soup_subject_urls_lst = soup_dropdown_menu_lst[0].find_all("a")
    subject_urls_lst = []
    for a_tag in soup_subject_urls_lst[1:]:
        url = util.convert_if_relative_url(starting_url, a_tag["href"])
        subject_urls_lst.append(url)
    return subject_urls_lst

def queue_urls(url, soup, queue, limiting_domain):
    '''
    Forms a queue of all the urls

    Inputs:
        url: the url to put into the queue
        soup: BeautifulSoup object
        queue: the existing queue
        limiting_domain: a domain to stay within when queuing

    Outputs:
        None
    '''
    for link in soup.find_all('a'):
        clean_url = util.convert_if_relative_url(
            url, util.remove_fragment(link.get('href')))
        if util.is_absolute_url(clean_url) and str(clean_url)[0] != 'b':
            if (util.is_url_ok_to_follow(clean_url, limiting_domain)
                    and clean_url not in queue.all_items):
                queue.enqueue(clean_url)

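# --- Hedged sketch (not part of the original code) ---
# queue_urls expects a queue object exposing `enqueue` and `all_items`,
# which is not Python's queue.Queue.  This minimal stand-in is purely
# illustrative; the real project presumably defines its own queue class.
class SimpleLinkQueue:
    def __init__(self):
        self.all_items = []

    def enqueue(self, item):
        # keep every queued url so the membership check in queue_urls works
        self.all_items.append(item)

    def dequeue(self):
        return self.all_items.pop(0)
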
def get_all_links(soup, url):
    '''
    Function takes a soup object and returns a list of all relevant
    urls/links for this assignment.

    Input:
        soup (bs4 object): a bs4 soup object of the web page
        url (str): the url from which the bs4 soup was constructed
            (required in order to convert relative urls into absolute urls)

    Output:
        all_links (list): a list of ready-to-go urls
    '''
    all_links = []
    mega_set = soup.find_all('a')
    for link in mega_set:
        if link.has_attr('href'):
            link = util.remove_fragment(link['href'])
            abs_link = util.convert_if_relative_url(url, link)
            if abs_link not in all_links:
                all_links.append(abs_link)
    return all_links

def go(num_articles_to_crawl, start_page, database_name):
    '''
    Crawl PLOS One and generate an SQL database. Automatically samples an
    even number of articles from each subject area based on
    num_articles_to_crawl.

    Inputs:
        num_articles_to_crawl (int): the approximate number of articles to
            process during the crawl
        start_page (int): page of PLOS One subject browsing to start
            crawling at
        database_name (string): name of database to add to
    '''
    urls_visited = set()
    starting_url = "https://journals.plos.org/plosone/browse"
    subject_urls_lst = get_PLOS_subject_urls(starting_url)
    num_articles_per_field = num_articles_to_crawl / 11
    num_pages_per_field = math.ceil(num_articles_per_field / 13)
    for subject_url in subject_urls_lst:
        field = get_field(subject_url)
        subject_url += '?page=' + str(start_page)
        subject_soup = get_soup_object(subject_url)
        current_url = subject_url
        urls_visited.add(current_url)
        for _ in range(num_pages_per_field):
            soup_article_lst = subject_soup.find_all("h2", class_="title")
            for soup_article in soup_article_lst:
                article_url = util.convert_if_relative_url(
                    current_url, soup_article.find_all("a")[0]["href"])
                if article_url not in urls_visited:
                    urls_visited.add(article_url)
                    process_article(article_url, field, database_name)
            current_url = get_next_page(subject_soup, subject_url)
            if not current_url:
                break
            subject_soup = get_soup_object(current_url)
            urls_visited.add(current_url)
    return

def get_clean_urls(url, limiting_domain):
    '''
    Given the url for a webpage, create and return a list of all 'a' tag
    urls in its soup that have been cleaned (absolute urls only) and are
    'ok' to follow

    Inputs:
        url - absolute url
        limiting_domain - domain name

    Outputs:
        list of absolute urls
    '''
    soup = get_soup_from_url(url)
    all_a_tags = soup.find_all("a")
    # get urls (i.e. if tag has 'href' attribute)
    clean_urls = []
    for tag in all_a_tags:
        if tag.has_attr('href'):
            absolute = util.convert_if_relative_url(url, tag['href'])
            if not util.is_url_ok_to_follow(absolute, limiting_domain):
                continue
            absolute = util.remove_fragment(absolute)
            # protocol field reversion
            temp_request = util.get_request(absolute)
            if temp_request is None:
                continue
            reverted_url = util.get_request_url(temp_request)
            # is url ok to follow based on specification in PA2
            if util.is_url_ok_to_follow(reverted_url, limiting_domain):
                clean_urls.append(reverted_url)
    # remove duplicates
    final_url_list = []
    for link in clean_urls:
        if link not in final_url_list:
            final_url_list.append(link)
    return final_url_list

def get_neighbors(node):
    print(" completed")
    neighbors = []
    soup = None
    response = util.get_request(node)
    if response is None:
        print('No response')
        neighbors = None
    else:
        text = util.read_request(response)
        if text == "":
            print("No text read")
            neighbors = None
        else:
            soup = bs4.BeautifulSoup(text, "html5lib")
            for link in soup.find_all("a"):
                url_raw = link.get("href")
                if url_raw is None:
                    continue
                url_rel = util.remove_fragment(url_raw)
                url = util.convert_if_relative_url(node, url_rel)
                print(url)
                if url is not None:
                    neighbors.append(url)
    return neighbors, response, soup

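# --- Hedged sketch (not part of the original code) ---
# A bounded breadth-first crawl built on get_neighbors; the seed URL and
# page limit are illustrative assumptions.
from collections import deque

def example_bfs(seed, max_pages=25):
    seen = {seed}
    frontier = deque([seed])
    while frontier and len(seen) < max_pages:
        node = frontier.popleft()
        neighbors, _, _ = get_neighbors(node)
        if not neighbors:
            continue
        for nbr in neighbors:
            if nbr not in seen:
                seen.add(nbr)
                frontier.append(nbr)
    return seen
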
def find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain):
    '''
    Adds links to be visited to the queue 'pull_info_q' and adds links
    visited to the list 'links_visited.'

    Inputs:
        soup: soup object from the text of the HTML document
        url: starting url to begin crawling with
        post_url: this is the processed absolute url
        pull_info_q: queue of urls that is being added to for each url crawled
        links_visited: list of visited links
        limiting_domain: domain name
    '''
    tag_list = soup.find_all("div", attrs={"class": "search-listing-content"})
    for tag in tag_list:
        href_tag = tag.findChild()
        possible_link = href_tag.get('href')
        actual_link = util.convert_if_relative_url(post_url, possible_link)
        if actual_link is not None and actual_link not in links_visited:
            if util.is_url_ok_to_follow(actual_link, limiting_domain):
                pull_info_q.put(actual_link)
    links_visited.append(url)
    if post_url != url:
        links_visited.append(post_url)

def movie_level_data(index_filename, i=0):
    '''
    Scrapes all movie level data using urls scraped from the all movies page
    '''
    s = 'SELECT title, movie_id, url FROM all_page'
    db = sqlite3.connect('sql_db_files/rotten_tomatoes.db')
    c = db.cursor()
    r = c.execute(s)
    urls = r.fetchall()[:i + 1]
    urls = urls[::-1]
    # Replace the above two lines with the below for forward scraping
    # urls = r.fetchall()[i:]
    db.close()
    current_url = 'https://www.rottentomatoes.com/'
    with open('movie_level_files/' + index_filename + '_' + str(i) + '.csv',
              'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='|')
        for title, movie_id, url in urls:
            url = util.convert_if_relative_url(current_url, url)
            print(i, title, movie_id)
            i -= 1
            # Replace the above for forward scraping
            # i += 1
            r = url_request(index_filename, i, url)
            html = r.read()
            soup = bs4.BeautifulSoup(html, features='html5lib')
            movie = data_collector(soup)
            if movie.get('Runtime:'):
                movie['Runtime:'] = movie.get('Runtime:').strip('minutes ')
            if movie.get('In Theaters:'):
                movie['In Theaters:'] = re.search(
                    '[a-zA-Z]{3} [0-9,]+ [0-9]{4}',
                    movie.get('In Theaters:')).group()
            if movie.get('Box Office:'):
                movie['Box Office:'] = movie.get('Box Office:').strip()
            writer.writerow([
                movie_id, title, movie.get('Directed By:'),
                movie.get('Genre:'), movie.get('In Theaters:'),
                movie.get('On Disc/Streaming:'), movie.get('Box Office:'),
                movie.get('Rating:'), movie.get('Runtime:'),
                movie.get('Studio:'), movie.get('Written By:'),
                movie.get('full_synop'), movie.get('all_reviewers_average'),
                movie.get('num_reviewers'), movie.get('all_fresh'),
                movie.get('all_rotten'), movie.get('top_reviewers_average'),
                movie.get('num_top_reviewers'), movie.get('top_fresh'),
                movie.get('top_rotten'), movie.get('user_rating'),
                movie.get('num_users')
            ])

def get_lyrics(cnx, cursor, url=STARTING_URL, visited_pages=set()):
    # note: the mutable default for visited_pages is shared across top-level
    # calls; pass a fresh set explicitly to start a new crawl
    print("\ntrying", url)
    visited_pages.add(url)
    print("visited_pages length: {}".format(len(visited_pages)))
    try:
        url, soup = open_page(url)
    except Exception:
        print("there was an exception - you done f****d up\n")
        return None
    if not url:
        print("no url")
        return None
    # base case: a .txt lyrics page, whose text is stored in the database
    if url[-4:] == '.txt':
        if soup.find('pre'):
            text = soup.find('pre').text
            print("adding {}\n".format(url))
        else:
            ps = soup.find_all('p')
            ps = list(filter(lambda x: 'Artist:' in x.text, ps))
            if len(ps) == 1:
                text = ps[0].text
            else:
                text = soup.text
        cursor.execute(ADD_RAW_TEXT, (url, text))
        cnx.commit()
    # recursive case: follow the links on the page
    else:
        if soup.find('div', id='leftmain'):
            tag = soup.find('div', id='leftmain')
        else:
            tag = soup
        new_links = tag.find_all('a', href=True)
        for link in new_links:
            clean_link = util.remove_fragment(link['href'])
            if not clean_link:
                print("no clean link")
                continue
            abs_link = util.convert_if_relative_url(url, clean_link)
            if (abs_link in visited_pages
                    or not util.is_url_ok_to_follow(abs_link, LIMITING_DOMAIN)
                    or 'update' in abs_link):
                print("child link shan't be followed: {}".format(abs_link))
                continue
            get_lyrics(cnx, cursor, abs_link, visited_pages)