示例#1
0
def test_next_subgenre_music():
    """Starting from test_url4 (genre "Music"), next_subgenre() gives None."""
    crawler = Driver(test_url4, MockFetcher(fetch_values))
    crawler.populate_state()

    # Starting state: a genre is selected, but no subgenre is.
    assert_equal(crawler.current_genre.string, "Music")
    assert_equal(crawler.current_subgenre, None)

    # Advancing hands back None instead of a subgenre tag.
    assert_equal(crawler.next_subgenre(), None)
示例#2
0
def test_next_subgenre_music():
    """Check that next_subgenre() is None when started on test_url4 ("Music")."""
    music_driver = Driver(test_url4, MockFetcher(fetch_values))
    music_driver.populate_state()

    # The populated state names the genre and leaves the subgenre unset.
    assert_equal(music_driver.current_genre.string, "Music")
    assert_equal(music_driver.current_subgenre, None)

    # Asking for the next subgenre yields None.
    following = music_driver.next_subgenre()
    assert_equal(following, None)
示例#3
0
def test_next_page():
    """next_page() returns pages "2" then "3" from test_url2 and tracks them."""
    crawler = Driver(test_url2, MockFetcher(fetch_values))
    crawler.populate_state()

    for expected in ("2", "3"):
        page = crawler.next_page()
        assert_equal(page.string, expected)
        # The driver's own current_page moves in step with the return value.
        assert_equal(crawler.current_page.string, expected)
示例#4
0
def test_crawl():
    """crawl() fed one repeated page finishes with 49 history entries."""
    single_page_fetcher = MockSingleResultFetcher(test_url_file)
    crawler = Driver(test_url2, single_page_fetcher, NullOutput())
    crawler.populate_state()
    crawler.crawl()

    visited = crawler.history
    assert_true(len(visited) > 32)

    # The same page is fed in for every request, so the crawl only ever
    # fetches urls that appear on that page:
    # 16 genres + 6 subgenres + 27 letters
    assert_equal(len(visited), 49)
    assert_equal(visited[-1].string, "Technology")
示例#5
0
def test_repopulate_queues():
    """repopulate_queues() leaves pages, letters and subgenres unchanged.

    The driver is populated from test_url2 and is then handed that same
    page again; none of the three queues is expected to differ afterwards.
    """
    # Imported locally so the test does not rely on the module's import header.
    import copy

    fetcher = MockFetcher(fetch_values)
    driver = Driver(test_url2, fetcher)
    driver.populate_state()

    # Snapshot shallow copies, not aliases: comparing a queue against a
    # second name for the very same object can never fail, so an in-place
    # mutation by repopulate_queues() would go unnoticed.  copy.copy()
    # keeps whatever container type the driver uses for each queue.
    before_pages = copy.copy(driver.pages)
    before_letters = copy.copy(driver.letters)
    before_subgenres = copy.copy(driver.subgenres)

    driver.repopulate_queues(fetcher.fetch(test_url2))

    # Shouldn't change anything.
    assert_equal(before_pages, driver.pages)
    assert_equal(before_letters, driver.letters)
    assert_equal(before_subgenres, driver.subgenres)
示例#6
0
def test_return_urls_not_in_history_real_tags_single_element():
    """return_urls_not_in_history() drops the one letter tag held in history.

    Letter tags are scraped twice from test_url3_file; the first tag of one
    scrape is placed in history and the other scrape is filtered against it.
    """
    crawler = Driver(test_url3, MockFetcher(fetch_values))
    crawler.populate_state()

    # Two independent scrapers are used on purpose: in real life the tags
    # in history and the tags being filtered come from different scrapers.
    candidate_tags = Scraper(text_from_file(test_url3_file)).get_letter_urls()
    seen_tags = Scraper(text_from_file(test_url3_file)).get_letter_urls()

    # NOTE(review): history is set to a single tag here, not a list of
    # tags -- presumably deliberate given the test name; confirm.
    crawler.history = seen_tags[0]

    remaining = crawler.return_urls_not_in_history(candidate_tags)
    assert_equal(remaining, candidate_tags[1:])
示例#7
0
def test_repopulate_queues_subgenres():
    """repopulate_queues() refills an emptied subgenre queue minus history.

    First checks that a non-empty queue is left alone, then moves two
    subgenres into history, empties the queue and checks that the refill
    equals the original queue without those two entries.
    """
    fetcher = MockFetcher(fetch_values)
    driver = Driver(test_url2, fetcher)
    driver.populate_state()

    # Take a copy, not an alias: comparing the queue with another name for
    # the same list object could never detect an in-place modification.
    before_subgenres = list(driver.subgenres)

    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(before_subgenres, driver.subgenres)

    # Move two subgenres into history.
    driver.history = driver.history + driver.subgenres[0:2]
    after_subgenres = driver.subgenres[2::]

    # Wipe driver.subgenres, so that it will actually refresh.
    driver.subgenres = []
    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(after_subgenres, driver.subgenres)
示例#8
0
def test_next_subgenre():
    """next_subgenre() walks the "Arts" subgenres in order, then gives None."""
    expected_order = [
        "Design",
        "Fashion & Beauty",
        "Food",
        "Literature",
        "Performing Arts",
        "Visual Arts",
    ]

    crawler = Driver(test_url, MockFetcher(fetch_values))
    crawler.populate_state()

    # Starting state: genre selected, no subgenre yet.
    assert_equal(crawler.current_genre.string, "Arts")
    assert_equal(crawler.current_subgenre, None)

    for name in expected_order:
        assert_equal(crawler.next_subgenre().string, name)

    # Once the queue is used up the driver reports None.
    assert_equal(crawler.next_subgenre(), None)
示例#9
0
def test_next_url():
    """next_url() gives test_url2 first, then page 2 of letter A."""
    page_source = MockFetcher(fetch_values)
    crawler = Driver(test_url, page_source, NullOutput())
    crawler.populate_state()

    first_url = crawler.next_url()
    assert_equal(first_url, test_url2)

    # Process the fetched page before asking for the following url; the
    # value process_page() returns is not needed by this test.
    crawler.process_page(page_source.fetch(first_url), first_url)

    second_url = crawler.next_url()
    assert_equal(
        second_url,
        "https://itunes.apple.com/us/genre/podcasts-arts/id1301?mt=2&letter=A&page=2#page",
    )
示例#10
0
def test_next_subgenre_from_middle():
    """Starting on subgenre "Food" (test_url5), only later subgenres follow."""
    crawler = Driver(test_url5, MockFetcher(fetch_values))
    crawler.populate_state()

    # The start url already points at a subgenre of "Arts".
    assert_equal(crawler.current_genre.string, "Arts")
    assert_equal(crawler.current_subgenre.string, "Food")

    for name in ("Literature", "Performing Arts", "Visual Arts"):
        assert_equal(crawler.next_subgenre().string, name)

    # Nothing is left after the last subgenre.
    assert_equal(crawler.next_subgenre(), None)
示例#11
0
def test_repopulate_queues_letters():
    """repopulate_queues() refills an emptied letter queue minus history.

    First checks that a non-empty queue is left alone, then records the
    current letter and two queued letters in history, empties the queue and
    checks that the refill equals the original queue without those two.
    """
    fetcher = MockFetcher(fetch_values)
    driver = Driver(test_url2, fetcher)
    driver.populate_state()

    # Take a copy, not an alias: comparing the queue with another name for
    # the same list object could never detect an in-place modification.
    before_letters = list(driver.letters)

    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(before_letters, driver.letters)

    # Add the current url to history, since it would normally be there.
    driver.history.append(driver.current_letter)

    # Move two letters into history.
    driver.history = driver.history + driver.letters[0:2]
    after_letters = driver.letters[2::]

    # Wipe driver.letters, so that it will actually refresh.
    driver.letters = []
    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(after_letters, driver.letters)
示例#12
0
def test_next_page():
    """Advance twice from test_url2 and expect pages "2" and then "3"."""
    pager = Driver(test_url2, MockFetcher(fetch_values))
    pager.populate_state()

    for wanted in ("2", "3"):
        returned = pager.next_page()
        assert_equal(returned.string, wanted)
        # current_page must agree with what next_page() just returned.
        assert_equal(pager.current_page.string, wanted)
示例#13
0
def test_repopulate_queues_subgenres():
    """Emptied subgenre queue is refilled without entries already in history.

    A non-empty queue must survive repopulate_queues() untouched; after two
    subgenres are moved into history and the queue is emptied, the refill
    must equal the original queue minus those two entries.
    """
    fetcher = MockFetcher(fetch_values)
    driver = Driver(test_url2, fetcher)
    driver.populate_state()

    # Snapshot a real copy.  Binding another name to the same list would
    # make the "unchanged" assertion pass even after an in-place mutation.
    before_subgenres = list(driver.subgenres)

    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(before_subgenres, driver.subgenres)

    # Move two subgenres into history.
    driver.history = driver.history + driver.subgenres[0:2]
    after_subgenres = driver.subgenres[2::]

    # Wipe driver.subgenres, so that it will actually refresh.
    driver.subgenres = []
    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(after_subgenres, driver.subgenres)
示例#14
0
def test_next_letter():
    """next_letter() yields A-Z then "#", and afterwards stays at None."""
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"

    crawler = Driver(test_url, MockFetcher(fetch_values))
    crawler.populate_state()

    for expected in alphabet:
        letter = crawler.next_letter()
        assert_equal(letter.string, expected)
        assert_equal(crawler.current_letter.string, expected)

    # The queue is now exhausted.
    assert_equal(crawler.next_letter(), None)
    assert_equal(crawler.current_letter, None)

    # Should NOT loop back to "A", because we haven't repopulated
    # the queue.
    assert_equal(crawler.next_letter(), None)
示例#15
0
def test_next_url():
    """Check the first two urls next_url() produces when started on test_url."""
    mock_fetcher = MockFetcher(fetch_values)
    crawler = Driver(test_url, mock_fetcher, NullOutput())
    crawler.populate_state()

    # The first url handed out is test_url2.
    url = crawler.next_url()
    assert_equal(url, test_url2)

    # Feed that page through the driver; its return value is not used here.
    crawler.process_page(mock_fetcher.fetch(url), url)

    # The second url is page 2 of the same letter.
    url = crawler.next_url()
    assert_equal(
        url,
        "https://itunes.apple.com/us/genre/podcasts-arts/id1301?mt=2&letter=A&page=2#page",
    )
示例#16
0
def test_repopulate_queues_letters():
    """Emptied letter queue is refilled without entries already in history.

    A non-empty queue must survive repopulate_queues() untouched; after the
    current letter and two queued letters are put in history and the queue
    is emptied, the refill must equal the original queue minus those two.
    """
    fetcher = MockFetcher(fetch_values)
    driver = Driver(test_url2, fetcher)
    driver.populate_state()

    # Snapshot a real copy.  Binding another name to the same list would
    # make the "unchanged" assertion pass even after an in-place mutation.
    before_letters = list(driver.letters)

    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(before_letters, driver.letters)

    # Add the current url to history, since it would normally be there.
    driver.history.append(driver.current_letter)

    # Move two letters into history.
    driver.history = driver.history + driver.letters[0:2]
    after_letters = driver.letters[2::]

    # Wipe driver.letters, so that it will actually refresh.
    driver.letters = []
    driver.repopulate_queues(fetcher.fetch(test_url2))
    assert_equal(after_letters, driver.letters)
示例#17
0
def test_crawl():
    """A full crawl over one repeated page records exactly 49 urls."""
    repeating_fetcher = MockSingleResultFetcher(test_url_file)
    crawler = Driver(test_url2, repeating_fetcher, NullOutput())
    crawler.populate_state()
    crawler.crawl()

    trail = crawler.history
    assert_true(len(trail) > 32)

    # Because the same page is fed in repeatedly, the crawl only fetches
    # urls that are present on that page.
    # 16 genres + 6 subgenres + 27 letters
    assert_equal(len(trail), 49)
    assert_equal(trail[-1].string, "Technology")
示例#18
0
def test_next_letter():
    """Walk every letter from "A" to "#" and confirm the queue then stays empty."""
    every_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"

    walker = Driver(test_url, MockFetcher(fetch_values))
    walker.populate_state()

    for wanted in every_letter:
        got = walker.next_letter()
        assert_equal(got.string, wanted)
        assert_equal(walker.current_letter.string, wanted)

    # Past the last letter both the return value and the state are None.
    assert_equal(walker.next_letter(), None)
    assert_equal(walker.current_letter, None)

    # Should NOT loop back to "A", because we haven't repopulated
    # the queue.
    assert_equal(walker.next_letter(), None)
示例#19
0
def test_repopulate_queues():
    """Repopulating from the page already loaded must not alter any queue.

    The driver is populated from test_url2, then repopulate_queues() is
    given that same page; pages, letters and subgenres must compare equal
    to their earlier contents.
    """
    # Imported locally so the test does not rely on the module's import header.
    import copy

    fetcher = MockFetcher(fetch_values)
    driver = Driver(test_url2, fetcher)
    driver.populate_state()

    # Shallow-copy each queue.  A plain assignment would only create a
    # second name for the same object, and the assertions below would then
    # hold trivially even if repopulate_queues() mutated a queue in place.
    # copy.copy() preserves the container type the driver chose.
    before_pages = copy.copy(driver.pages)
    before_letters = copy.copy(driver.letters)
    before_subgenres = copy.copy(driver.subgenres)

    driver.repopulate_queues(fetcher.fetch(test_url2))

    # Shouldn't change anything.
    assert_equal(before_pages, driver.pages)
    assert_equal(before_letters, driver.letters)
    assert_equal(before_subgenres, driver.subgenres)
示例#20
0
def test_return_urls_not_in_history_real_tags_single_element():
    """Filtering letter tags against a one-tag history removes only that tag.

    The letter tags of test_url3_file are scraped twice; the first tag from
    one scrape becomes the history and the other scrape is filtered.
    """
    driver = Driver(test_url3, MockFetcher(fetch_values))
    driver.populate_state()

    # Separate scrapers on purpose: in real life the tags in history and
    # the tags being checked come from two different scrapers.
    fresh_tags = Scraper(text_from_file(test_url3_file)).get_letter_urls()
    history_tags = Scraper(text_from_file(test_url3_file)).get_letter_urls()

    # NOTE(review): a single tag (not a list) is assigned to history --
    # presumably the "single element" case this test is named for; confirm.
    driver.history = history_tags[0]

    unseen = driver.return_urls_not_in_history(fresh_tags)
    assert_equal(unseen, fresh_tags[1:])
示例#21
0
def test_next_page_from_middle():
    """Starting from test_url3, next_page() moves the driver to page "3"."""
    crawler = Driver(test_url3, MockFetcher(fetch_values))
    crawler.populate_state()

    following = crawler.next_page()

    # Both the returned tag and the driver state name the new page.
    assert_equal(following.string, "3")
    assert_equal(crawler.current_page.string, "3")
示例#22
0
def test_starting_state():
    """populate_state() derives genre, subgenre, letter and page per start url."""

    def check(tag, expected):
        # None means "nothing selected"; otherwise compare the tag's text.
        if expected is None:
            assert_equal(tag, None)
        else:
            assert_equal(tag.string, expected)

    # (start url, genre, subgenre, letter, page)
    cases = [
        (test_url, "Arts", None, None, None),
        (test_url2, "Arts", None, "A", "1"),
        (test_url3, "Society & Culture", None, "N", "2"),
        (test_url4, "Music", None, None, None),
        (test_url5, "Arts", "Food", None, None),
    ]

    for url, genre, subgenre, letter, page in cases:
        driver = Driver(url, MockFetcher(fetch_values))
        driver.populate_state()
        check(driver.current_genre, genre)
        check(driver.current_subgenre, subgenre)
        check(driver.current_letter, letter)
        check(driver.current_page, page)
示例#23
0
def test_next_genre():
    """next_genre() walks every genre after "Arts" in order, then gives None."""
    expected_order = [
        "Business",
        "Comedy",
        "Education",
        "Games & Hobbies",
        "Government & Organizations",
        "Health",
        "Kids & Family",
        "Music",
        "News & Politics",
        "Religion & Spirituality",
        "Science & Medicine",
        "Society & Culture",
        "Sports & Recreation",
        "TV & Film",
        "Technology",
    ]

    crawler = Driver(test_url, MockFetcher(fetch_values))
    crawler.populate_state()

    for name in expected_order:
        assert_equal(crawler.next_genre().string, name)

    # After the last genre there is nothing left to hand out.
    assert_equal(crawler.next_genre(), None)
示例#24
0
def test_starting_state():
    """Check the state populate_state() reports for each of the five start urls."""

    def expect(actual, wanted):
        # A wanted value of None asserts the attribute itself is None;
        # anything else is matched against the attribute's .string.
        if wanted is None:
            assert_equal(actual, None)
        else:
            assert_equal(actual.string, wanted)

    # Columns: start url, genre, subgenre, letter, page.
    scenarios = (
        (test_url, "Arts", None, None, None),
        (test_url2, "Arts", None, "A", "1"),
        (test_url3, "Society & Culture", None, "N", "2"),
        (test_url4, "Music", None, None, None),
        (test_url5, "Arts", "Food", None, None),
    )

    for start_url, genre, subgenre, letter, page in scenarios:
        state = Driver(start_url, MockFetcher(fetch_values))
        state.populate_state()
        expect(state.current_genre, genre)
        expect(state.current_subgenre, subgenre)
        expect(state.current_letter, letter)
        expect(state.current_page, page)
示例#25
0
def test_next_genre():
    """From test_url, next_genre() yields "Business" .. "Technology", then None."""
    remaining_genres = (
        "Business",
        "Comedy",
        "Education",
        "Games & Hobbies",
        "Government & Organizations",
        "Health",
        "Kids & Family",
        "Music",
        "News & Politics",
        "Religion & Spirituality",
        "Science & Medicine",
        "Society & Culture",
        "Sports & Recreation",
        "TV & Film",
        "Technology",
    )

    walker = Driver(test_url, MockFetcher(fetch_values))
    walker.populate_state()

    for wanted in remaining_genres:
        genre = walker.next_genre()
        assert_equal(genre.string, wanted)

    # The genre queue is exhausted at this point.
    assert_equal(walker.next_genre(), None)
示例#26
0
def test_next_page_from_middle():
    """A driver started on test_url3 advances to page "3" on next_page()."""
    pager = Driver(test_url3, MockFetcher(fetch_values))
    pager.populate_state()

    new_page = pager.next_page()

    # Return value and driver state must agree on the new page.
    assert_equal(new_page.string, "3")
    assert_equal(pager.current_page.string, "3")
示例#27
0
def test_next_genre_from_middle():
    """Starting from test_url3, next_genre() gives "Sports & Recreation"."""
    crawler = Driver(test_url3, MockFetcher(fetch_values))
    crawler.populate_state()

    assert_equal(crawler.next_genre().string, "Sports & Recreation")
示例#28
0
def test_next_letter_from_middle():
    """Starting from test_url3, next_letter() moves the driver to letter "O"."""
    crawler = Driver(test_url3, MockFetcher(fetch_values))
    crawler.populate_state()

    following = crawler.next_letter()

    # Both the returned tag and the driver state name the new letter.
    assert_equal(following.string, "O")
    assert_equal(crawler.current_letter.string, "O")
示例#29
0
def test_next_genre_from_middle():
    """A driver started on test_url3 reports "Sports & Recreation" next."""
    mid_driver = Driver(test_url3, MockFetcher(fetch_values))
    mid_driver.populate_state()

    upcoming = mid_driver.next_genre()
    assert_equal(upcoming.string, "Sports & Recreation")
示例#30
0
#!/usr/bin/python
"""
Example script showing how to run the Podscrape crawler.

Builds a Fetcher, a FileOutput writing to two CSV files in the current
directory, and a Driver that starts from ``starting_url``, then calls
``Driver.crawl()``.  Everything runs at import time; there is no
``__main__`` guard.
"""

from podscrape.driver import Driver
from podscrape.fetcher import Fetcher
from podscrape.output import FileOutput

# The url to start scraping from. The one used here is near the end of the
# crawl according to Podscrape's strategy.
# The very first url would be: "https://itunes.apple.com/us/genre/podcasts-arts/id1301?mt=2"
starting_url = "https://itunes.apple.com/us/genre/podcasts-technology-tech-news/id1448?mt=2&letter=T"

# Instantiate a Fetcher so we can make http requests.
fetcher = Fetcher()

# Prepare file output: two CSV paths, relative to the working directory,
# handed to FileOutput in this order.
scraped_info_file = "./scraped_info.csv"
lookup_info_file = "./lookup_info.csv"
output = FileOutput(scraped_info_file, lookup_info_file)

# Initialize the Driver with the start url, the fetcher and the output sink.
driver = Driver(starting_url, fetcher, output)

# Start crawling!
driver.crawl()
示例#31
0
def test_next_letter_from_middle():
    """A driver started on test_url3 advances to letter "O" on next_letter()."""
    mid_driver = Driver(test_url3, MockFetcher(fetch_values))
    mid_driver.populate_state()

    new_letter = mid_driver.next_letter()

    # Return value and driver state must agree on the new letter.
    assert_equal(new_letter.string, "O")
    assert_equal(mid_driver.current_letter.string, "O")