Example #1
    def test_write_lookup_info(self):
        scrape_filename = "./tests/testcases/scrape.csv"
        lookup_filename = "./tests/testcases/lookups.csv"

        # Empty the lookup file so the test starts from a clean slate
        # (mode='w' already truncates; the explicit truncate() is redundant
        # but harmless).
        f = codecs.open(lookup_filename, encoding='utf-8', mode='w')
        f.truncate()
        f.close()
        output = FileOutput(scrape_filename, lookup_filename)

        podcasts = [
            Podcast(1, "The Show", "http://theshow.com/feed.xml"),
            Podcast(2, "Show: Revengance",
                    "http://show-revengance.com/rss?category=podcast"),
            Podcast(3, "NPR: That Other Show",
                    "https://npr.org/rss.php?podcastid=32"),
        ]

        # Write the podcasts out, then read the file back and parse each
        # tab-separated row into a Podcast for comparison.
        output.write_lookup_info(podcasts)
        f = codecs.open(output.lookup_filename, encoding='utf-8')
        text = f.read()
        f.close()
        split_rows = text.split("\n")
        read_podcasts = []
        for i in range(3):
            split_text = split_rows[i].split("\t")
            read_podcasts.append(
                Podcast(int(split_text[0]), split_text[1], split_text[2]))

        self.assertEqual(podcasts[0].itunes_id, read_podcasts[0].itunes_id)
        self.assertEqual(podcasts, read_podcasts)
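The test never shows write_lookup_info itself, but its read-back logic pins down the file format: one line per podcast, with the iTunes ID, name, and feed URL joined by tabs. A minimal sketch of a writer matching that contract (hypothetical; only the itunes_id attribute is confirmed by the test, the other attribute names are assumptions):

import codecs

def write_lookup_rows(lookup_filename, podcasts):
    # Hypothetical helper: append one "<itunes_id>\t<name>\t<feed_url>"
    # row per podcast, matching what the test above parses back out.
    # p.name and p.feed_url are assumed attribute names.
    with codecs.open(lookup_filename, encoding='utf-8', mode='a') as f:
        for p in podcasts:
            f.write(u"\t".join([str(p.itunes_id), p.name, p.feed_url]) + u"\n")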
Example #2
    def test_write_scraped_info(self):
        scrape_filename = "./tests/testcases/scrape.csv"
        lookup_filename = "./tests/testcases/lookups.csv"

        # Empty the scrape file so the test starts from a clean slate.
        f = codecs.open(scrape_filename, encoding='utf-8', mode='w')
        f.truncate()
        f.close()
        output = FileOutput(scrape_filename, lookup_filename)

        source_url = "https://itunes.apple.com/us/genre/podcasts-arts/id1301?mt=2"
        genre = Url("https://apple.com/podcast/arts", "Arts")
        subgenre = None
        letter = None
        page = None
        urls = ["https://apple.com/podcast/arts/id" + str(i) for i in range(10)]

        output.write_scraped_info(source_url, genre, subgenre, letter, page,
                                  urls)

        # Read the file back; everything lands on one tab-separated,
        # newline-terminated row.
        f = codecs.open(output.scrape_filename, encoding='utf-8')
        text = f.read()
        f.close()
        split_text = text.split("\t")
        self.assertEqual(split_text[0], source_url)
        self.assertEqual(split_text[1], genre.string)
        # A None subgenre, letter, and page serialize to "None", "Popular",
        # and "0" respectively.
        self.assertEqual(split_text[2], "None")
        self.assertEqual(split_text[3], "Popular")
        self.assertEqual(split_text[4], "0")

        # The ten scraped URLs follow; the last field carries the trailing
        # newline.
        for i in range(9):
            self.assertEqual(split_text[5 + i], urls[i])
        self.assertEqual(split_text[14], urls[9] + '\n')
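The assertions fix the row layout for scraped info: source URL, genre label, subgenre (or "None"), letter ("Popular" when absent), page ("0" when absent), then every scraped URL, all tab-separated and newline-terminated. A hypothetical writer honoring exactly those defaults might look like:

import codecs

def write_scrape_row(scrape_filename, source_url, genre, subgenre,
                     letter, page, urls):
    # Hypothetical sketch; the field order and the "None"/"Popular"/"0"
    # fallbacks are taken directly from the assertions above.
    fields = [source_url,
              genre.string,
              subgenre.string if subgenre else "None",
              letter if letter else "Popular",
              str(page) if page is not None else "0"]
    fields.extend(urls)
    with codecs.open(scrape_filename, encoding='utf-8', mode='a') as f:
        f.write(u"\t".join(fields) + u"\n")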
Example #3
#!/usr/bin/python
"""
Example script for how to run the crawler
"""

from podscrape.driver import Driver
from podscrape.fetcher import Fetcher
from podscrape.output import FileOutput

# The URL to start scraping from. This one is near the end of the
# crawl order under Podscrape's strategy; the very first URL would be
# "https://itunes.apple.com/us/genre/podcasts-arts/id1301?mt=2".
starting_url = "https://itunes.apple.com/us/genre/podcasts-technology-tech-news/id1448?mt=2&letter=T"

# Instantiate a Fetcher so we can make HTTP requests
fetcher = Fetcher()

# Prepare File Output
scraped_info_file = "./scraped_info.csv"
lookup_info_file = "./lookup_info.csv"
output = FileOutput(scraped_info_file, lookup_info_file)

# Initialize the Driver object
driver = Driver(starting_url, fetcher, output)

# Start crawling!
driver.crawl()
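
# Once the crawl finishes, the two output files can be spot-checked with
# the standard csv module. As the tests above show, FileOutput writes
# tab-separated rows, so a tab delimiter is all the reader needs
# (illustrative only; field meanings follow the lookup test):

import csv

with open("./lookup_info.csv", encoding="utf-8", newline="") as f:
    for row in csv.reader(f, delimiter="\t"):
        print(row)  # e.g. [itunes_id, name, feed_url]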