class Fluctu8_com(CrawlSpider):
    allowed_domains = ['fluctu8.com']  # public for scrapy
    start_urls = [  # public for scrapy
        'http://www.fluctu8.com/sitemap/index-map-0.html',
        'http://www.fluctu8.com/sitemap/index-map-1.html',
        'http://www.fluctu8.com/sitemap/index-map-2.html',
        'http://www.fluctu8.com/sitemap/index-map-3.html',
        'http://www.fluctu8.com/sitemap/index-map-4.html',
        'http://www.fluctu8.com/sitemap/index-map-5.html',
        'http://www.fluctu8.com/sitemap/index-map-6.html',
        'http://www.fluctu8.com/sitemap/index-map-7.html',
        'http://www.fluctu8.com/sitemap/index-map-8.html',
        'http://www.fluctu8.com/sitemap/index-map-9.html']
    _pt = PathTool()
    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy
    links = []

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sitemap_page_xpath = "/html/body/a/@href"
        sitemap_page_urls = hxs.select(sitemap_page_xpath).extract()
        for sitemap_page_url in sitemap_page_urls:
            resource = Resource(self._baseUrl + sitemap_page_url, "directory")
            url = resource.get_absolute_url()
            yield Request(url, callback=self.parse_sitemap_page)

    def parse_sitemap_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_page_xpath = "/html/body/a/@href"
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            if podcast_page_url.startswith('/'):
                podcast_page_url = self._baseUrl + podcast_page_url
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_url_xpath = "//table[@class='entry']//tr[1]/td/a[1]/@href"
        podcast_link = hxs.select(podcast_url_xpath).extract()
        try:
            item = PodsearchbotItem()
            # The feed URL is expected to be the second matched href;
            # pages without a second match are skipped.
            item['link'] = podcast_link[1]
        except IndexError:
            return
        yield item
    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL."""
        if not self._rc.check_remote_resource(resource_type, resource_url):
            return
        resource = Resource(resource_url, resource_type)
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
            % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.info(msg)
        self._rh.ensurePathExists(base_path)

        # Queue the resource unless its target path is already queued or has
        # been downloaded before.
        args = [resource_type, resource_url, resource_target]
        duplicate_found = False
        for dedup_args in ResourceDownloader._resources:
            if dedup_args[2] == args[2]:
                duplicate_found = True
                break
        if not duplicate_found:
            for dedup_args in ResourceDownloader._downloadedResources:
                if dedup_args[2] == args[2]:
                    duplicate_found = True
                    break
        if not duplicate_found:
            ResourceDownloader._resources.append(args)

        # Flush the queue only once it holds more than 1000 resources or more
        # than a minute has passed since the last flush.
        time_since_last_download = time.time() - self.last_download_timestamp
        if len(ResourceDownloader._resources) <= 1000 and time_since_last_download <= 60:
            return  # TODO
        resources_tmp = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = ResourceDownloader._downloadedResources + resources_tmp
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, resources_tmp)
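# Minimal sketch of the threading helper assumed above; the real
# run_parallel_in_threads lives elsewhere in the project, and whether
# _download receives each [type, url, target] triple as a single argument
# (as assumed here) or unpacked is not visible from this file.
import threading


def run_parallel_in_threads(target, args_list):
    """Runs target once per entry of args_list, one thread each, and waits."""
    threads = [threading.Thread(target=target, args=(args,))
               for args in args_list]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()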
class Bitlove_org(CrawlSpider):
    start_urls = ["http://bitlove.org/directory.opml"]  # public for scrapy
    _pt = PathTool()
    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    def parse(self, response):
        # The directory is a single OPML file; listparser extracts the feeds.
        d = listparser.parse(response.body)
        for feed in d.feeds:
            item = PodsearchbotItem()
            item['link'] = feed.url
            yield item
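# Illustrative only: the shape of the listparser result that Bitlove_org.parse
# relies on. Each entry of d.feeds exposes the feed URL as .url, so a minimal
# OPML document parses as sketched below (the example URL is made up).
import listparser

opml = """<?xml version="1.0"?>
<opml version="2.0">
  <body>
    <outline type="rss" text="Example" xmlUrl="http://example.org/feed.xml"/>
  </body>
</opml>"""

d = listparser.parse(opml)
assert d.feeds[0].url == 'http://example.org/feed.xml'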
class Podfeed_net(CrawlSpider):
    allowed_domains = ['podfeed.net']  # public for scrapy
    start_urls = ['http://www.podfeed.net/site_map.asp']  # public for scrapy
    _pt = PathTool()
    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy
    links = []

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sitemap_page_xpath = "/html/body/div[@class='container']/div[@id='column']/a/@href"
        sitemap_page_urls = hxs.select(sitemap_page_xpath).extract()
        for sitemap_page_url in sitemap_page_urls:
            resource = Resource(self._baseUrl + sitemap_page_url, "directory")
            url = resource.get_absolute_url()
            yield Request(url, callback=self.parse_sitemap_page)

    def parse_sitemap_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_page_xpath = "/html/body/div[@class='container']/div[@id='column']/a/@href"
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_url_xpath = ("/html/body/div[@class='container']/div[@id='column']"
                             "/div[@id='podcast']/div[@id='podcast_details']"
                             "/div[@class='konafilter']"
                             "/div[@class='pf_box_header right nomobile']"
                             "/ul[@class='chicklets nomobile']/li[3]/a/@href")
        podcast_link = hxs.select(podcast_url_xpath).extract()
        if not podcast_link:
            return
        if podcast_link[0] == "#":  # placeholder link, no feed available
            return
        item = PodsearchbotItem()
        item['link'] = podcast_link[0]
        yield item
class Digitalpodcast_com(CrawlSpider):
    start_urls = ["http://api.digitalpodcast.com/opml/digitalpodcast.opml"]  # public for scrapy
    _pt = PathTool()
    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # podcast_urls_xpath = "/opml/body/outline/outline/@url"
        podcast_urls_xpath = "//outline/outline/@url"
        links = hxs.select(podcast_urls_xpath).extract()
        for link in links:
            if link.startswith('/'):
                link = self._baseUrl + link
            item = PodsearchbotItem()
            item['link'] = link
            yield item
class Podcast_at(CrawlSpider):
    start_urls = ["http://www.podcast.at/podcasts.html"]  # public for scrapy
    _pt = PathTool()
    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # Follow the pagination link if there is one; the last page has no
        # "next" link but still lists podcasts, so do not return early.
        next_page_xpath = ("/html/body/div[@class='container_20']"
                           "/div[@class='container_20']/div[@id='middle']"
                           "/div[@id='podcasts_home']"
                           "/div[@class='browseteaser_full']/div[@class='inner']"
                           "/div[@class='page_select']"
                           "/a[@class='podcast_browse'][5]/@href")
        next_page_urls = hxs.select(next_page_xpath).extract()
        if next_page_urls:
            next_page_url = next_page_urls[0]
            if next_page_url.startswith('/'):
                next_page_url = self._baseUrl + next_page_url
            yield Request(next_page_url, callback=self.parse)

        podcast_page_xpath = ("/html/body/div[@class='container_20']"
                              "/div[@class='container_20']/div[@id='middle']"
                              "/div[@id='podcasts_home']"
                              "/div[@class='browseteaser_full']"
                              "/div[@class='inner']"
                              "/div[@class='podcast_listing_box']/div"
                              "/div[@class='podcast_listing_content']/a/@href")
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            if podcast_page_url.startswith('/'):
                podcast_page_url = self._baseUrl + podcast_page_url
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = PodsearchbotItem()
        podcast_url_xpath = ("/html/body/div[@class='container_20']"
                             "/div[@id='teasertitle']/div[@class='teasertitle']"
                             "/a/@href")
        link = hxs.select(podcast_url_xpath).extract()[0]
        if link.startswith('/'):
            link = self._baseUrl + link
        if link.startswith(self._baseUrl + '/podcast_url'):
            try:
                link = self.getContentLocation(link)
            except KeyError:
                # broken link; keep the unresolved URL
                pass
        item['link'] = link
        yield item

    def getContentLocation(self, link):
        """Follows redirects and returns the final content location of a link."""
        try:
            cacheDir = "../../../cache/httplib2/crawler"
            timeoutSecs = 5
            h = httplib2.Http(cacheDir, timeoutSecs,
                              disable_ssl_certificate_validation=True)
            h.follow_all_redirects = True
            resp = h.request(link, "GET")[0]
            contentLocation = resp['content-location']
        except (TypeError, socket.error, socket.timeout,
                httplib.BadStatusLine, httplib2.RelativeURIError,
                httplib2.ServerNotFoundError):
            return link
        return contentLocation
class Podster_de(CrawlSpider):
    start_urls = ["http://podster.de/tag/system:all"]  # public for scrapy
    _pt = PathTool()
    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # Follow the pagination link if there is one; the last page has no
        # "next" link but still lists podcasts, so do not return early.
        next_page_xpath = "//tr/td[3]/a/@href"
        next_page_urls = hxs.select(next_page_xpath).extract()
        if next_page_urls:
            yield Request(next_page_urls[0], callback=self.parse)

        podcast_page_xpath = "//table[@class='podcasts']//tr[2]/td[1]/a/@href"
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = PodsearchbotItem()
        # The feed link shows up in different positions depending on the page
        # layout, so several xpaths are tried; a later match overwrites an
        # earlier one. Community map and internal view links are not feeds.
        feed_link_xpaths = [
            "//div[@id='content']//a[5]/@href",
            "//div[@id='content']//a[4]/@href",
            "//div[@id='content']//div[@class='boxcontent']/a[2]/@href",
        ]
        for feed_link_xpath in feed_link_xpaths:
            try:
                link = hxs.select(feed_link_xpath).extract()[0]
            except IndexError:
                continue
            if not link.startswith('/community/map;show=') and \
                    not link.startswith('http://podster.de/view/'):
                item['link'] = link
        try:
            link = item['link']
        except KeyError:
            print('PodsterDe: WARNING: The page %s did not contain a link to a feed.'
                  % response.url)
            return
        yield item
import xml.parsers.expat

from mysolr import Solr

from Resource.ResourceHelper import ResourceHelper
from Resource.Resource import Resource
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

solrBase = "http://localhost:8983/solr/"
updateUrl = solrBase + 'update/'
solr = Solr(solrBase)

_pt = PathTool()
_rh = ResourceHelper()

feeds = _rh.getAllFeedPaths()
for feed in feeds:
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict is not None and feedDict != {}:
            feedDict['id'] = Resource(feed, 'feed').get_id()
            print(feedDict['id'])
            print("Indexing", feedDict)
            solr.update([feedDict], 'json', commit=True)
            print('Indexed.')
    except (xml.parsers.expat.ExpatError, ValueError):
        print("Failed:", feed)
print("done")
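# Illustrative only: the kind of document dict that solr.update() posts. The
# actual fields depend on FeedDictFactory and the Solr schema; apart from 'id'
# (set explicitly above), every field shown here is a hypothetical example.
example_doc = {
    'id': 'some-feed-id',        # as produced by Resource(feed, 'feed').get_id()
    'title': 'Example Podcast',  # hypothetical field from the parsed feed
}
# mysolr takes a list of such dicts and can commit in the same call:
# solr.update([example_doc], 'json', commit=True)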