Example #1
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     sitemap_page_xpath = "/html/body/div[@class='container']/div[@id='column']/a/@href"
     sitemap_page_urls = hxs.select(sitemap_page_xpath).extract()
     for sitemap_page_url in sitemap_page_urls:
         resource = Resource(self._baseUrl + sitemap_page_url, "directory")
         url = resource.get_absolute_url()
         yield Request(url, callback=self.parse_sitemap_page)
Example #2
class Fluctu8_com(CrawlSpider):
    allowed_domains = ['fluctu8.com']                       # public for scrapy
    start_urls = [                                          # public for scrapy
                  'http://www.fluctu8.com/sitemap/index-map-0.html',
                  'http://www.fluctu8.com/sitemap/index-map-1.html',
                  'http://www.fluctu8.com/sitemap/index-map-2.html',
                  'http://www.fluctu8.com/sitemap/index-map-3.html',
                  'http://www.fluctu8.com/sitemap/index-map-4.html',
                  'http://www.fluctu8.com/sitemap/index-map-5.html',
                  'http://www.fluctu8.com/sitemap/index-map-6.html',
                  'http://www.fluctu8.com/sitemap/index-map-7.html',
                  'http://www.fluctu8.com/sitemap/index-map-8.html',
                  'http://www.fluctu8.com/sitemap/index-map-9.html']
    
    _pt = PathTool()

    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()                             # public for scrapy
    feed_list_path = '../' + _url.get_path()                 # public for scrapy

    links = []

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sitemap_page_xpath = "/html/body/a/@href"
        sitemap_page_urls = hxs.select(sitemap_page_xpath).extract()
        for sitemap_page_url in sitemap_page_urls:
            resource = Resource(self._baseUrl + sitemap_page_url, "directory")
            url = resource.get_absolute_url()
            yield Request(url, callback=self.parse_sitemap_page)

    def parse_sitemap_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_page_xpath = "/html/body/a/@href"
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            if podcast_page_url.startswith('/'):
                podcast_page_url = self._baseUrl + podcast_page_url
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_url_xpath = "//table[@class='entry']//tr[1]/td/a[1]/@href"
        podcast_link = hxs.select(podcast_url_xpath).extract()
        try:
            item = PodsearchbotItem()
            item['link'] = podcast_link[1]
        except exceptions.IndexError:
            return
        yield item
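
These spiders all fill a PodsearchbotItem that carries a single 'link' field with the feed URL. The item class itself is not part of this listing; a minimal sketch of what it presumably looks like, using Scrapy's standard Item/Field declaration:

from scrapy.item import Item, Field

class PodsearchbotItem(Item):
    # the only field these examples populate: the URL of a podcast feed
    link = Field()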
Example #3
    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL."""

        if not self._rc.check_remote_resource(resource_type, resource_url):
            return

        resource = Resource(resource_url, resource_type)
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
              % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.info(msg)

        self._rh.ensurePathExists(base_path)

        args = [resource_type, resource_url, resource_target]

        # queue the resource only if its target path is neither pending nor already downloaded
        already_known = ResourceDownloader._resources + ResourceDownloader._downloadedResources
        duplicate_found = any(dedup_args[2] == args[2] for dedup_args in already_known)
        if not duplicate_found:
            ResourceDownloader._resources.append(args)

        time_since_last_download = time.time() - self.last_download_timestamp
        # flush the batch once more than 1000 resources are queued or more than
        # a minute has passed since the last download run
        if len(ResourceDownloader._resources) <= 1000 and time_since_last_download <= 60:  # TODO
            return

        resources_tmp = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = ResourceDownloader._downloadedResources + resources_tmp
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, resources_tmp)
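
The accumulated batch is handed to self._tdr.run_parallel_in_threads(_download, resources_tmp); neither that helper nor _download appears in this listing. A minimal sketch of such a helper, assuming it simply calls the worker once per argument list and waits for every call to finish (the name is taken from the call above, the signature and implementation are guesses):

from concurrent.futures import ThreadPoolExecutor

def run_parallel_in_threads(target, args_list, max_workers=32):
    # hypothetical sketch: run target(*args) for every entry of args_list on a
    # thread pool and block until all calls have returned
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(target, *args) for args in args_list]
        for future in futures:
            future.result()  # re-raise any worker exception in the caller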
Example #4
class Bitlove_org(CrawlSpider):

    start_urls = ["http://bitlove.org/directory.opml"]  # public for scrapy

    _pt = PathTool()

    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    def parse(self, response):
        d = listparser.parse(response.body)
        feeds = d.feeds
        for feed in feeds:
            item = PodsearchbotItem()
            item['link'] = feed.url
            yield item
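
listparser.parse() takes the raw OPML document (here the response body) and exposes the feeds it finds as result.feeds, each entry carrying the feed URL. A small standalone usage sketch with a made-up OPML snippet:

import listparser

opml = '''<?xml version="1.0"?>
<opml version="2.0">
  <body>
    <outline text="Example Podcast" type="rss"
             xmlUrl="http://example.com/feed.xml"/>
  </body>
</opml>'''

result = listparser.parse(opml)
for feed in result.feeds:
    print(feed.url)  # -> http://example.com/feed.xml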
Example #5
class Podfeed_net(CrawlSpider):
    allowed_domains = ['podfeed.net']  # public for scrapy
    start_urls = ['http://www.podfeed.net/site_map.asp']  # public for scrapy

    _pt = PathTool()

    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    links = []

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sitemap_page_xpath = "/html/body/div[@class='container']/div[@id='column']/a/@href"
        sitemap_page_urls = hxs.select(sitemap_page_xpath).extract()
        for sitemap_page_url in sitemap_page_urls:
            resource = Resource(self._baseUrl + sitemap_page_url, "directory")
            url = resource.get_absolute_url()
            yield Request(url, callback=self.parse_sitemap_page)

    def parse_sitemap_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_page_xpath = "/html/body/div[@class='container']/div[@id='column']/a/@href"
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        podcast_url_xpath = "/html/body/div[@class='container']/div[@id='column']/div[@id='podcast']/div[@id='podcast_details']/div[@class='konafilter']/div[@class='pf_box_header right nomobile']/ul[@class='chicklets nomobile']/li[3]/a/@href"

        podcast_link = hxs.select(podcast_url_xpath).extract()
        if not podcast_link:
            return
        if podcast_link[0] == "#":
            return
        item = PodsearchbotItem()
        item['link'] = podcast_link[0]
        yield item
Example #6
class Digitalpodcast_com(CrawlSpider):

    start_urls = ["http://api.digitalpodcast.com/opml/digitalpodcast.opml"]    # public for scrapy
    
    _pt = PathTool()

    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()                             # public for scrapy
    feed_list_path = '../' + _url.get_path()                 # public for scrapy

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #podcast_urls_xpath = "/opml/body/outline/outline/@url"
        podcast_urls_xpath = "//outline/outline/@url"
        links = hxs.select(podcast_urls_xpath).extract()
        for link in links:
            if link.startswith('/'):
                link = self._baseUrl + link
            item = PodsearchbotItem()
            item['link'] = link
            yield item
Example #7
class Podcast_at(CrawlSpider):

    start_urls = ["http://www.podcast.at/podcasts.html"]  # public for scrapy

    _pt = PathTool()

    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        next_page_xpath = "/html/body/div[@class='container_20']/div[@class='container_20']/div[@id='middle']/div[@id='podcasts_home']/div[@class='browseteaser_full']/div[@class='inner']/div[@class='page_select']/a[@class='podcast_browse'][5]/@href"
        next_page_urls = hxs.select(next_page_xpath).extract()
        if not next_page_urls: return
        next_page_url = next_page_urls[0]
        if next_page_url.startswith('/'):
            next_page_url = self._baseUrl + next_page_url
        yield Request(next_page_url, callback=self.parse)

        podcast_page_xpath = "/html/body/div[@class='container_20']/div[@class='container_20']/div[@id='middle']/div[@id='podcasts_home']/div[@class='browseteaser_full']/div[@class='inner']/div[@class='podcast_listing_box']/div/div[@class='podcast_listing_content']/a/@href"
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            if podcast_page_url.startswith('/'):
                podcast_page_url = self._baseUrl + podcast_page_url
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = PodsearchbotItem()

        podcast_url_xpath = "/html/body/div[@class='container_20']/div[@id='teasertitle']/div[@class='teasertitle']/a/@href"
        link = hxs.select(podcast_url_xpath).extract()[0]
        if link.startswith('/'):
            link = self._baseUrl + link
        if link.startswith(self._baseUrl + '/podcast_url'):
            try:
                link = self.getContentLocation(link)
            except exceptions.KeyError:
                # broken link
                pass  # return
        item['link'] = link
        yield item

    def getContentLocation(self, link):
        try:
            cacheDir = "../../../cache/httplib2/crawler"
            timeoutSecs = 5
            h = httplib2.Http(cacheDir,
                              timeoutSecs,
                              disable_ssl_certificate_validation=True)
            h.follow_all_redirects = True
            resp = h.request(link, "GET")[0]
            contentLocation = resp['content-location']
        except (exceptions.TypeError, socket.error, socket.timeout,
                httplib.BadStatusLine, httplib2.RelativeURIError,
                httplib2.ServerNotFoundError):
            return link
        return contentLocation
Example #8
class Podster_de(CrawlSpider):

    start_urls = ["http://podster.de/tag/system:all"]  # public for scrapy

    _pt = PathTool()

    _url = Resource(start_urls[0], "directory")
    _baseUrl = _url.get_base_url()
    name = _url.get_spider_name()  # public for scrapy
    feed_list_path = '../' + _url.get_path()  # public for scrapy

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        next_page_xpath = "//tr/td[3]/a/@href"
        next_page_urls = hxs.select(next_page_xpath).extract()
        if not next_page_urls: return
        next_page_url = next_page_urls[0]
        yield Request(next_page_url, callback=self.parse)

        podcast_page_xpath = "//table[@class='podcasts']//tr[2]/td[1]/a/@href"
        podcast_page_urls = hxs.select(podcast_page_xpath).extract()
        for podcast_page_url in podcast_page_urls:
            yield Request(podcast_page_url, callback=self.parse_podcast_page)

    def parse_podcast_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = PodsearchbotItem()

        try:
            podcast_url_xpath = "//div[@id='content']//a[5]/@href"
            link = hxs.select(podcast_url_xpath).extract()[0]
            if not link.startswith('/community/map;show=') and \
               not link.startswith('http://podster.de/view/'):
                item['link'] = link
        except IndexError:
            pass
        try:
            podcast_url_xpath = "//div[@id='content']//a[4]/@href"
            link = hxs.select(podcast_url_xpath).extract()[0]
            if not link.startswith('/community/map;show=') and \
               not link.startswith('http://podster.de/view/'):
                item['link'] = link
        except IndexError:
            pass
        try:
            podcast_url_xpath = "//div[@id='content']//div[@class='boxcontent']/a[2]/@href"
            link = hxs.select(podcast_url_xpath).extract()[0]
            if not link.startswith('/community/map;show=') and \
               not link.startswith('http://podster.de/view/'):
                item['link'] = link
        except IndexError:
            pass
        try:
            link = item['link']
        except KeyError:
            print('PodsterDe: WARNING: The page %s did not contain a link to a feed.'
                  % response.url)
            return
        yield item
Example #9
import xml.parsers.expat  # needed for the ExpatError handled below

from mysolr import Solr

from Resource.ResourceHelper import ResourceHelper
from Resource.Resource import Resource
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

solrBase = "http://localhost:8983/solr/"
updateUrl = solrBase + 'update/'

solr = Solr(solrBase)

_pt = PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:   
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict is not None and feedDict != {}:
            feedDict['id'] = Resource(feed, 'feed').get_id()
            print(feedDict['id'])
            print("Indexing", feedDict)
            
            solr.update([feedDict], 'json', commit=True)
            print('Indexed.')
    except (xml.parsers.expat.ExpatError, ValueError):
        print(("Failed:", feed))

print("done")