Example #1
class KetoSizeMe(spiders.CrawlSpider):
    name = 'keto-size-me'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketosizeme.com']
    start_urls = ['https://ketosizeme.com/category/ketogenic-diet-recipes/']

    rules = [

        # Extract links for finding additional pages within recipe index,
        # e.g. https://ketosizeme.com/category/ketogenic-diet-recipes/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=
                r'https://ketosizeme.com/category/ketogenic-diet-recipes/page/\d+/'
            )),

        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://ketosizeme.com/.+/$', restrict_xpaths='//main'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #2
class RuledMeSpider(spiders.CrawlSpider):
    name = 'ruled-me'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ruled.me']
    start_urls = ['https://www.ruled.me/keto-recipes/']

    rules = [
        # Extract links for food category pages,
        # e.g. https://www.ruled.me/keto-recipes/breakfast/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ruled.me/keto-recipes/\w+(\-\w+)*/$',
                restrict_xpaths='//div[@class="r-list"]')),

        # Extract links for finding additional pages within food category pages,
        # e.g. https://www.ruled.me/keto-recipes/dinner/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(allow=(
                r'https://www.ruled.me/keto-recipes/\w+(\-\w+)*/page/\d+/'))),

        # Extract links for the actual recipes,
        # e.g. https://www.ruled.me/easy-keto-cordon-bleu/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://www.ruled.me/\w+(\-\w+)*/$',
            restrict_xpaths='//div[@id="content"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #3
class QueenBs(spiders.CrawlSpider):
    name = 'queen-bs'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['queenbsincredibleedibles.com']
    start_urls = ['http://queenbsincredibleedibles.com/category/keto/page/1/']

    rules = [
        # Extract links for finding additional keto recipe pages,
        # e.g. http://queenbsincredibleedibles.com/category/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=
                r'http://queenbsincredibleedibles.com/category/keto/page/\d+/')
        ),

        # Extract links for recipes,
        # e.g. http://queenbsincredibleedibles.com/creamy-coconut-kale-sausage-soup/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'http://queenbsincredibleedibles.com/.*/$',
            deny=r'(category\/)|(ive-fallen-in-love-with-keto)'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #4
class KetogasmSpider(spiders.CrawlSpider):
    name = 'ketogasm'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketogasm.com']
    _url_format = ('https://ketogasm.com/recipe-index/?'
                   'fwp_recipes_filters=recipe&'
                   'fwp_paged=%d')
    start_urls = [
        (_url_format % 1),
        (_url_format % 2),
        (_url_format % 3),
        (_url_format % 4),
    ]

    rules = [
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://ketogasm.com/.*/$',
            restrict_xpaths='//div[@id="recipes-grid"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #5
    def __init__(self, *args, **kwargs):
        self.rules = (
            spiders.Rule(SameBaseDomainLinkExtractor(allowed_domains=self.allowed_domains), callback=self._parse_contents, follow=True),
        )
        logging.getLogger('scrapy.core.engine').setLevel(logging.INFO)
        logging.getLogger('scrapy.downloadermiddlewares.redirect').setLevel(logging.INFO)
        logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(logging.INFO)

        # We must set up self.rules before calling super, since super calls _compile_rules().
        super(AllStudiosScraper, self).__init__(*args, **kwargs)
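
For reference, a minimal, self-contained sketch of the same pattern (the spider name, domain, and callback below are illustrative assumptions, not taken from the original project): instance-level rules must be assigned before calling CrawlSpider's constructor, because that constructor compiles self.rules.

from scrapy import linkextractors, spiders


class DynamicRulesSpider(spiders.CrawlSpider):
    name = 'dynamic-rules-example'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/']

    def __init__(self, *args, **kwargs):
        # Rules are built per instance here rather than as a class attribute.
        self.rules = (
            spiders.Rule(
                linkextractors.LinkExtractor(
                    allow_domains=self.allowed_domains),
                callback='parse_page',
                follow=True),
        )
        # super() must come last: CrawlSpider.__init__ calls
        # self._compile_rules(), which expects self.rules to exist.
        super(DynamicRulesSpider, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        self.logger.info('Crawled %s', response.url)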
Example #6
class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'
    old_releases = tuple([
        "/%s" % old_release for old_release in [
            'austin', 'bexar', 'cactus', 'diablo', 'essex', 'folsom',
            'grizzly', 'havana', 'icehouse', 'juno', 'kilo', 'liberty',
            'mitaka'
        ]
    ])

    rules = [
        spiders.Rule(LinkExtractor(
            allow=[
                r'.*\.html',
                r'.*\.pdf',
                r'.*\.xml',
                r'.*\.txt',
                r'.*/',
            ],
            deny=[r'/trunk/', r'/draft/', r'/api/', r'/juno/', r'/icehouse/']),
                     follow=True,
                     callback='parse_item')
    ]

    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)

    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        if path.startswith(self.old_releases):
            # weekly changefrequency and lower priority for old files
            item['priority'] = '0.5'
            item['changefreq'] = 'weekly'
        else:
            # daily changefrequency and highest priority for current files
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
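
One hedged caveat: under Python 3, Scrapy header values are bytes, so the strptime call above may need the timestamp decoded first. A minimal sketch of that adjustment, assuming an ASCII header value:

        raw = response.headers.get('Last-Modified', response.headers['Date'])
        timestamp = raw.decode('ascii') if isinstance(raw, bytes) else raw
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")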
Example #7
class Ketovale(spiders.CrawlSpider):
    name = 'ketovale'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketovale.com']
    start_urls = ['https://www.ketovale.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.ketovale.com/category/recipes/page/3/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ketovale.com/category/recipes/page/\d+/')),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://www.ketovale.com/recipe/.*/$',
            restrict_xpaths='//h2[@class="entry-title"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #8
class SugarFreeMom(spiders.CrawlSpider):
    name = 'sugar-free-mom'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['sugarfreemom.com']
    start_urls = ['https://www.sugarfreemom.com/recipes/category/diet/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.sugarfreemom.com/recipes/category/diet/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(allow=(
                r'sugarfreemom.com/recipes/category/diet/keto/page/\d+/'))),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'sugarfreemom.com/recipes/[^\/]+/$',
            restrict_xpaths='//main'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #9
class GreekGoesKetoSpider(spiders.CrawlSpider):
    name = 'greek-goes-keto'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['greekgoesketo.com']
    start_urls = ['https://www.greekgoesketo.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.greekgoesketo.com/category/recipes/page/1/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=
                r'https://(.+\.)greekgoesketo.com/category/recipes/page/\d+/')
        ),
        # Extract links for recipes,
        spiders.Rule(linkextractors.LinkExtractor(restrict_css='main article'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #10
class HeyKetoMamaSpider(spiders.CrawlSpider):
    name = 'hey-keto-mama'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['heyketomama.com']
    start_urls = ['https://www.heyketomama.com/category/recipes/page/1/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.heyketomama.com/category/recipes/page/6/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.heyketomama.com/category/recipes/page/\d+/'
            )),
        # Extract links for recipes,
        # e.g. https://www.heyketomama.com/ten-minute-keto-nachos/
        spiders.Rule(linkextractors.LinkExtractor(
            restrict_xpaths='//div[@class="entry-content"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #11
class WholesomeYum(spiders.CrawlSpider):
    name = 'wholesome-yum'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['wholesomeyum.com']
    start_urls = ['https://www.wholesomeyum.com/tag/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.wholesomeyum.com/tag/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'wholesomeyum.com/tag/keto/page/\d+/')),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(allow=[
            r'wholesomeyum.com/[^\/]+/$', r'wholesomeyum.com/recipes/[^\/]+/$'
        ],
                                                  restrict_xpaths='//main'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #12
class LowCarbYum(spiders.CrawlSpider):
    name = 'low-carb-yum'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['lowcarbyum.com']
    start_urls = ['https://lowcarbyum.com/recipes/']

    rules = [
        # Extract links for food category pages,
        # e.g. https://lowcarbyum.com/category/desserts/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://lowcarbyum.com/category/',
                deny=r'https://lowcarbyum.com/category/((reviews)|(articles))')
        ),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://lowcarbyum.com/.+/$',
            restrict_xpaths='//header[@class="entry-header"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #13
class GreekGoesKetoSpider(spiders.CrawlSpider):
    name = 'greek-goes-keto'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['greekgoesketo.com']
    start_urls = ['https://greekgoesketo.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://greekgoesketo.com/category/recipes/page/1/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://greekgoesketo.com/category/recipes/page/\d+/')
        ),
        # Extract links for recipes,
        # e.g. https://greekgoesketo.com/<year>/<month>/<day>/<recipe-name>/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://greekgoesketo.com/\d{4}/\d{2}/\d{2}/.+/',
            restrict_xpaths='//div[@class="content-block"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #14
class YourFriendsJ(spiders.CrawlSpider):
    name = 'your-friends-j'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['yourfriendsj.com']
    start_urls = ['http://yourfriendsj.com/recipe-library/']

    rules = [

        # Extract links for finding additional recipe pages,
        # e.g. http://yourfriendsj.com/recipe-library/?paged=2
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'yourfriendsj.com/recipe-library/\?paged=\d+')),
        # Extract links for recipes,
        # e.g. http://yourfriendsj.com/recipes/easy-guacamole-recipe/
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'http://yourfriendsj.com/recipes/[^\/]*/$',
            restrict_xpaths='//article'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #15
class SkinnyTaste(spiders.CrawlSpider):
    name = 'skinny-taste'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['skinnytaste.com']
    start_urls = ['https://www.skinnytaste.com/recipes/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.skinnytaste.com/recipes/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'skinnytaste.com/recipes/keto/page/\d+/')),
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            allow=[
                r'skinnytaste.com/[^\/]+/$',
            ],
            restrict_xpaths='//div[@class="archives"]'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #16
class StoreSpider(ss.CrawlSpider):
    name = "store"
    start_urls = [
        'https://www.microsoft.com/en-in/store/top-free/apps/pc',
        'https://www.microsoft.com/en-in/store/top-free/games/mobile',
        'https://www.microsoft.com/en-in/store/top-free/games/pc',
        'https://www.microsoft.com/en-in/store/top-free/games/xbox',
        'https://www.microsoft.com/en-in/store/top-free/apps/mobile'
    ]
    rules = (ss.Rule(LinkExtractor(
        allow=(),
        deny=(".*-1"),
        restrict_xpaths=("//a[contains(@aria-label,'next page')]")),
                     callback='parse_item',
                     follow=True), )
    custom_settings = {
        'ITEM_PIPELINES': {
            'appstore.pipelines.AppstorePipeline': 300
        }
    }

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        #print(response.url)
        selected = Selector(response=response).xpath(
            '//div[contains(@class, "c-group f-wrap-items context-list-page")]'
        )
        sections = selected.xpath(
            "//section[contains(@class,'m-product-placement-item f-size-medium context-app')]"
        )
        # print(len(sections))
        for section in sections:
            soup = BeautifulSoup(section.extract(), 'html.parser')
            try:
                item = AppstoreItem()
                item['name'] = soup.h3.text
                item['rating'] = soup.find('span', {
                    'itemprop': 'ratingValue'
                }).text
                item['url'] = urllib.parse.urljoin(response.url,
                                                   soup.find('a')['href'])
                yield item
            except (AttributeError, TypeError):
                # Skip sections missing an expected name, rating, or link.
                pass
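
As a design note, the BeautifulSoup round-trip above isn't strictly required; a hedged sketch of the same per-section extraction using only Scrapy selectors (it reuses the sections, response, AppstoreItem, and urllib names from the method above, and assumes the same markup):

        for section in sections:
            item = AppstoreItem()
            item['name'] = section.xpath('.//h3/text()').extract_first()
            item['rating'] = section.xpath(
                './/span[@itemprop="ratingValue"]/text()').extract_first()
            href = section.xpath('.//a/@href').extract_first()
            # Only emit items where all expected fields were found.
            if item['name'] and item['rating'] and href:
                item['url'] = urllib.parse.urljoin(response.url, href)
                yield item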
Example #17
class KetovangelistKitchen(spiders.CrawlSpider):
    name = 'ketovangelist-kitchen'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketovangelistkitchen.com']
    # Organize start URLs in descending order of category strength (e.g. muffins
    # should be categorized as "snack", not "eggs").
    start_urls = [
        'http://www.ketovangelistkitchen.com/indexes/recipes/appetizers/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/desserts/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/beverages/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/sides/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/snack/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/soup/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/sauces-dressings/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/casseroles/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/fat-bombs/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/dairy-free/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/kid-friendly/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/baked-goods/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/beef/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/chicken-turkey/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/chocolate/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/fish/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/pork/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/vegetables/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/nuts/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/eggs/',
    ]

    rules = [
        # Extract links for recipes.
        spiders.Rule(linkextractors.LinkExtractor(
            restrict_xpaths='//div[@class="entry-content"]'),
                     callback=callback_handler.process_callback,
                     follow=False)
    ]
Example #18
class DietDoctorSpider(spiders.CrawlSpider):
    name = 'diet-doctor'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['dietdoctor.com']

    # TODO(mtlynch): Make this more flexible. It's currently limited to only 40
    # pages, but it should figure out which pages are actually present. I've
    # added Rules for the Previous/Next links, but they don't seem to work.
    _url_prefix = ('https://www.dietdoctor.com/low-carb/recipes'
                   '?s=&st=recipe&lowcarb%5B%5D=keto&sp=')
    start_urls = [_url_prefix + str(i) for i in range(1, 40)]

    rules = [
        # Extract links for recipes,
        # e.g. /recipes/green-onion-no-chile-chicken-enchiladas
        spiders.Rule(linkextractors.LinkExtractor(
            allow=r'https://www.dietdoctor.com/recipes/'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
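
One possible direction for the TODO above, sketched with a guessed selector (the restrict_xpaths value is an assumption, not taken from dietdoctor.com's actual markup): add a follow-only rule for the listing's Previous/Next links so the crawler discovers pagination itself instead of relying on 40 hard-coded start URLs.

from scrapy import linkextractors, spiders

# Hypothetical pagination rule; the XPath below is an assumption.
NEXT_PAGE_RULE = spiders.Rule(
    linkextractors.LinkExtractor(
        restrict_xpaths='//a[@rel="next" or contains(@class, "next")]'),
    follow=True)

Appending such a rule to the spider's rules list would let start_urls shrink to a single listing page.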
Example #19
class KetoConnectSpider(spiders.CrawlSpider):
    name = 'ketoconnect'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketoconnect.net']
    start_urls = [
        'https://www.ketoconnect.net/main-dishes/',
        'https://www.ketoconnect.net/side-dishes/',
        'https://www.ketoconnect.net/breakfasts/',
        'https://www.ketoconnect.net/snacks/',
        'https://www.ketoconnect.net/desserts/',
        'https://www.ketoconnect.net/beverages/'
    ]

    rules = [
        # Extract links for the actual recipes
        # e.g. https://www.ketoconnect.net/recipe/spicy-cilantro-dressing/
        spiders.Rule(linkextractors.LinkExtractor(restrict_xpaths='//article'),
                     callback=callback_handler.process_callback,
                     follow=False),
    ]
Example #20
class CrawlJobSpider(sp.CrawlSpider):
    name = "Crawl_Job"

    allowed_domains = ['jobs.51job.com']
    start_urls = ['https://jobs.51job.com/all/']

    rules = (
        sp.Rule(LinkExtractor(allow=(r'https://jobs.51job.com/all/p\d+'))),
        sp.Rule(LinkExtractor(allow=(r'https://jobs.51job.com/.*/\d+.*',)), callback='parse_Item'),
    )

    def parse_Item(self, response):
        Job_item = CrawljobItem()
        Job_item['url'] = response.url
        Job_item['job_name'] = re.sub(r"\(职位编号.*\)",'',response.css("div.cn h1::attr(title)").extract()[0])

        money = response.css("div.cn strong::text").extract()
        Job_item['salary'] = 0.0
        Job_item['Low_salary'] = 0.0
        Job_item['High_salary'] = 0.0
        Job_item['average_salary'] = 0.0
        factormo = 1.0
        factordate = 1.0

        if money:
            Job_item['salary'] = money
            text = money[0]
            # Unit multiplier: 千 = thousands, 万 = tens of thousands, 元 = yuan.
            if "千" in text:
                factormo = 1000.0
                text = text.replace("千", "")
            elif "万" in text:
                factormo = 10000.0
                text = text.replace("万", "")
            elif "元" in text:
                factormo = 1.0
                text = text.replace("元", "")

            # Period multiplier, normalizing everything to a monthly figure:
            # 月 = per month, 年 = per year, 天 = per day, 小时 = per hour.
            if "月" in text:
                factordate = 1.0
                text = text.replace("月", "")
            elif "年" in text:
                factordate = 1.0 / 12.0
                text = text.replace("年", "")
            elif "天" in text:
                factordate = 31.0
                text = text.replace("天", "")
            elif "小时" in text:
                factordate = 8.0 * 31.0
                text = text.replace("小时", "")

            text = text.replace("/","")
            mo = text.split('-')

            if len(mo) == 2:
                a = float(mo[0]) * factormo * factordate
                b = float(mo[1]) * factormo * factordate
                Job_item['Low_salary'] = a
                Job_item['High_salary'] = b
                Job_item['average_salary'] = (a + b) / 2.0
            elif len(mo) == 1:
                a = float(mo[0]) * factormo * factordate
                Job_item['Low_salary'] = a
                Job_item['High_salary'] = a
                Job_item['average_salary'] = a

        info = response.css("div.cn p[class='msg ltype']::text").extract()
        numlist = len(info)

        company_address = info[0].replace('\n','').replace('\r','').replace('\t','').replace('\xa0','')
        Job_item['company_address'] = company_address.split("-")[0]

        Job_item['work_experience'] = info[1].replace('\xa0', '')

        Job_item['work_language'] = ''

        if numlist == 5:
            Job_item['education'] = info[2].replace('\xa0', '')
            Job_item['need_numbers'] = info[3].replace('\xa0', '')
            Job_item['release_time'] = info[4].replace('\xa0', '').replace('\t', '').replace("发布", '')
            Job_item['work_language'] = '普通话精通'
        elif numlist == 6 or numlist == 7:
            Job_item['education'] = info[2].replace('\xa0', '')
            Job_item['need_numbers'] = info[3].replace('\xa0', '')
            Job_item['release_time'] = info[4].replace('\xa0', '').replace('\t', '').replace("发布", '')
            Job_item['work_language'] = info[5].replace('\xa0', '').replace('\t', '')
        elif numlist == 4:
            Job_item['education'] = 'None'
            Job_item['need_numbers'] = info[2].replace('\xa0', '')
            Job_item['release_time'] = info[3].replace('\xa0', '').replace('\t','').replace("发布", '')
            Job_item['work_language'] = '普通话精通'
        else:
            print(numlist)

        company_name = response.css("div.com_msg > a > p::text").extract()
        if company_name is not None:
            Job_item['company_name'] = company_name

        company_type = response.css("div:nth-child(1) > div.com_tag > p:nth-child(1)::text").extract()
        company_size = response.css("div:nth-child(1) > div.com_tag > p:nth-child(2)::text").extract()
        company_business = response.css("div:nth-child(1) > div.com_tag > p:nth-child(3)::attr(title)").extract()
        if company_type:
            Job_item['company_type'] = company_type
        else:
            Job_item['company_type'] = "None"

        if company_size:
            Job_item['company_size'] = company_size
        else:
            Job_item['company_size'] = "None"

        if company_business:
            Job_item['company_business'] = company_business
        else:
            Job_item['company_business'] = "None"

        job_detail = response.css("div.tCompany_main > div:nth-child(1)").\
             xpath("string(div)").extract()[0].replace('\t','').replace('\n','').replace('\r',' ')

        if job_detail:
            Job_item['job_detail'] = job_detail
        else:
            Job_item['job_detail'] = "None"


        # Strip whitespace and the "职能类别:" ("job function:") label.
        job_catacategory = response.css("div.tCompany_main > div:nth-child(1) > div > div.mt10").xpath("string(p)").extract()[0].\
            replace('\t', '').replace('\n', '').replace("职能类别:", "").replace("\r", " ")

        if job_catacategory:
            Job_item['job_catacategory'] = job_catacategory
        else:
            Job_item['job_catacategory'] = "None"

        company_detail = response.css("div.tCompany_main > div:last-child").xpath("string(div)").extract()[0].replace('\t','').replace('\n','')
        if company_detail:
            Job_item['company_detail'] = company_detail
        else:
            Job_item['company_detail'] = "None"

        return Job_item
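
To make the unit and period handling above concrete, here is a small worked example with an illustrative input (the listing text "1-1.5万/月" is hypothetical, chosen to match the site's usual range/unit/period format):

text = "1-1.5万/月"          # hypothetical listing: 10k-15k yuan per month
factormo = 10000.0           # "万" => tens of thousands of yuan
factordate = 1.0             # "月" => already a monthly figure
text = text.replace("万", "").replace("月", "").replace("/", "")   # "1-1.5"
low, high = [float(part) * factormo * factordate for part in text.split("-")]
average = (low + high) / 2.0
print(low, high, average)    # 10000.0 15000.0 12500.0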
Example #21
class CrunchbaseSpider(spiders.CrawlSpider):
    name = "crunchbase"

    # TODO: find out if pages with 416 status code is re-crawled or not!
    # handle_httpstatus_list = [416]

    def start_requests(self):
        urls = []

        with open('urls.txt', 'rb') as urls_file:
            # Change encoding if necessary
            urls = [
                line.strip()
                for line in urls_file.read().decode('utf16').splitlines()
                if line.strip()
            ]

        for url in urls:
            yield self.make_requests_from_url(url)

    rules = (
        # Crawl and parse person
        spiders.Rule(LinkExtractor(allow=r'/person/.*',
                                   deny=r'/person/.*[/\.]'),
                     callback='parse_person',
                     follow=True),
        # Crawl organization
        spiders.Rule(LinkExtractor(allow=r'/organization/.*',
                                   deny=r'/organization/.*[/\.]'),
                     callback='parse_organization',
                     follow=True),
        # Crawl acquisitions table
        spiders.Rule(LinkExtractor(allow=r'/acquisitions$',
                                   deny=r'/app/search',
                                   restrict_css='.acquisitions'),
                     callback='parse_acquisitions'),
        # Crawl employees
        spiders.Rule(LinkExtractor(allow=r'/people$',
                                   deny=r'/app/search',
                                   restrict_css='.people'),
                     callback='parse_employees'),
        # Crawl competitors
        spiders.Rule(LinkExtractor(allow=r'/competitors$',
                                   restrict_css='.competitors'),
                     callback='parse_competitors'),
        # Crawl partners
        spiders.Rule(LinkExtractor(allow=r'/partners$',
                                   restrict_css='.partners'),
                     callback='parse_partners'),
        # Crawl advisors
        spiders.Rule(LinkExtractor(allow=r'/advisors$',
                                   restrict_css='.advisors'),
                     callback='parse_advisors'),
    )

    def parse_start_url(self, response):
        if response.url.find('/person/') >= 0:
            return self.parse_person(response)
        elif response.url.find('/organization/') >= 0:
            return self.parse_organization(response)
        else:
            raise Exception('Start url is neither person nor organization')

    """
    NOTE: there might be field-specific processors under scraper/items.py
    """

    def parse_person(self, response):
        loader = ItemLoader(item=Person(), response=response)
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'primary_role',
            '//*[@id="info-card-overview-content"]/div/dl/div/dd')

        # Fields expected: born, gender, location, website
        overview = response.xpath(
            '//*[@id="info-card-overview-content"]/div/dl/dt/text()')
        overview_loader = loader.nested_xpath(
            '//*[@id="info-card-overview-content"]/div/dl')
        for i in range(len(overview)):
            key = overview[i].extract()
            key = key[:key.find(':')].lower()
            try:
                overview_loader.add_xpath(key, 'dd[{}]/text()'.format(i + 1))
            except KeyError as e:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')
        loader.add_xpath('description', '//*[@id="description"]/span/div')
        loader.add_css('current_jobs', '.current_job')
        loader.add_css('past_jobs', '.past_job')
        loader.nested_css('.advisory_roles').add_xpath('board_advisors',
                                                       './/ul/li')
        loader.nested_css('table.investors').add_xpath(
            'investments', './/tr[not(@class="thead")]')
        loader.nested_css('.education').add_xpath('education', './/ul/li')

        return loader.load_item()

    def parse_organization(self, response):
        loader = ItemLoader(item=Organization(), response=response)
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        # loader.add_value('ipo_stock', None) # TODO!

        # TODO: supposed to get person url for founders!
        # Fields expected: headquarters, description, founders, categories,
        # website, founded (date), and aliases
        keys = response.css('div.definition-list').xpath('dt/text()')
        values = response.css('div.definition-list').xpath('dd')
        for i in range(len(keys)):
            key = keys[i].extract()
            key = key[:key.find(':')].lower()
            try:
                loader.add_value(key, values[i].extract())
            except KeyError as e:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')

        yield loader.load_item()

        for item in self.parse_acquisitions(response):
            yield item
        for item in self.parse_employees(response):
            yield item
        for item in self.parse_competitors(response):
            yield item
        for item in self.parse_partners(response):
            yield item
        for item in self.parse_advisors(response):
            yield item

    def parse_acquisitions(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        acq_selectors = response.css('div.acquisitions').xpath(
            './/tr[not(th)]')

        for sel in acq_selectors:
            loader = ItemLoader(item=Acquisition(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()

            loader.add_value('focal_company_url', company_url)
            loader.add_xpath('date', 'td[1]/text()')
            loader.add_xpath('acquired_url', 'td[2]/a/@href')
            yield loader.load_item()

    def parse_employees(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        employee_selector = response.css('div.people').xpath('.//ul/li')

        for sel in employee_selector:
            loader = ItemLoader(item=Employee(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('company_url', company_url)
            loader.add_xpath('person_url', './/h4/a/@href')
            loader.add_xpath('title', './/h5/text()')
            yield loader.load_item()

    def parse_competitors(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        comp_selectors = response.css('div.competitors').xpath(
            './/ul/li//h4/a')

        for sel in comp_selectors:
            loader = ItemLoader(item=Competitor(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('focal_company_url', company_url)
            loader.add_xpath('competitor_url', '@href')
            yield loader.load_item()

    def parse_partners(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        partner_selectors = response.css('div.partners').xpath(
            './/ul/li//h4/a')

        for sel in partner_selectors:
            loader = ItemLoader(item=Partner(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('focal_company_url', company_url)
            loader.add_xpath('partner_url', '@href')
            yield loader.load_item()

    def parse_advisors(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        employee_selector = response.css('div.advisors').xpath('.//ul/li')

        for sel in employee_selector:
            loader = ItemLoader(item=BoardMember(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('company_url', company_url)
            loader.add_xpath('person_url', './/h4/a/@href')
            loader.add_xpath('title', './/h5/text()')
            yield loader.load_item()
Example #22
class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    MAINT_SERIES = ['newton', 'ocata', 'pike']
    MAINT_RELEASES_PAT = re.compile('^/(' + '|'.join(MAINT_SERIES) + ')/')
    LATEST_PAT = re.compile('^/latest/')

    rules = [
        spiders.Rule(LinkExtractor(allow=[
            r'.*\.html',
            r'.*\.pdf',
            r'.*\.xml',
            r'.*\.txt',
            r'.*/',
        ],
                                   deny=[
                                       r'/trunk/', r'/draft/', r'/austin/',
                                       r'/bexar/', r'/cactus/', r'/diablo/',
                                       r'/essex/', r'/folsom/', r'/grizzly/',
                                       r'/havana/', r'/icehouse/', r'/juno/',
                                       r'/kilo/', r'/liberty/', r'/mitaka/'
                                   ]),
                     follow=True,
                     callback='parse_item')
    ]

    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)

    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path

        if self.MAINT_RELEASES_PAT.match(path):
            # weekly changefrequency and highest prio for maintained release
            item['priority'] = '1.0'
            item['changefreq'] = 'weekly'
        elif self.LATEST_PAT.match(path):
            # daily changefrequency and normal priority for current files
            item['priority'] = '0.5'
            item['changefreq'] = 'daily'
        else:
            # These are unversioned documents
            # daily changefrequency and highest priority for current files
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item