Example #1
class FolhaSpider(CrawlSpider):

    name = FOLHA_SPIDER_NAME
    allowed_domains = [FOLHA_DOMAIN]
    start_urls = FOLHA_START_URLS

    rules = [
        Rule(
            SgmlLinkExtractor(allow=(FOLHA_PODER), ),
            callback='parse_item',
            follow=True,
        ),
        Rule(
            SgmlLinkExtractor(allow=(FOLHA_PODEREPOLITICA), ),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):

        sel = Selector(response)

        article = ArticleItem()

        article['source'] = 'Folha de S.Paulo'

        article['url'] = response.url

        title = sel.xpath(FOLHA_ARTICLE_TITLE).extract()
        article['title'] = title[0] if title else None

        pub_date = sel.xpath(FOLHA_ARTICLE_PUB_DATE).extract()[0]

        article['pub_date'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M")

        content = ' '.join(sel.xpath(FOLHA_ARTICLE_CONTENT).extract())
        article['body'] = content if content else None

        links = sel.xpath('//article//a/@href').extract()
        links = list(set(links))
        try:
            links.remove('javascript:;')
        except Exception:
            pass

        article['links'] = links

        return article
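
The spider above depends on module-level constants and imports that the excerpt does not show. A minimal sketch of what that configuration might look like, assuming old-style Scrapy; every value and XPath below is an illustrative assumption, not the original settings:

# Hypothetical configuration for the Folha spider; all values are assumptions.
from datetime import datetime

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from myproject.items import ArticleItem  # assumed project items module

FOLHA_SPIDER_NAME = 'folha'
FOLHA_DOMAIN = 'folha.uol.com.br'
FOLHA_START_URLS = ['http://www1.folha.uol.com.br/poder/']
FOLHA_PODER = r'/poder/'
FOLHA_PODEREPOLITICA = r'/poderepolitica/'
FOLHA_ARTICLE_TITLE = '//article//h1/text()'           # assumed XPath
FOLHA_ARTICLE_PUB_DATE = '//article//time/@datetime'   # assumed XPath
FOLHA_ARTICLE_CONTENT = '//article//p//text()'         # assumed XPath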
Example #2
    def _compile_rule(self, rule_dict):
        extractor = SgmlLinkExtractor2(allow=rule_dict['allow'], check_url=rule_dict.get('check_url', True))
        rule = Rule(extractor)

        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)
            else:
                return None

        rule.process_links = get_method(rule_dict.get('process_links'))

        #set default link type to leaf
        rule.link_type = rule_dict.get('link_type', '')

        return rule
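
rule_dict here is a plain dict; a minimal call-site sketch, assuming the surrounding spider exposes this method (the keys match the ones the method reads, the values are illustrative only):

        # Hypothetical call site; the pattern and method name are assumptions.
        rule = self._compile_rule({
            'allow': r'/category/view/id/\d+/page/\d+',  # forwarded to the link extractor
            'check_url': False,                          # optional, defaults to True
            'process_links': 'filter_links',             # method name on the spider, or a callable
            'link_type': 'branch',                       # optional; defaults to '' in this variant
        })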
Example #3
class SggonguoSpider(BaseSpider):
    name = "sggongzuo"
    allowed_domains = ["gongzuo.sg"]
    start_urls = ('http://www.gongzuo.sg', )

    rules = (Rule(
        LinkExtractor(allow='/\?page=[0-1]'),
        callback='parse_item',
        follow=True,
    ), )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        return self.parse_item_requests_callback(response,
                                                 '//div[@class="summary"]')

    def populate_job_crawler_item(self, detail_item, job_crawler_item):
        try:
            job_crawler_item.job_title = detail_item.xpath(
                './/div[@class="title"]/a[1]/text()').extract()[0]
            job_crawler_item.job_details_link = detail_item.xpath(
                './/div[@class="title"]/a[1]/@href').extract()[0]
            job_crawler_item.job_country = 'Singapore'
            job_crawler_item.job_location = 'Singapore'
            job_crawler_item.publish_date = re.search(
                r'.*([0-9]{4}-[0-9]{2}-[0-9]{2}).*',
                detail_item.xpath('.//div[@class="attr"]/text()[2]').extract()
                [0], re.M).group(1).strip()
            #Convert to the datetime format
            job_crawler_item.publish_date = datetime.datetime.strptime(
                job_crawler_item.publish_date, '%Y-%m-%d'
            ) if job_crawler_item.publish_date is not None else None
            job_crawler_item.salary = detail_item.xpath(
                './/div[@class="attr"]/text()[4]').extract()[0].replace(
                    ',', '').strip()
            job_crawler_item.source = self.name
            job_crawler_item.crawled_date = datetime.datetime.now()

        except Exception as e:
            print e

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']

        try:
            job_crawler_item.job_desc = response.xpath(
                '/html/head/meta[@name="description"]/@content').extract()[0]
            job_crawler_item.contact = response.xpath(
                '//div[@id="article-body"]/div[@class="attr"]/text()[3]'
            ).extract()[0].replace('\n', '').strip()
        except Exception as e:
            print e

        yield job_crawler_item
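
parse_item_requests_callback comes from the shared BaseSpider and is not part of this excerpt. Judging from how it is called here and in the later job-spider examples, a sketch of such a helper might look like the following; the body is an assumption, not the original base class:

    # Hypothetical base-class helper; signature inferred from the call sites above.
    # Assumes scrapy.http.Request plus the project's JobItem and should_load_details,
    # which do appear in Example #15 below.
    def parse_item_requests_callback(self, response, item_xpath):
        requests = []
        for detail_item in response.xpath(item_xpath):
            job_crawler_item = JobItem()
            self.populate_job_crawler_item(detail_item, job_crawler_item)
            if self.should_load_details(job_crawler_item):
                requests.append(
                    Request(url=job_crawler_item.job_details_link,
                            callback=self.retrieve_job_details,
                            meta={'item': job_crawler_item},
                            dont_filter=True))
        return requests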
Example #4
    def _compile_rule(self, rule_dict):
        extractor = SgmlLinkExtractor2(allow=rule_dict['allow'],
                                       check_url=rule_dict.get(
                                           'check_url', True))
        rule = Rule(extractor)

        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)
            else:
                return None

        rule.process_links = get_method(rule_dict.get('process_links'))

        # set default link type to leaf
        rule.link_type = rule_dict.get('link_type', LinkType.LEAF)

        return rule
Example #5
class SingxinSpider(BaseSpider):
    name = "singxin"
    allowed_domains = ["singxin.com"]
    start_urls = ('http://www.singxin.com/category/view/id/47', )

    rules = (Rule(
        LinkExtractor(allow='/category/view/id/47/page/[0-1]'),
        callback='parse_item',
        follow=True,
    ), )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        return self.parse_item_requests_callback(response,
                                                 '//div[@class="listCell"]')

    def populate_job_crawler_item(self, detail_item, job_crawler_item):
        try:
            job_crawler_item.job_title = detail_item.xpath(
                './/a[@class="title"]/text()').extract()[0]
            job_crawler_item.job_details_link = 'http://www.singxin.com' + detail_item.re(
                r'<a.*href="(/info/view/id/[0-9]+)">.*</a>')[0]
            job_crawler_item.job_country = 'Singapore'
            job_crawler_item.job_location = 'Singapore'
            job_crawler_item.contact = detail_item.re(
                r'<a.*href="tel:(.*)">.*</a>')[0]
            job_crawler_item.source = self.name
            job_crawler_item.crawled_date = datetime.datetime.now()

        except Exception as e:
            print e

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']

        try:
            job_crawler_item.job_desc = response.xpath(
                '/html/head/meta[@name="description"]/@content').extract()[0]

            job_crawler_item.publish_date = response.selector.re(
                '<td><i class="icon-calendar icon-small"></i>(.*)</td>'
            )[0].replace(' ', '')

            #Convert to the datetime format
            job_crawler_item.publish_date = datetime.datetime.strptime(
                job_crawler_item.publish_date, '%Y-%m-%d'
            ) if job_crawler_item.publish_date is not None else None
        except Exception as e:
            print e

        yield job_crawler_item
Example #6
class ShichengBBSSpider(BaseSpider):
    name = "shichengbbs"
    allowed_domains = ["shichengbbs.com"]
    start_urls = ('http://www.shichengbbs.com/category/view/id/47', )

    rules = (Rule(
        LinkExtractor(allow='/category/view/id/47/page/[0-2]'),
        callback='parse_item',
        follow=True,
    ), )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        return self.parse_item_requests_callback(
            response, '//div[@class="listCell row-fluid"]')

    def populate_job_crawler_item(self, detail_item, job_crawler_item):

        try:
            job_crawler_item.job_title = detail_item.xpath(
                './div[1]/a/text()').extract()[0]
            job_crawler_item.job_details_link = 'http://www.shichengbbs.com' + \
                                                   detail_item.re(r'<a.*href="(/info/view/id/[0-9]+)">.*</a>')[0]
            job_crawler_item.publish_date = \
            detail_item.re(r'(.*)<span.*</span> <i class="icon-phone-sign icon-small"></i>')[0].replace('\t', '')
            # Convert to the datetime format
            job_crawler_item.publish_date = self.derieve_date_from_short_date_string(
                job_crawler_item.publish_date
            ) if job_crawler_item.publish_date is not None else None
            job_crawler_item.job_country = 'Singapore'
            job_crawler_item.job_location = 'Singapore'
            job_crawler_item.contact = detail_item.xpath(
                './div[2]/a/text()').extract()[0]
            job_crawler_item.source = self.name
            job_crawler_item.crawled_date = datetime.datetime.now()
        except:
            pass

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']

        try:
            job_crawler_item.job_desc = \
                response.xpath('/html/head/meta[@name="description"]/@content').extract()[0]
        except:
            pass

        yield job_crawler_item
Example #7
class OGLOBOSpider(CrawlSpider):

    name = OGLOBO_SPIDER_NAME
    allowed_domains = [OGLOBO_DOMAIN]
    start_urls = OGLOBO_START_URLS

    rules = [
        Rule(
            SgmlLinkExtractor(allow=(OGLOBO_URL_PATTERN), ),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):

        sel = Selector(response)

        article = ArticleItem()

        article['source'] = 'O Globo'

        article['url'] = response.url

        title = sel.xpath(OGLOBO_ARTICLE_TITLE).extract()
        article['title'] = title[0] if title else None

        pub_date = sel.xpath(OGLOBO_ARTICLE_PUB_DATE).extract()
        pub_date = pub_date[0].replace('T', ' ')

        article['pub_date'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M")

        content = ' '.join(sel.xpath(OGLOBO_ARTICLE_CONTENT).extract())
        article['body'] = content if content else None

        links = sel.xpath('//article//a/@href').extract()
        links = list(set(links))
        try:
            links.remove('javascript:;')
        except Exception:
            pass
        article['links'] = links

        return article
Example #8
class restaurantSpider(CrawlSpider):
    name = "restaurantSpider"
    allowed_domains = ["www.yelp.com",]
    # start_urls = [
    #               "http://www.yelp.com/search?find_desc=chinese+restaurant&find_loc=San+Francisco%2C+CA&ns=1&start=0&sortby=rating&l=g:-122.530517578,37.6859939294,-122.325897217,37.8488325065",
    #              ]
    start_urls = [line.strip() for line in open("yelpCrawler/seeds/restaurant.txt") if line.strip()]
    rules = [
            Rule(SgmlLinkExtractor(allow=(pattern, ), restrict_xpaths=('//ul[@class="pagination-links"]')), follow=True, callback='parse_restaurant'),
#               Rule(SgmlLinkExtractor(allow=(r"/search\?.*start=\d+")),  follow=True, callback='parse_restaurant'),
            ]
    
    def parse_restaurant(self, response):
        items = []
        print(response.url)
        sel = Selector(response)
        
        result_list = sel.css('div.search-results-content ul.ylist.ylist-bordered.search-results div.natural-search-result')
        
        # add this to retry another proxy
        if not result_list:
            log.msg("Retrying with " + response.url, level=log.INFO)
            yield Request(url=response.url, callback=self.parse_restaurant, dont_filter=True)
        else:
            log.msg("Crawled "+response.url, level=log.INFO)
            for element in result_list:
                item = YelpcrawlerItem()
                item['name'] = clear_html_tag(trim_and_join(element.css('h3.search-result-title a.biz-name').extract()))
    #             print(type(element.css('h3.search-result-title a.biz-name').xpath('text()').extract()))
                item['rating'] = trim_and_join(element.css('div.rating-large i.star-img').xpath('@title').extract()).split()[0]
                item['review_count'] = trim_and_join(element.css('span.review-count').xpath('text()').extract()).split()[0]
    #             print(type(element.css('span.review-count').xpath('text()').extract()))
                item['price_range'] = trim_and_join(element.css('div.price-category span.business-attribute.price-range').xpath('text()').extract()).count("$")
                location = element.css('div.secondary-attributes address').xpath('text()').extract()
                item['phone'] = trim_and_join(element.css('div.secondary-attributes span.biz-phone').xpath('text()').extract())
                parse_location(location, item)
    #             print(type(element.css('div.secondary-attributes address').xpath('text()').extract()))
                
    #             items.append(item)
                yield item
Example #9
class TestSpider(DailyDealSpider):
    country = 'us'
    name = 'test_spider'
    main_domain = 'dailysteals.com'
    allowed_domains = [main_domain]
    main_url = 'http://www.dailysteals.com/'

    rules = (
        # Deals
        Rule(SgmlLinkExtractor(
            restrict_xpaths='//h4[contains(@class,"product-title")]',
            process_value=lambda url: ensure_protocol(url)),
             callback='get_item',
             follow=False), )

    has_main_offer = False
    decimal_mark = DECIMAL_MARK_PERIOD

    price_currency_xpath = '//div[@itemprop="price"]//text()'

    extractors = {
        F_ID: Extractor(xpath='//a[contains(@class,"btn-cart")]/@data-itemid'),
        F_OFFER: Extractor(xpath='//h1[@itemprop="name"]//text()'),
        F_DISCOUNT: Extractor(xpath='//dl[@class="discount"]//text()'),
        F_SOLD:
        Extractor(xpath='//div[contains(@class, "coupons-bought")]/text()'),
        F_DESC: Extractor(xpath='//div[@class="merchant-content"]//text()'),
        F_CITY: Extractor(xpath='//div[@class="zone"]//text()'),
        F_M_NAME:
        Extractor(xpath='//div[@class="side-merchant"]/span/b/text()'),
        F_M_WEBSITE: Extractor(xpath='//div[@class="side-merchant"]//a/@href'),
        F_M_ADDRESS: Extractor(xpath='//div[@class="adress-info"]//text()'),
        # F_M_LAT: Extractor(xpath='//div[@class="adress-info"]/img/@src',
        #                    fn=lambda matches, r, s:
        #                    matches[0].split('%7C')[1].split(',')[0]),
        # F_M_LON: Extractor(xpath='//div[@class="adress-info"]/img/@src',
        #                    fn=lambda matches, r, s:
        #                    matches[0].split('%7C')[1].split(',')[1])
    }
Example #10
class SaveMoneyIndia(CrawlSpider):
    name = "savemoney"
    allowed_domains = ["savemoneyindia.com"]
    categories = {
        "http://www.savemoneyindia.com/category/computers-laptops/":
        "Computer & Laptops",
        "http://www.savemoneyindia.com/category/mobiles/":
        "Mobiles",
        "http://www.savemoneyindia.com/category/mobile-dth-data-card-recharge/":
        "Recharge",
        "http://www.savemoneyindia.com/category/clothing-shoes-bags-lifestyle/":
        "Clothing",
        "http://www.savemoneyindia.com/category/footwear/":
        "Footwear",
        "http://www.savemoneyindia.com/category/electronics-gadgets/":
        "Electronics",
    }
    start_urls = categories.keys()

    rules = (Rule(SgmlLinkExtractor(
        allow=(r'www.savemoneyindia.com\/category.*\/page\/\d+\/', )),
                  callback='parse_start_url',
                  follow=True), )

    def parse_start_url(self, response):
        items = []
        for selection in response.xpath("//div[contains(@id, 'post-')]"):
            item = ContentItem()
            item['title'] = selection.xpath(
                "h2[@class='entry-title']/a/text()").extract()
            item['link'] = selection.xpath(
                "div[@class='entry']//a/@href").extract()
            item['desc'] = selection.xpath(
                "div[@class='entry']/p/text()").extract()
            item["category"] = key_lookup(self.categories, response.url)
            items.append(item)

        return items
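
key_lookup is not defined in this excerpt. Since every paginated URL begins with its category URL, a minimal sketch might map the response URL back to its category by prefix; this is an assumption, not the original helper:

# Hypothetical helper: return the category whose URL the response URL starts with.
def key_lookup(categories, url):
    for category_url, category_name in categories.items():
        if url.startswith(category_url):
            return category_name
    return ""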
Example #11
class SaveMoneyIndia(CrawlSpider):
    name = "freekamall"
    allowed_domains = ["freekamall.com"]

    start_urls = ["http://freekaamaal.com/"]

    rules = (Rule(SgmlLinkExtractor(allow=(r'freekaamaal.*\/page\/\d+\/', )),
                  callback='parse_start_url',
                  follow=True), )

    def parse_start_url(self, response):
        items = []
        for selection in response.xpath("//div[@class='contentpagination']"):
            item = ContentItem()
            item['title'] = selection.xpath(
                "div[@class='exp_detail']/a/h2/text()").extract()
            item['link'] = selection.xpath("a[@class='shpnw']/@href").extract()
            item['desc'] = selection.xpath(
                "div[@class='exp_detail']/span/text()").extract()
            item["category"] = ""
            items.append(item)

        return items
Example #12
class HLSpider(CrawlSpider):
    name = "hl"
    allowed_domains = ["hl.co.uk"]
    start_urls = [
        "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results?is150=true"
        #         "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results"
    ]

    rules = [
        Rule(
            SgmlLinkExtractor(allow=[
                '/funds/fund-discounts,-prices--and--factsheets/search-results/'
            ],
                              deny=['charts$', 'invest$', 'tab=']),
            'parse_fund')
    ]

    def parse_fund(self, response):
        x = HtmlXPathSelector(response)

        fund = HlscraperItem()
        fund['Url'] = response.url
        fund['Name'] = x.select(
            "normalize-space(/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/text())"
        ).extract()
        fund['ExdividendDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Ex-dividend date')]]/../td/text())"
        ).extract()
        fund['PaymentDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Payment date')]]/../td/text())"
        ).extract()
        fund['RunningYield'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Running yield')]]/../td/text())"
        ).extract()
        fund['HistoricYield'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Historic yield')]]/../td/text())"
        ).extract()
        fund['IncomePaid'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Income paid')]]/../td/text())"
        ).extract()
        fund['TypeOfPayment'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Type of payment')]]/../td/text())"
        ).extract()
        fund['LaunchDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Launch date')]]/../td/text())"
        ).extract()
        fund['Sector'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Sector')]]/../td/text())"
        ).extract()
        fund['FundSize'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Fund size')]]/../td/text())"
        ).extract()
        fund['NumberOfHoldings'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Number of holdings')]]/../td/text())"
        ).extract()
        fund['TypeOfUnits'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Type of units')]]/../td/text())"
        ).extract()
        fund['FundType'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Fund type')]]/../td/text())"
        ).extract()
        fund['NetInitialCharge'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Net initial charge')]]/../td/text())"
        ).extract()
        fund['NetAnnualCharge'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Net Annual charge')]]/../td/text())"
        ).extract()
        fund['OtherExpenses'] = x.select(
            "normalize-space(//tr/th[text()[contains(., \"Fund manager's other expenses\")]]/../td/text())"
        ).extract()
        fund['PerformanceFee'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Performance fee')]]/../td/text())"
        ).extract()
        fund['PlatformFee'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'HL Platform charge')]]/../td/text())"
        ).extract()

        fund['Wealth150'] = x.select(
            "/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/a/img/@src"
        ).extract()

        return fund
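
The field lookups in parse_fund differ only in the row label, so a label-to-field mapping plus one loop would express the same extraction with far less repetition. A sketch; the labels simply restate the XPaths above, nothing new is assumed beyond that:

        # Sketch: drive the repetitive table extraction from a mapping inside parse_fund.
        FUND_ROW_LABELS = {
            'ExdividendDate': 'Ex-dividend date',
            'PaymentDate': 'Payment date',
            'RunningYield': 'Running yield',
            'HistoricYield': 'Historic yield',
            # ... remaining labels exactly as used in parse_fund above
        }

        for field, label in FUND_ROW_LABELS.items():
            fund[field] = x.select(
                "normalize-space(//tr/th[text()[contains(., '%s')]]/../td/text())" % label
            ).extract()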
Example #13
class WarAlbumSpider(CrawlSpider):
    checker = MongoChecker()
    name = 'war'
    description_xpath = '//*[@id="mcont"]/div/div[2]/div[4]/div[{0}]/div[2]/div[1]/text()'
    description_xpath0 = '//*[@id="mcont"]/div/div[2]/div[3]/div[{0}]/div[2]/div[1]/text()'
    image_xpath = '//*[@id="mcont"]/div/div[2]/div[4]/div[{0}]/div[2]/div[2]/div/a/img/@data-src_big'
    image_xpath0 = '//*[@id="mcont"]/div/div[2]/div[3]/div[{0}]/div[2]/div[2]/div/a/img/@data-src_big'
    post_link_xpath0 = '//*[@id="mcont"]/div/div[2]/div[3]/div[{0}]/a/@name'
    post_link_xpath = '//*[@id="mcont"]/div/div[2]/div[4]/div[{0}]/a/@name'
    page_name = 'page{0}.html'
    post_link_prefix = 'http://vk.com/waralbum?w=wall-'
    album_path = 'album'
    photo_name = 'photo{0}.jpg'
    allowed_domains = ['vk.com']
    start_urls = ['https://m.vk.com/waralbum']
    rules = [
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//a[@class="show_more"]')),
            callback='parse_public',
            follow=True,
        )
    ]
    counter_pages = 1
    counter_posts = 0

    def parse_start_url(self, response):
        hxs = Selector(response)
        # self.save_page(response.body)
        return self.parse_posts(5, hxs, self.description_xpath0,
                                self.image_xpath0, self.post_link_xpath0)

    def parse_public(self, response):
        hxs = Selector(response)
        # self.save_page(response.body)
        self.counter_pages += 1
        return self.parse_posts(10, hxs, self.description_xpath,
                                self.image_xpath, self.post_link_xpath)

    def parse_posts(self, amount, selector, description_xpath, image_xpath,
                    post_link_xpath):
        posts = []
        for i in range(1, amount + 1):
            descr = selector.xpath(description_xpath.format(i)).extract()
            image_tmp_url = selector.xpath(image_xpath.format(i)).extract()
            description = ''
            if len(descr) > 0:
                description = descr[0]
            image_urls = []
            for img in image_tmp_url:
                image_urls.append(img.split('|')[0])
            if len(description) == 0 or len(image_urls) == 0:
                break
            post_link = self.post_link_prefix + selector.xpath(
                post_link_xpath.format(i)).extract()[0].split('-')[1]
            if self.checker.check(post_link):
                raise CloseSpider('Shutdown. New posts: {0}'.format(
                    self.counter_posts))
            local_images = []
            for url in image_urls:
                photo_file = self.photo_name.format(uuid.uuid4())
                urllib.urlretrieve(url, self.album_path + '/' + photo_file)
                local_images.append(photo_file)
            post = WaralbumPost()
            post['img_links'] = image_urls
            post['description'] = description
            post['post_link'] = post_link
            post['local_images'] = local_images
            posts.append(post)
            self.counter_posts += 1
            print description
            print image_urls
            print post_link
        return posts

    def save_page(self, content):
        with open(self.page_name.format(self.counter_pages), 'wb') as f:
            f.write(content)
Example #14
class LivingSocialSpider(CrawlSpider):
    name = "boxOffice"
    allowed_domains = ["boxofficeindia.com"]
    login_page = "http://www.boxofficeindia.com/Years/years_detail/2012"
    start_urls = [
        "http://www.boxofficeindia.com/Years/years_detail/2012"
        #login_page
    ]

    rules = (

        #Rule(SgmlLinkExtractor(allow=('/Boxoffice',)),follow=True),
        #Rule(SgmlLinkExtractor(allow=('/Years/years_detail/2014',))),
        Rule(SgmlLinkExtractor(allow=('movie_detail')),
             callback='myparse',
             follow=True), )

    mov_fields = {
        'title': './/div[@class="title4"]/a/text()',
        'rel_date': '//div[@id="detailed"]//span/ul/li[1]//td[2]/b/text()',
        'genre': '//div[@id="detailed"]//span/ul/li[2]//td[2]/a/b/text()',
        'run_time': '//div[@id="detailed"]//span/ul/li[3]//td[2]/b/text()',
        'budget': '//div[@id="detailed"]//span/ul/li[4]//td[2]/b/text()',
        'screens': '//div[@id="detailed"]//span/ul/li[5]//td[2]/b/text()',
        'footfalls': '//div[@id="detailed"]//span/ul/li[6]//td[2]/b/text()',
        'dis_share': '//div[@id="detailed"]//span/ul/li[7]//td[2]/b/text()',
        'total_gross': '//div[@id="detailed"]//span/ul/li[8]//td[2]/b/text()',
        'total_nett_gross':
        '//div[@id="detailed"]//span/ul/li[9]//td[2]/b/text()',
        #'link':'.//div[@class="details"]//td/b/text()'
        'link': './/div[@id="detailed"]//span/ul/li[9]//td[2]/b/text()'
    }

    def start_requests(self):
        return self.init_request()

    def init_request(self):
        print 'init_request'
        return [Request(url=self.login_page, callback=self.login)]

    def login(self, response):
        print 'login'
        return FormRequest.from_response(
            response,
            formnumber=1,
            formdata={
                'loginUname': '*****@*****.**',
                'loginUpass': '******'
            },
            callback=self.check_login_response)

    def check_login_response(self, response):
        print "login response"
        if "Logout" in response.body:
            print "bitchP0Lease"
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
            #return Request("http://www.boxofficeindia.com/Years/years_detail/2014",callback=self.Red)

        else:
            self.file = open('dump2.html', 'wb')
            self.file.write(response.body)

            print "bitchPLease"
            return

# def parse(self,response):
#    sel = HtmlXPathSelector(response)
#   l = sel.select('//div[@class="images"]/a')
#  for i in l:
#     j = i.select('.//a')

#        for iter in j:
#           self.myparse(Request(iter.select(

    def myparse(self, response):
        print "myParse"
        selector = HtmlXPathSelector(response)
        # l = selector.select(self.deals_list_xpath)
        l = selector.select('//div[@id="detailed"]')
        ll = l.select('.//div[@class="title4"]/a/text()').extract()
        open(ll[0].strip() + '.html', 'wb').write(response.body)
        print ll[0].strip()
        for deal in l:

            #loader = XPathItemLoader(LivingSocialDeal(),selector=deal)
            loader = XPathItemLoader(MoviesClass(), selector=deal)
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = TakeFirst()

            for field, xpath in self.mov_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Example #15
class SgxinSpider(BaseSpider):
    name = "sgxin"
    allowed_domains = ["sgxin.com"]
    start_urls = (
        'http://www.sgxin.com/viewcat_job1.html',
        'http://www.sgxin.com/viewcat_job2.html',
        'http://www.sgxin.com/viewcat_job3.html',
        'http://www.sgxin.com/viewcat_job4.html',
        'http://www.sgxin.com/viewcat_job5.html',
        'http://www.sgxin.com/viewcat_job6.html',
        'http://www.sgxin.com/viewcat_job7.html',
        'http://www.sgxin.com/viewcat_job8.html',
        'http://www.sgxin.com/viewcat_job9.html',
        'http://www.sgxin.com/viewcat_job10.html',
    )

    rules = (Rule(
        LinkExtractor(allow='index\.php\?ct=job.*&md=browse&page=[0-1]&'),
        callback='parse_item'), )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        requests = []
        for job_item in response.xpath('//tr'):
            job_crawler_item = JobItem()
            for index, detail_item in enumerate(job_item.xpath('./td')):
                self.populate_job_crawler_item(index, detail_item,
                                               job_crawler_item)
                if index == 4:
                    if self.should_load_details(job_crawler_item):
                        requests.append(
                            Request(url=job_crawler_item.job_details_link,
                                    callback=self.retrieve_job_details,
                                    meta={'item': job_crawler_item},
                                    dont_filter=True))

        return requests

    def populate_job_crawler_item(self, index, detail_item, job_crawler_item):

        if index == 0:
            self.populate_job_title(detail_item, job_crawler_item)
        elif index == 1:
            self.populate_salary(detail_item, job_crawler_item)
        elif index == 2:
            self.populate_employer_name(detail_item, job_crawler_item)
        elif index == 3:
            self.populate_job_location(detail_item, job_crawler_item)
        elif index == 4:
            self.populate_publish_date(detail_item, job_crawler_item)
        else:
            pass

        self.populate_job_country(detail_item, job_crawler_item)

        job_crawler_item.source = self.name
        job_crawler_item.crawled_date = datetime.datetime.now()

    def populate_job_title(self, detail_item, job_crawler_item):

        job_crawler_item.job_title = detail_item.re(r'<a.*>(.*)</a>')[0]
        job_crawler_item.job_details_link = 'http://www.sgxin.com/' + detail_item.re(
            r'<a.*href="(.*)">.*</a>')[0]

    def populate_salary(self, detail_item, job_crawler_item):
        job_crawler_item.salary = detail_item.xpath('./text()').extract()[0]

    def populate_employer_name(self, detail_item, job_crawler_item):
        job_crawler_item.employer_name = detail_item.xpath(
            './text()').extract()[0]

    def populate_job_location(self, detail_item, job_crawler_item):
        job_crawler_item.job_location = detail_item.xpath(
            './text()').extract()[0]

    def populate_job_country(self, detail_item, job_crawler_item):
        job_crawler_item.job_country = 'Singapore'

    def populate_publish_date(self, detail_item, job_crawler_item):
        job_crawler_item.publish_date = detail_item.xpath(
            './text()').extract()[0]
        # Convert to the datetime format
        # job_crawler_item.publish_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y') + '-' + job_crawler_item.publish_date, '%Y-%m-%d') if job_crawler_item.publish_date is not None else None

        job_crawler_item.publish_date = self.derieve_date_from_short_date_string(
            job_crawler_item.publish_date
        ) if job_crawler_item.publish_date is not None else None

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']

        try:
            job_crawler_item.job_desc = \
                response.xpath('//blockquote/p').extract()[0][3:-4].replace('<br>', '\n').replace('<br/>', '\n') #to strip the <p></p>

            job_crawler_item.contact = response.xpath(
                '//*[@id="content"]/div/div[2]/span[9]/text()').extract()[0]
        except:
            pass

        yield job_crawler_item
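
derieve_date_from_short_date_string lives on the shared BaseSpider and is not shown; the commented-out line in populate_publish_date suggests it prepends the current year to a short 'MM-DD' string. A sketch under that assumption, not the actual base-class code:

    # Hypothetical base-class helper; behaviour inferred from the commented-out
    # conversion in populate_publish_date above.
    def derieve_date_from_short_date_string(self, short_date_string):
        current_year = datetime.datetime.now().strftime('%Y')
        try:
            return datetime.datetime.strptime(
                current_year + '-' + short_date_string.strip(), '%Y-%m-%d')
        except ValueError:
            return None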