Exemplo n.º 1
0
class RatingSpider(Spider):
    """Scrape reader ratings for every book found in the books collection.

    For each book document the spider requests the book's
    ``/readers/read`` page on livelib.ru, emits one ``RatingItem`` per
    reader block that carries a non-zero star value, and follows the
    "next page" links of the listing.
    """

    name = "ratings"
    allowed_domains = ["livelib.ru"]

    def __init__(self, **kwargs):
        """Initialise the base spider and create the project's ``Mongo`` object."""
        super(RatingSpider, self).__init__(**kwargs)
        self.mongo = Mongo()

    def start_requests(self):
        """Yield one readers-page request per document in the books collection.

        The book's ``lib_id`` is attached to the request meta under
        ``book_lib_id`` so that ``parse`` can label the ratings it finds.
        """
        for book in self.mongo.books_collection().find():
            book_lib_id = book["lib_id"]
            readers_url = u"https://www.livelib.ru/book/{}/readers/read".format(book_lib_id)
            self.logger.info(u"Queuing page: {}".format(readers_url))
            yield Request(readers_url, callback=self.parse, meta={"book_lib_id": book_lib_id})

    def parse(self, response):
        """Extract ratings from a readers page and queue the following pages.

        Yields ``RatingItem`` objects (keys ``book_lib_id``, ``user_lib_id``
        and ``rating``) followed by a ``Request`` for every "next page" link.
        """
        self.logger.info(u"Parsing page: {0}".format(response.url))

        book_lib_id = response.meta["book_lib_id"]
        for rating_sel in response.xpath("//div[p/span[@class='rating']]"):
            star_class = rating_sel.xpath(".//span[@class='rating']/span/@class").extract_first()
            if star_class is None:
                # extract_first() returns None when the inner span (or its
                # class attribute) is missing; slicing None would raise
                # TypeError and abort parsing of the whole page.
                continue

            # The star value is the second character of the class name.
            stars = star_class[1:2]
            if not stars or stars == "0":
                # "0" means the reader gave no rating; an empty value means
                # the class name was too short to carry a star digit.
                continue

            rating = RatingItem()
            rating["book_lib_id"] = book_lib_id
            rating["user_lib_id"] = rating_sel.xpath(".//a[@class='action']/@title").extract_first()
            rating["rating"] = stars
            yield rating

        for href in response.xpath("//a[contains(@id, 'a-list-page-next')]/@href").extract():
            url = response.urljoin(href)
            self.logger.info(u"Queuing page: {0}".format(url))
            # Carry the book id along so ratings on later pages are labelled too.
            yield Request(url, callback=self.parse, meta={"book_lib_id": book_lib_id})
Exemplo n.º 2
0
class UserSpider(Spider):
    """Scrape profile data for users that appear in ratings but are not stored yet.

    For every rating document the spider checks whether the users
    collection already has a document with the same ``user_lib_id``; if
    not, it requests the reader's profile page on livelib.ru and emits a
    ``UserItem`` with the gender and birth date found there.
    """

    name = "users"
    allowed_domains = ["livelib.ru"]

    # Maps the gender text shown on the profile page to the stored value.
    _GENDERS = {
        u'женский': "female",
        u'мужской': "male",
    }

    def __init__(self, **kwargs):
        """Initialise the base spider and create the project's ``Mongo`` object."""
        super(UserSpider, self).__init__(**kwargs)
        self.mongo = Mongo()

    def start_requests(self):
        """Yield a profile-page request for each not-yet-stored user id.

        The user's id is attached to the request meta under ``user_lib_id``.
        """
        # A user normally has many ratings; remember ids already handled in
        # this run so each one is looked up and queued only once.
        seen = set()
        for rating in self.mongo.ratings_collection().find():
            user_lib_id = rating["user_lib_id"]
            if user_lib_id in seen:
                continue
            seen.add(user_lib_id)

            # find_one() replaces find().count(): Cursor.count() is deprecated
            # in PyMongo 3.7 and removed in 4.0.
            # NOTE(review): assumes users_collection() returns a PyMongo
            # collection - confirm against the Mongo helper.
            if self.mongo.users_collection().find_one({"user_lib_id": user_lib_id}) is not None:
                continue

            user_url = u"https://www.livelib.ru/reader/{0}".format(user_lib_id)
            self.logger.info(u"Queuing page: {}".format(user_url))
            yield Request(user_url, callback=self.parse, meta={"user_lib_id": user_lib_id})

    def parse(self, response):
        """Build a ``UserItem`` from a reader's profile page.

        ``gender`` is set only when the page shows one of the two known
        values; ``birth_date`` is set to the stripped text when present.
        The item is yielded even if neither field could be extracted.
        """
        self.logger.info(u"Parsing page: {0}".format(response.url))

        user = UserItem()
        user["user_lib_id"] = response.meta["user_lib_id"]
        profile_info = response.xpath("//div[@class='profile-info-column']")

        gender = profile_info.xpath(
            u".//span[@class='block-info'][contains(b/text(), 'Пол:')]/text()"
        ).extract_first()
        if gender:
            # Unknown gender strings are ignored rather than stored verbatim.
            gender_value = self._GENDERS.get(gender.strip())
            if gender_value is not None:
                user["gender"] = gender_value

        birth_date = profile_info.xpath(
            u".//span[@class='block-info'][contains(b/text(), 'Дата рождения:')]/text()"
        ).extract_first()
        if birth_date:
            user["birth_date"] = birth_date.strip()

        yield user
Exemplo n.º 3
0
 def __init__(self, **kwargs):
     """Initialise the spider and attach a ``Mongo`` instance as ``self.mongo``.

     All keyword arguments are forwarded unchanged to the parent
     class initialiser.
     """
     super(RatingSpider, self).__init__(**kwargs)
     self.mongo = Mongo()