def search_for_asin_on_amazon(self, query):
        '''Search for book's asin on amazon using given query'''
        query = urlencode({'keywords': query})
        url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
        try:
            response = open_url(self._connections['amazon'], url)
        except PageDoesNotExist:
            return None

        # check to make sure there are results
        if ('did not match any products' in response and 'Did you mean:' not in response and
                'so we searched in All Departments' not in response):
            return None

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})

        if not results:
            return None

        for result in results:
            if 'Buy now with 1-Click' in str(result):
                asin_search = AMAZON_ASIN_PAT.search(str(result))
                if asin_search:
                    return asin_search.group(1)

        return None
    def get_settings(self, entity_id):
        '''Gets book's setting data.

        Follows each "/places/" link on the book page, scrapes the place's
        description, and returns a dict keyed by consecutive ids starting at
        entity_id. Returns None when the book page was never loaded.
        '''
        # Book page was never fetched/parsed -> nothing to scrape.
        if self._page_source is None:
            return

        settings = self._page_source.xpath('//div[@id="bookDataBox"]/div[@class="infoBoxRowItem"]/a[contains(@href, "/places/")]')
        settings_data = {}
        for setting in settings:
            # Defensive re-check; the xpath above should already filter these.
            if '/places/' not in setting.get('href'):
                continue
            label = setting.text
            resp = open_url(self._connection, setting.get('href'))
            if not resp:
                continue
            setting_page = html.fromstring(resp)
            if setting_page is None:
                continue
            # First text node in the place page's left column is its description.
            desc = setting_page.xpath('//div[@class="mainContentContainer "]/div[@class="mainContent"]/div[@class="mainContentFloat"]/div[@class="leftContainer"]/span/text()')
            if len(desc) > 0 and re.sub(r'\s+', ' ', desc[0]).strip():
                # Python 2 byte-string round-trip: collapse whitespace, then
                # decode/re-encode. NOTE(review): assumes the text fits
                # latin-1 — non-latin descriptions would raise; confirm.
                desc = unicode(re.sub(r'\s+', ' ', desc[0]).strip().decode('utf-8').encode('latin-1'))
            else:
                desc = u'No description found on Goodreads.'
            settings_data[entity_id] = {'label': unicode(label.decode('utf-8').encode('latin-1')),
                                        'description': desc,
                                        'aliases': []}
            entity_id += 1

        return settings_data
    def _read_secondary_author_pages(self, author_info):
        '''Reads secondary authors' page and gets their bios, image urls, and images encoded into base64'''
        # Only the primary author (index 0) is present -> nothing to fetch.
        if len(author_info) < 2:
            return

        for entry in author_info[1:]:
            page = html.fromstring(open_url(self._connection, entry['url']))
            entry['page'] = page
            entry['bio'] = self._get_author_bio(page)
            entry['image_url'] = self._get_author_image(page)
    def _read_secondary_author_pages(self, author_info):
        '''Reads secondary authors' page and gets their bios, image urls, and images encoded into base64'''
        if len(author_info) < 2:
            # No secondary authors beyond the primary one.
            return

        secondary_authors = author_info[1:]
        for author in secondary_authors:
            response = open_url(self._connection, author['url'])
            parsed_page = html.fromstring(response)
            author['page'] = parsed_page
            author['bio'] = self._get_author_bio(parsed_page)
            author['image_url'] = self._get_author_image(parsed_page)
# Example #5 — snippet-site separator ("Пример #5" / vote count "0"); not part of the source
    def get_characters(self, entity_id):
        '''Gets book's character data.

        Follows each "/characters/" link on the book page, scrapes the
        character's description and aliases, and returns a dict keyed by
        consecutive ids starting at entity_id. Returns None when the book
        page was never loaded.
        '''
        if self._page_source is None:
            return

        characters = self._page_source.xpath(
            '//div[@class="clearFloats" and contains(., "Characters")]//div[@class="infoBoxRowItem"]//a'
        )
        character_data = {}
        for char in characters:
            # Skip anchors in the character box that are not character links.
            if '/characters/' not in char.get('href'):
                continue
            resp = open_url(self._connection, char.get('href'))

            if not resp:
                continue

            char_page = html.fromstring(resp)
            if char_page is None:
                continue

            desc = char_page.xpath(
                '//div[@class="workCharacterAboutClear"]/text()')
            if desc and re.sub(r'\s+', ' ', desc[0]).strip():
                # BUG FIX: the old code did desc[0].strip().decode('utf-8'),
                # but str has no .decode() on Python 3 (AttributeError), and
                # str() around the resulting bytes would have produced
                # "b'...'" anyway. lxml already yields text; just normalize
                # the whitespace.
                desc = re.sub(r'\s+', ' ', desc[0]).strip()
            else:
                desc = 'No description found on Goodreads.'
            alias_list = char_page.xpath(
                '//div[@class="grey500BoxContent" and contains(.,"aliases")]/text()'
            )
            # Aliases are comma separated inside the grey box; normalize
            # whitespace and drop empty entries.
            alias_list = [
                re.sub(r'\s+', ' ', x).strip() for aliases in alias_list
                for x in aliases.split(',') if re.sub(r'\s+', ' ', x).strip()
            ]
            character_data[entity_id] = {
                # BUG FIX: same str/bytes round-trip removed here.
                'label': char.text,
                'description': desc,
                'aliases': alias_list
            }
            entity_id += 1

        if prefs['expand_aliases']:
            characters = {}
            for char, char_data in list(character_data.items()):
                characters[char] = [char_data['label']] + char_data['aliases']

            expanded_aliases = auto_expand_aliases(characters)
            for alias, ent_id in list(expanded_aliases.items()):
                character_data[ent_id]['aliases'].append(alias)

        return character_data
    def __init__(self, url, connection, asin):
        '''Loads the Goodreads book page at url and caches the parsed source.

        :param url: Goodreads book page url.
        :param connection: connection object passed to open_url for requests.
        :param asin: the book's ASIN.
        '''
        self._connection = connection
        self._asin = asin

        book_id_search = BOOK_ID_PAT.search(url)
        self._goodreads_book_id = book_id_search.group(1) if book_id_search else None

        # BUG FIX: initialize every attribute before the early return below.
        # Previously a failed page load returned before
        # _author_recommendations/_author_other_books were set, leaving the
        # object partially constructed (AttributeError on later access).
        self._page_source = None
        self._author_recommendations = None
        self._author_other_books = []

        response = open_url(self._connection, url)
        if not response:
            return
        self._page_source = html.fromstring(response)
# Example #7 — snippet-site separator ("Пример #7" / vote count "0"); not part of the source
    def __init__(self, url, connection, asin):
        '''Loads the Goodreads book page at url and caches the parsed source.

        :param url: Goodreads book page url.
        :param connection: connection object passed to open_url for requests.
        :param asin: the book's ASIN.
        '''
        self._connection = connection
        self._asin = asin

        book_id_search = BOOK_ID_PAT.search(url)
        self._goodreads_book_id = book_id_search.group(1) if book_id_search else None

        # BUG FIX: set all attributes up front so the early return on a
        # failed page load cannot leave _author_recommendations and
        # _author_other_books undefined (AttributeError on later access).
        self._page_source = None
        self._author_recommendations = None
        self._author_other_books = []

        response = open_url(self._connection, url)
        if not response:
            return
        self._page_source = html.fromstring(response)
    def _parse_tooltip_info(self, book_data, book_id, image_url):
        '''Parses a book's title, authors, rating, ASIN, and description out
        of a parsed goodreads tooltip fragment.

        Returns a "featuredRecommendation" dict, or None when no ASIN or no
        description could be extracted.
        '''
        title = book_data.xpath('//a[contains(@class, "readable")]')
        title = title[0].text if len(title) > 0 else None
        authors = book_data.xpath('//a[contains(@class, "authorName")]')
        # NOTE(review): only the first author anchor is kept — confirm
        # multi-author tooltips are intentionally truncated.
        authors = [authors[0].text] if len(authors) > 0 else None
        rating_info = book_data.xpath('//div[@class="bookRatingAndPublishing"]/span[@class="minirating"]')
        if len(rating_info) > 0:
            # Commas are stripped so the review count parses as an int; the
            # rating is the token preceding "avg" and the count the
            # second-to-last token of the minirating text.
            rating_string = rating_info[0].text_content().strip().replace(',', '').split()
            rating = float(rating_string[rating_string.index('avg')-1])
            num_of_reviews = int(rating_string[-2])
        else:
            rating = None
            num_of_reviews = None

        try:
            # The Kindle preview button's href carries the ASIN as an
            # "asin" query parameter.
            asin_elements = book_data.xpath('//a[contains(@class, "kindlePreviewButtonIcon")]/@href')
            book_asin = urlparse.parse_qs(urlparse.urlsplit(asin_elements[0]).query)["asin"][0]
        except (KeyError, IndexError):
            book_asin = None

        # We should get the ASIN from the tooltips file, but just in case we'll
        # keep this as a fallback (though this only works in some regions - just USA?)
        if not book_asin:
            asin_data_page = open_url(self._connection, '/buttons/glide/' + book_id)
            book_asin = GOODREADS_ASIN_PAT.search(asin_data_page)
            if not book_asin:
                return None
            book_asin = book_asin.group(1)

        # Prefer the non-truncated description span; fall back to the
        # "freeTextContainer" span.
        desc = book_data.xpath('//div[@class="addBookTipDescription"]//span[not(contains(@id, "freeTextContainer"))]')
        desc_backup = book_data.xpath('//div[@class="addBookTipDescription"]//span[contains(@id, "freeTextContainer")]')
        if len(desc) > 0:
            desc = re.sub(r'\s+', ' ', desc[0].text).strip()
        elif len(desc_backup) > 0:
            desc = re.sub(r'\s+', ' ', desc_backup[0].text).strip()
        else:
            return None

        return {'class': 'featuredRecommendation',
                'asin': book_asin,
                'title': title,
                'authors': authors,
                'imageUrl': image_url,
                'description': desc,
                'hasSample': False,
                'amazonRating': rating,
                'numberOfReviews': num_of_reviews}
# Example #9 — snippet-site separator ("Пример #9" / vote count "0"); not part of the source
    def _parse_tooltip_info(self, book_data, book_id, image_url):
        '''Extracts title, authors, rating, ASIN, and description from a
        parsed goodreads tooltip fragment.

        Returns a "featuredRecommendation" dict, or None if either the ASIN
        or the description cannot be found.
        '''
        title = book_data.xpath('//a[contains(@class, "readable")]')
        title = title[0].text if len(title) > 0 else None
        authors = book_data.xpath('//a[contains(@class, "authorName")]')
        # Only the first author anchor is used — presumably intentional.
        authors = [authors[0].text] if len(authors) > 0 else None
        rating_info = book_data.xpath('//div[@class="bookRatingAndPublishing"]/span[@class="minirating"]')
        if len(rating_info) > 0:
            # Strip thousands separators, then split; the rating precedes the
            # "avg" token and the review count is the second-to-last token.
            rating_string = rating_info[0].text_content().strip().replace(',', '').split()
            rating = float(rating_string[rating_string.index('avg')-1])
            num_of_reviews = int(rating_string[-2])
        else:
            rating = None
            num_of_reviews = None

        try:
            # Pull the ASIN from the Kindle preview button's query string.
            asin_elements = book_data.xpath('//a[contains(@class, "kindlePreviewButtonIcon")]/@href')
            book_asin = urlparse.parse_qs(urlparse.urlsplit(asin_elements[0]).query)["asin"][0]
        except (KeyError, IndexError):
            book_asin = None

        # We should get the ASIN from the tooltips file, but just in case we'll
        # keep this as a fallback (though this only works in some regions - just USA?)
        if not book_asin:
            asin_data_page = open_url(self._connection, '/buttons/glide/' + book_id)
            book_asin = GOODREADS_ASIN_PAT.search(asin_data_page)
            if not book_asin:
                return None
            book_asin = book_asin.group(1)

        # Primary description span first, "freeTextContainer" span as backup.
        desc = book_data.xpath('//div[@class="addBookTipDescription"]//span[not(contains(@id, "freeTextContainer"))]')
        desc_backup = book_data.xpath('//div[@class="addBookTipDescription"]//span[contains(@id, "freeTextContainer")]')
        if len(desc) > 0:
            desc = re.sub(r'\s+', ' ', desc[0].text).strip()
        elif len(desc_backup) > 0:
            desc = re.sub(r'\s+', ' ', desc_backup[0].text).strip()
        else:
            return None

        return {'class': 'featuredRecommendation',
                'asin': book_asin,
                'title': title,
                'authors': authors,
                'imageUrl': image_url,
                'description': desc,
                'hasSample': False,
                'amazonRating': rating,
                'numberOfReviews': num_of_reviews}
# Example #10 — snippet-site separator ("Пример #10" / vote count "0"); not part of the source
    def search_for_asin_on_goodreads(self, url):
        '''Searches for ASIN of book at given url'''
        id_match = BOOK_ID_PAT.search(url)
        if id_match is None:
            return None

        # The "glide" button endpoint embeds the Kindle ASIN for the book.
        try:
            glide_page = open_url(self._connections['goodreads'],
                                  '/buttons/glide/' + id_match.group(1))
        except PageDoesNotExist:
            return None

        asin_match = GOODREADS_ASIN_PAT.search(glide_page)
        return asin_match.group(1) if asin_match else None
# Example #11 — snippet-site separator ("Пример #11" / vote count "0"); not part of the source
    def search_for_goodreads_url(self, keywords):
        '''Searches for book's goodreads url using given keywords'''
        try:
            page = open_url(self._connections['goodreads'],
                            '/search?' + urlencode({'q': keywords}))
        except PageDoesNotExist:
            return None

        # Goodreads renders "No results" when the search came up empty.
        if 'No results' in page:
            return None

        match = GOODREADS_URL_PAT.search(page)
        if match is None:
            return None

        # Build the absolute URL and strip its query parameters.
        full_url = 'https://www.goodreads.com' + match.group(1)
        return urlparse.urlparse(full_url)._replace(query=None).geturl()
    def get_characters(self, entity_id):
        '''Gets book's character data.

        Follows each "/characters/" link on the book page, scrapes the
        character's description and aliases, and returns a dict keyed by
        consecutive ids starting at entity_id. Returns None when the book
        page was never loaded.
        '''
        if self._page_source is None:
            return

        characters = self._page_source.xpath('//div[@class="clearFloats" and contains(., "Characters")]//div[@class="infoBoxRowItem"]//a')
        character_data = {}
        for char in characters:
            # Skip anchors in the character box that are not character links.
            if '/characters/' not in char.get('href'):
                continue
            resp = open_url(self._connection, char.get('href'))

            if not resp:
                continue

            char_page = html.fromstring(resp)
            if char_page is None:
                continue

            desc = char_page.xpath('//div[@class="workCharacterAboutClear"]/text()')
            if desc and re.sub(r'\s+', ' ', desc[0]).strip():
                # Python 2 byte-string round-trip: collapse whitespace, then
                # decode/re-encode. NOTE(review): assumes the text fits
                # latin-1 — non-latin descriptions would raise; confirm.
                desc = unicode(re.sub(r'\s+', ' ', desc[0]).strip().decode('utf-8').encode('latin-1'))
            else:
                desc = u'No description found on Goodreads.'
            # Aliases are comma separated inside the grey box; normalize
            # whitespace and drop empty entries.
            alias_list = char_page.xpath('//div[@class="grey500BoxContent" and contains(.,"aliases")]/text()')
            alias_list = [re.sub(r'\s+', ' ', x).strip() for aliases in alias_list for x in aliases.split(',')
                          if re.sub(r'\s+', ' ', x).strip()]
            character_data[entity_id] = {'label': unicode(char.text.decode('utf-8').encode('latin-1')),
                                         'description': desc,
                                         'aliases': alias_list}
            entity_id += 1

        if prefs['expand_aliases']:
            characters = {}
            for char, char_data in character_data.items():
                characters[char] = [char_data['label']] + char_data['aliases']

            expanded_aliases = auto_expand_aliases(characters)
            for alias, ent_id in expanded_aliases.items():
                character_data[ent_id]['aliases'].append(alias)

        return character_data
# Example #13 — snippet-site separator ("Пример #13" / vote count "0"); not part of the source
    def _get_book_info_from_tooltips(self, book_info):
        '''Gets books ASIN, title, authors, image url, description, and rating information'''
        # Accept a single (book_id, image_url) pair as well as a list of them.
        if isinstance(book_info, tuple):
            book_info = [book_info]

        link_pattern = 'resources[Book.{0}][type]=Book&resources[Book.{0}][id]={0}'
        params = "&".join([link_pattern.format(book_id) for book_id, image_url in book_info])
        # One batched request fetches the tooltip fragments for every book.
        tooltips_page_info = json.loads(open_url(self._connection, '/tooltips?' + params))['tooltips']

        books_data = []
        for book_id, image_url in book_info:
            fragment = tooltips_page_info['Book.{0}'.format(book_id)]
            if not fragment:
                continue
            parsed = self._parse_tooltip_info(html.fromstring(fragment), book_id, image_url)
            if parsed:
                books_data.append(parsed)

        return books_data
    def _get_book_info_from_tooltips(self, book_info):
        '''Gets books ASIN, title, authors, image url, description, and rating information'''
        if isinstance(book_info, tuple):
            # Normalize a lone (book_id, image_url) pair into a list.
            book_info = [book_info]

        link_pattern = 'resources[Book.{0}][type]=Book&resources[Book.{0}][id]={0}'
        query = "&".join([link_pattern.format(bid) for bid, _img in book_info])
        tooltips_page_url = '/tooltips?' + query
        tooltips = json.loads(open_url(self._connection, tooltips_page_url))['tooltips']

        books_data = []
        for bid, img in book_info:
            raw_fragment = tooltips['Book.{0}'.format(bid)]
            if not raw_fragment:
                continue
            parsed_data = self._parse_tooltip_info(html.fromstring(raw_fragment), bid, img)
            if not parsed_data:
                continue
            books_data.append(parsed_data)

        return books_data
    def _get_quotes(self):
        '''Gets book's quote data'''
        if self._page_source is None:
            return

        more_link = self._page_source.xpath('//a[@class="actionLink" and contains(., "More quotes")]')
        if more_link:
            # A dedicated quotes page exists; scrape it instead of the book page.
            resp = open_url(self._connection, more_link[0].get('href'))
            if not resp:
                return
            quotes_page = html.fromstring(resp)
            if quotes_page is None:
                return
            quote_nodes = quotes_page.xpath('//div[@class="quoteText"]')
        else:
            # Fall back to the quotes box embedded in the book page itself.
            quote_nodes = self._page_source.xpath('//div[@class=" clearFloats bigBox" and contains(., "Quotes from")]//div[@class="bigBoxContent containerWithHeaderContent"]//span[@class="readable"]')

        quotes = []
        for node in quote_nodes:
            quotes.append(re.sub(r'\s+', ' ', node.text).strip().decode('ascii', 'ignore'))
        return quotes
# Example #16 — snippet-site separator ("Пример #16" / vote count "0"); not part of the source
    def _get_quotes(self):
        '''Gets book's quote data'''
        if self._page_source is None:
            return

        def _clean(text):
            # Collapse runs of whitespace and drop non-ascii characters.
            return re.sub(r'\s+', ' ', text).strip().decode('ascii', 'ignore')

        more_quotes = self._page_source.xpath('//a[@class="actionLink" and contains(., "More quotes")]')
        if len(more_quotes) > 0:
            # Prefer the dedicated quotes page when one is linked.
            resp = open_url(self._connection, more_quotes[0].get('href'))
            if not resp:
                return
            page = html.fromstring(resp)
            if page is None:
                return
            return [_clean(q.text) for q in page.xpath('//div[@class="quoteText"]')]

        # Otherwise use the quotes box on the book page itself.
        return [_clean(q.text) for q in self._page_source.xpath('//div[@class=" clearFloats bigBox" and contains(., "Quotes from")]//div[@class="bigBoxContent containerWithHeaderContent"]//span[@class="readable"]')]
# Example #17 — snippet-site separator ("Пример #17" / vote count "0"); not part of the source
    def get_settings(self, entity_id):
        '''Gets book's setting data.

        Follows each "/places/" link on the book page, scrapes the place's
        description, and returns a dict keyed by consecutive ids starting at
        entity_id. Returns None when the book page was never loaded.
        '''
        if self._page_source is None:
            return

        settings = self._page_source.xpath(
            '//div[@id="bookDataBox"]/div[@class="infoBoxRowItem"]/a[contains(@href, "/places/")]'
        )
        settings_data = {}
        for setting in settings:
            # Defensive re-check; the xpath above should already filter these.
            if '/places/' not in setting.get('href'):
                continue
            label = setting.text
            resp = open_url(self._connection, setting.get('href'))
            if not resp:
                continue
            setting_page = html.fromstring(resp)
            if setting_page is None:
                continue
            # First text node in the place page's left column is its description.
            desc = setting_page.xpath(
                '//div[@class="mainContentContainer "]/div[@class="mainContent"]/div[@class="mainContentFloat"]/div[@class="leftContainer"]/span/text()'
            )
            if len(desc) > 0 and re.sub(r'\s+', ' ', desc[0]).strip():
                # BUG FIX: the old code called .decode('utf-8') on a str,
                # which raises AttributeError on Python 3 (and str() around
                # the resulting bytes would have yielded "b'...'").  The
                # xpath text is already str; just normalize whitespace.
                desc = re.sub(r'\s+', ' ', desc[0]).strip()
            else:
                desc = 'No description found on Goodreads.'
            settings_data[entity_id] = {
                # BUG FIX: same str/bytes round-trip removed here.
                'label': label,
                'description': desc,
                'aliases': []
            }
            entity_id += 1

        return settings_data
# Example #18 — snippet-site separator ("Пример #18" / vote count "0"); not part of the source
 def _read_primary_author_page(self, author_info):
     '''Reads primary author's page and gets his/her bio, image url, and image encoded into base64'''
     # BUG FIX: guard against an empty author list instead of raising
     # IndexError (mirrors the length check in _read_secondary_author_pages).
     if not author_info:
         return
     author = author_info[0]
     author['page'] = html.fromstring(open_url(self._connection, author['url']))
     author['bio'] = self._get_author_bio(author['page'])
     author['image_url'], author['encoded_image'] = self._get_author_image(author['page'], encode_image=True)
 def _read_primary_author_page(self, author_info):
     '''Reads primary author's page and gets his/her bio, image url, and image encoded into base64'''
     # BUG FIX: an empty author_info list previously raised IndexError here;
     # return early instead, consistent with _read_secondary_author_pages.
     if not author_info:
         return
     author = author_info[0]
     author['page'] = html.fromstring(open_url(self._connection, author['url']))
     author['bio'] = self._get_author_bio(author['page'])
     author['image_url'], author['encoded_image'] = self._get_author_image(author['page'], encode_image=True)