def fetch_page(self, page_title):
    """Resolves a page title to the ID and readable title of a non-redirect page.

    The title is matched case-insensitively, so incorrectly capitalized titles still resolve.
    When every match is a redirect, the redirect of the first match is followed.

    Args:
      page_title: The title of the page to fetch.

    Returns:
      (int, str, bool): A tuple containing the page ID, the readable page title, and whether
        or not a redirect was followed.

    Raises:
      ValueError: If no page matches the title, or if the matched redirect has no target
        page. (The title sanitizer presumably also rejects malformed titles; confirm in
        helpers.)
    """
    sanitized_title = helpers.get_sanitized_page_title(page_title)

    # COLLATE NOCASE makes the comparison case-insensitive, so several rows may come back.
    self.sdow_cursor.execute(
        'SELECT * FROM pages WHERE title = ? COLLATE NOCASE;', (sanitized_title, ))
    rows = self.sdow_cursor.fetchall()

    if not rows:
        raise ValueError(
            'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    # Keep only real (non-redirect) pages, preserving the order returned by the database.
    real_pages = [(page_id, title) for page_id, title, is_redirect in rows if not is_redirect]

    if real_pages:
        # Prefer the page whose title matches exactly; otherwise take the first real page.
        chosen = next(
            (candidate for candidate in real_pages if candidate[1] == sanitized_title),
            real_pages[0])
        return (chosen[0], helpers.get_readable_page_title(chosen[1]), False)

    # Only redirects matched: follow the redirect of the first matched row.
    self.sdow_cursor.execute(
        'SELECT target_id, title FROM redirects INNER JOIN pages ON pages.id = target_id WHERE source_id = ?;',
        (rows[0][0], ))
    target = self.sdow_cursor.fetchone()

    # TODO: This will no longer be required once the April 2018 database dump occurs since this
    # scenario is prevented by the prune_pages_file.py Python script during the database creation.
    if not target:
        raise ValueError(
            'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    return (target[0], helpers.get_readable_page_title(target[1]), True)
def fetch_page(self, page_title):
    """Returns the ID and title of the non-redirect page corresponding to the provided title.

    Handles titles with incorrect capitalization (the lookup is case-insensitive) as well as
    redirects (when every match is a redirect, the first match's redirect is followed).

    Args:
      page_title: The title of the page to fetch.

    Returns:
      (int, str, bool): A tuple containing the page ID, the readable page title, and whether
        or not a redirect was followed.

    Raises:
      ValueError: If no page matches the title, or if every match is a redirect and the
        first match's redirect does not resolve to a row in the pages table. (The title
        sanitizer presumably also rejects malformed titles; confirm in helpers.)
    """
    sanitized_page_title = helpers.get_sanitized_page_title(page_title)

    query = 'SELECT * FROM pages WHERE title = ? COLLATE NOCASE;'
    query_bindings = (sanitized_page_title,)
    self.cursor.execute(query, query_bindings)

    # Because the above query is case-insensitive (due to the COLLATE NOCASE), multiple
    # articles can be matched.
    results = self.cursor.fetchall()

    if not results:
        raise ValueError(
            'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    # First, look for a non-redirect page which has an exact match with the page title.
    for current_page_id, current_page_title, current_page_is_redirect in results:
        if current_page_title == sanitized_page_title and not current_page_is_redirect:
            return (current_page_id, helpers.get_readable_page_title(current_page_title), False)

    # Next, look for a match with any non-redirect page.
    for current_page_id, current_page_title, current_page_is_redirect in results:
        if not current_page_is_redirect:
            return (current_page_id, helpers.get_readable_page_title(current_page_title), False)

    # If all the results are redirects, use the page to which the first result redirects.
    query = 'SELECT target_id, title FROM redirects INNER JOIN pages ON pages.id = target_id WHERE source_id = ?;'
    query_bindings = (results[0][0],)
    self.cursor.execute(query, query_bindings)

    result = self.cursor.fetchone()

    # fetchone() returns None when the INNER JOIN yields no row (no redirect entry, or the
    # redirect target is not in the pages table). Report that as an invalid title instead of
    # failing with a TypeError when indexing None.
    if result is None:
        raise ValueError(
            'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    return (result[0], helpers.get_readable_page_title(result[1]), True)