예제 #1
0
    def fetch_page(self, page_title):
        """Resolve a page title to the ID and readable title of a non-redirect page.

        The title is sanitized and then looked up case-insensitively, so several rows
        may match. The match is chosen in this order of preference:
          1. a non-redirect page whose stored title equals the sanitized title exactly;
          2. otherwise, the first non-redirect page among the matches;
          3. otherwise (every match is a redirect), the page that the first match
             redirects to.

        Args:
          page_title: The title of the page to fetch.

        Returns:
          (int, str, bool): A tuple containing the page ID, the readable page title, and
          whether or not a redirect was followed.

        Raises:
          ValueError: If the provided page title is invalid, if no page matches it, or if
            every match is a redirect whose target page cannot be found.
        """
        wanted_title = helpers.get_sanitized_page_title(page_title)
        missing_page_message = 'Invalid page title {0} provided. Page title does not exist.'

        # COLLATE NOCASE makes the comparison case-insensitive, which is why more than one
        # row can come back for a single title.
        self.sdow_cursor.execute(
            'SELECT * FROM pages WHERE title = ? COLLATE NOCASE;',
            (wanted_title, ))
        rows = self.sdow_cursor.fetchall()

        if not rows:
            raise ValueError(missing_page_message.format(page_title))

        # Each row is unpacked as (id, title, is_redirect); this assumes the pages table
        # has exactly those three columns in that order, since the query uses SELECT *.
        candidates = [(row_id, row_title)
                      for row_id, row_title, row_is_redirect in rows
                      if not row_is_redirect]

        if candidates:
            # Prefer the non-redirect page whose title matches exactly; fall back to the
            # first non-redirect page when only the capitalization differs.
            chosen_id, chosen_title = next(
                (candidate for candidate in candidates if candidate[1] == wanted_title),
                candidates[0])
            return (chosen_id, helpers.get_readable_page_title(chosen_title), False)

        # Every match is a redirect: follow the first one to its target page.
        self.sdow_cursor.execute(
            'SELECT target_id, title FROM redirects INNER JOIN pages ON pages.id = target_id WHERE source_id = ?;',
            (rows[0][0], ))
        redirect_target = self.sdow_cursor.fetchone()

        # TODO: This will no longer be required once the April 2018 database dump occurs since
        # this scenario is prevented by the prune_pages_file.py Python script during the
        # database creation.
        if not redirect_target:
            raise ValueError(missing_page_message.format(page_title))

        target_id, target_title = redirect_target[0], redirect_target[1]
        return (target_id, helpers.get_readable_page_title(target_title), True)
예제 #2
0
  def fetch_page(self, page_title):
    """Returns the ID and title of the non-redirect page corresponding to the provided title,
    handling titles with incorrect capitalization as well as redirects.

    The title is sanitized and then looked up case-insensitively, so several rows may match.
    An exact-title non-redirect match is preferred, then any non-redirect match, and finally
    the target of the first matching redirect.

    Args:
      page_title: The title of the page to fetch.

    Returns:
      (int, str, bool): A tuple containing the page ID, title, and whether or not a redirect was
      followed.

    Raises:
      ValueError: If the provided page title is invalid, if no page matches it, or if every
        match is a redirect whose target page cannot be found.
    """
    sanitized_page_title = helpers.get_sanitized_page_title(page_title)

    query = 'SELECT * FROM pages WHERE title = ? COLLATE NOCASE;'
    query_bindings = (sanitized_page_title,)
    self.cursor.execute(query, query_bindings)

    # Because the above query is case-insensitive (due to the COLLATE NOCASE), multiple articles
    # can be matched.
    results = self.cursor.fetchall()

    if not results:
      raise ValueError(
          'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    # Each row is unpacked as (id, title, is_redirect); this assumes the pages table has exactly
    # those three columns in that order, since the query uses SELECT *.

    # First, look for a non-redirect page which has exact match with the page title.
    for current_page_id, current_page_title, current_page_is_redirect in results:
      if current_page_title == sanitized_page_title and not current_page_is_redirect:
        return (current_page_id, helpers.get_readable_page_title(current_page_title), False)

    # Next, look for a match with a non-redirect page.
    for current_page_id, current_page_title, current_page_is_redirect in results:
      if not current_page_is_redirect:
        return (current_page_id, helpers.get_readable_page_title(current_page_title), False)

    # If all the results are redirects, use the page to which the first result redirects.
    query = 'SELECT target_id, title FROM redirects INNER JOIN pages ON pages.id = target_id WHERE source_id = ?;'
    query_bindings = (results[0][0],)
    self.cursor.execute(query, query_bindings)

    result = self.cursor.fetchone()

    # fetchone() yields None when the join produces no row (a redirect with no resolvable
    # target). Raise the documented ValueError rather than failing with a TypeError on the
    # subscript below.
    if not result:
      raise ValueError(
          'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    return (result[0], helpers.get_readable_page_title(result[1]), True)