def __init__(self, query):
    """
    Set up the instance for a single search query.

    :param query: string, the search query to use.
    """
    # Keep the raw query text exactly as the caller supplied it.
    self.query = query
    # Connection handle obtained from the shared helper.
    self.db = get_database_connection()
    # No timing recorded yet; presumably filled in once the query
    # has been run -- confirm against the rest of the class.
    self.elapsed_time = None
def get_allowed_domains():
    """
    Return the domains that are allowed to be indexed.

    The domains are read as the distinct values of the ``domain`` column
    in the ``urls`` table. Each domain is a FQDN without the scheme or
    slashes. For example: 'hsleiden.nl'

    :return: list
    """
    connection = get_database_connection()
    cur = connection.cursor()
    cur.execute("select distinct domain from urls")
    # Commit straight after the SELECT, before the rows are read,
    # matching the original statement order.
    connection.commit()

    rows = cur.fetchall()
    # Rows are accessed by column name, so the cursor is assumed to
    # yield mapping-like rows -- confirm against get_database_connection.
    return [row['domain'] for row in rows]
from urllib.parse import urlparse, urlunparse from retrouve.database.model import Model, get_database_connection import psycopg2 db = get_database_connection() class Url(Model): """ Represents a URL as it is stored in the database. """ def __init__(self, **kwargs): """ Construct the URL, and parse the URL into parts right away. :param kwargs: """ super().__init__(**kwargs) if hasattr(self, 'url'): self.parse_url() def parse_url(self): """ Parse the URL into its components, using a base URL when possible. The URL components are stored in the internal __parts property. """ self.__parts = urlparse(self.url) if hasattr(self, 'base'): if isinstance(self.base, str): self.base = urlparse(self.base) elif isinstance(self.base, Url):