Пример #1
0
def db_delete_all_domain_links(domain):
    """
    This method removes the results of a previous crawl of a domain from the database.

    Every row of ``page_links`` whose ``page_url`` contains the domain as a substring is deleted.
    :param domain: A string containing the domain to be un-crawled.
    :return: None
    """
    # Surround the domain with wildcards so any page URL containing it matches.
    pattern = f"%{domain}%"
    sql = "DELETE FROM page_links WHERE page_url LIKE :url;"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, url=pattern)
Пример #2
0
def db_delete_page(url):
    """
    This method deletes a page from the pages table of the database.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string containing the URL of the web page to delete.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM pages WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
Пример #3
0
def db_add_clean_text_to_page(url, clean_text):
    """
    This method updates the ``clean_text`` column of a page in the pages table of the database.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string representing the URL of the web page to update.
    :param clean_text: A string representing the clear main text of the web page to insert.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET clean_text=:clean_text WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, clean_text=clean_text, url=url)
Пример #4
0
def db_get_page(url):
    """
    This method returns the first row of the pages table matching a web page URL.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string containing the URL of the web page.
    :return: The matching row, or None when no page matches. The query is ``SELECT *``, so the
        columns follow the table definition -- presumably (url, topic, summary, language,
        simple_html, parsed_html, clear_text, last_visit); confirm against the schema.
    """
    url = remove_scheme(url)
    sql = "SELECT * FROM pages WHERE url LIKE :url"
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(sql, url=url).fetchone()
Пример #5
0
def db_get_text_links(page_url):
    """
    This method returns all the links contained in the main text of a web page.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (position, link_text), one per link stored for the web page.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT position, link_text FROM text_links WHERE page_url LIKE :page_url"
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
Пример #6
0
def db_last_time_crawled(domain):
    """
    This method returns the timestamp of the last crawl on a website, if it has already been crawled.
    :param domain: A string representing the domain of the website to search.
    :return: The second column of the matching ``websites`` row (assumed to be the timestamp of
        the last crawl -- confirm against the schema), or None if the domain is not stored.
    """
    sql = "SELECT * FROM websites WHERE domain LIKE :domain LIMIT 1"
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        row = connection.execute(sql, domain=domain).fetchone()
    # No row means the website has never been crawled.
    if row is None:
        return None
    return row[1]
Пример #7
0
def db_get_last_action(user):
    """
    This method retrieves the most recent action stored in the history table for the user.

    Rows are ordered by ``id`` descending and only the first one is fetched. Nothing is deleted
    by this method.
    :param user: A string that represents the user name.
    :return: A row (action, url) for the user's latest history entry, or None if there is none.
    """
    sql = text(
        "SELECT action, url FROM history WHERE user LIKE :user ORDER BY id DESC"
    )
    # The query has no LIMIT, so closing the connection also discards the unread rows
    # instead of leaving an open cursor behind.
    with engine.connect() as connection:
        return connection.execute(sql, user=user).fetchone()
Пример #8
0
def db_get_text_link(page_url, link_num):
    """
    This method returns a link contained in the main text of a web page.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param page_url: A string containing the URL of the web page containing the link.
    :param link_num: A number representing the index of the link to get between all the other links of the text.
    :return: A row (link_url,) containing the URL of the link requested, or None if no row matches.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT link_url FROM text_links WHERE page_url LIKE :page_url AND link_num = :link_num"
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(
            sql,
            page_url=page_url,
            link_num=link_num,
        ).fetchone()
Пример #9
0
def get_links_in_list(domain):
    """
    This method analyses the crawl of a domain and returns its links flagged ``in_list``,
    grouped by link URL and ordered by number of occurrences DESC.
    :param domain: A string containing the domain to analyse.
    :return: A list of rows (times, link_text, link_url, page_url, in_nav) ordered by times DESC.
    """
    # Surround the domain with wildcards so any page URL containing it matches.
    pattern = f"%{domain}%"
    sql = """
        SELECT COUNT(*) AS times, link_text, link_url, page_url, in_nav
        FROM page_links
        WHERE page_url LIKE :page_url AND in_list = 1
        GROUP BY link_url
        ORDER BY times DESC
    """
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=pattern).fetchall()
Пример #10
0
def db_delete_all_page_links(url):
    """
    This method deletes every row of the page_links table that belongs to a web page.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string containing the URL of the web page whose links are deleted.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM page_links WHERE page_url LIKE :url;"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
Пример #11
0
def db_delete_website(domain):
    """
    This method deletes from the websites table every row whose domain ends with the given string.
    :param domain: A string containing the domain of the website to delete.
    :return: None
    """
    # Leading wildcard only: matches any stored domain that ends with ``domain``.
    # NOTE(review): this also matches unrelated domains sharing the suffix -- confirm intended.
    pattern = f"%{domain}"
    sql = "DELETE FROM websites WHERE domain LIKE :domain;"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, domain=pattern)
Пример #12
0
def db_get_forms(page_url):
    """
    This method returns the rows of the forms table stored for a web page.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (page_url, form_num, method, action, input_num, input_name, input_text).
    """
    page_url = remove_scheme(page_url)
    sql = """SELECT page_url, form_num, method, action, input_num, input_name, input_text
                FROM forms WHERE page_url LIKE :page_url"""
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
Пример #13
0
def db_get_bookmarks(user):
    """
    This method returns the bookmarks stored for a user.
    :param user: A string that represents the user name.
    :return: A list of rows (url, name, user), one per bookmark matching the user.
    """
    sql = """SELECT url, name, user 
                FROM bookmarks WHERE user LIKE :user"""
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        # Bind by keyword: the statement uses the named placeholder :user, so a bare
        # positional argument is not reliably bound to it.
        return connection.execute(sql, user=user).fetchall()
Пример #14
0
def db_delete_bookmark(url, user):
    """
    This method deletes a user's bookmark from the bookmarks table.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string containing the URL of the bookmark to delete.
    :param user: A string that represents the user name.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM bookmarks WHERE url LIKE :url AND user LIKE :user;"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, url=url, user=user)
Пример #15
0
def db_get_functionality(page_url):
    """
    This method returns the rows of the functionality table stored for a web page.
    :param page_url: A string containing the URL of the web page, matched as given with ``LIKE``.
    :return: A list of rows (page_url, type, name, link_url, score).
    """
    # NOTE(review): unlike the other page lookups, the URL is not passed through
    # remove_scheme here -- confirm callers already supply it in stored form.
    sql = """SELECT page_url, type, name, link_url, score
                FROM functionality WHERE page_url LIKE :page_url"""
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
Пример #16
0
def db_get_domain_links(domain):
    """
    This method returns all the links stored for the pages of a domain.

    A row matches when its ``page_url`` contains the domain as a substring.
    :param domain: A string containing the domain to search.
    :return: A list of rows (link_text, link_url, y_position, in_list, in_nav).
    """
    # Surround the domain with wildcards so any page URL containing it matches.
    pattern = f"%{domain}%"
    sql = "SELECT link_text, link_url, y_position, in_list, in_nav FROM page_links WHERE page_url LIKE :page_url"
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=pattern).fetchall()
Пример #17
0
def db_delete_last_action(user):
    """
    This method deletes the most recent history entry of a user.
    :param user: A string that represents the user name.
    :return: None
    """
    # The MAX(id) subquery must be restricted to the user as well: taking the maximum over
    # the whole table would delete nothing whenever the newest row belongs to another user.
    sql = (
        "DELETE FROM history "
        "WHERE id = (SELECT MAX(id) FROM history WHERE user LIKE :user) "
        "AND user LIKE :user"
    )
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, user=user)
Пример #18
0
def db_get_page_links(url):
    """
    This method returns all the links stored for a web page.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string containing the URL of the web page.
    :return: A list of rows (link_text, link_url, y_position, in_list, in_nav).
    """
    page_url = remove_scheme(url)
    sql = "SELECT link_text, link_url, y_position, in_list, in_nav FROM page_links WHERE page_url LIKE :page_url"
    # Fetch inside the block: the connection (and its cursor) is closed on exit.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
Пример #19
0
def db_add_parsed_html_to_page(url, parsed_html):
    """
    This method updates the ``parsed_html`` column of a page in the pages table of the database.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string representing the URL of the web page to update.
    :param parsed_html: The parsed HTML value to store for the web page.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET parsed_html=:parsed_html WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, parsed_html=parsed_html, url=url)
Пример #20
0
def db_add_topic_to_page(url, topic):
    """
    This method updates the ``topic`` column of a page in the pages table of the database.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string representing the URL of the web page to update.
    :param topic: The topic value to store for the web page.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET topic=:topic WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, topic=topic, url=url)
Пример #21
0
def db_add_language_to_page(url, language):
    """
    This method updates the ``language`` column of a page in the pages table of the database.

    The URL is passed through ``remove_scheme`` before being matched with ``LIKE``.
    :param url: A string representing the URL of the web page to update.
    :param language: The language value to store for the web page.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET language=:language WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it to garbage collection.
    with engine.connect() as connection:
        connection.execute(sql, language=language, url=url)