def db_delete_all_domain_links(domain):
    """
    Remove the results of a previous crawl of a domain from the database.

    Every row of ``page_links`` whose ``page_url`` contains ``domain`` is
    deleted.

    :param domain: A string containing the domain to be un-crawled.
    :return: None
    """
    # Wildcards on both sides: any page URL containing the domain matches.
    url = f"%{domain}%"
    sql = "DELETE FROM page_links WHERE page_url LIKE :url;"
    # The context manager closes the connection even if the statement raises;
    # the previous code never closed the connection it opened.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
def db_delete_page(url):
    """
    Delete a page from the ``pages`` table of the database.

    The URL is passed through ``remove_scheme`` before it is matched. The
    match uses ``LIKE``, so ``%`` and ``_`` characters in the URL act as SQL
    wildcards.

    :param url: A string containing the URL of the web page to delete.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM pages WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
def db_add_clean_text_to_page(url, clean_text):
    """
    Store the clean text of a page in the ``pages`` table of the database.

    The URL is passed through ``remove_scheme`` before it is matched; every
    row whose ``url`` matches (``LIKE``) gets its ``clean_text`` overwritten.

    :param url: A string representing the URL of the web page to update.
    :param clean_text: A string representing the clean main text of the web
        page to store.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET clean_text=:clean_text WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, clean_text=clean_text, url=url)
def db_get_page(url):
    """
    Return the stored row of a web page from the ``pages`` table.

    The URL is passed through ``remove_scheme`` before it is matched with
    ``LIKE``. Only the first matching row is returned.

    :param url: A string containing the URL of the web page.
    :return: The first matching row with every column of ``pages``
        (``SELECT *``, so the column order follows the table schema), or
        None when no row matches.
    """
    url = remove_scheme(url)
    sql = "SELECT * FROM pages WHERE url LIKE :url"
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        page = connection.execute(sql, url=url).fetchone()
    return page
def db_get_text_links(page_url):
    """
    Return all the links stored for the main text of a web page.

    The URL is passed through ``remove_scheme`` before it is matched with
    ``LIKE`` against ``text_links.page_url``.

    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (position, link_text), one per stored link;
        empty when the page has no stored links.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT position, link_text FROM text_links WHERE page_url LIKE :page_url"
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        links = connection.execute(sql, page_url=page_url).fetchall()
    return links
def db_last_time_crawled(domain):
    """
    Return the stored crawl information of a website, if it has been crawled.

    :param domain: A string representing the domain of the website to search.
    :return: The second column of the matching ``websites`` row (presumably
        the timestamp of the last crawl -- confirm against the table schema),
        or None when the domain has no row.
    """
    sql = "SELECT * FROM websites WHERE domain LIKE :domain LIMIT 1"
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        row = connection.execute(sql, domain=domain).fetchone()
    # No row means the website has never been crawled.
    if row is None:
        return None
    return row[1]
def db_get_last_action(user):
    """
    Retrieve the most recent action stored in ``history`` for a user.

    Rows are ordered by ``id`` descending and only the first one is fetched.
    This function does not delete anything from ``history``.

    :param user: A string that represents the user name.
    :return: A row (action, url) for the newest history entry of the user,
        or None when the user has no history.
    """
    sql = text(
        "SELECT action, url FROM history WHERE user LIKE :user ORDER BY id DESC"
    )
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        last_action = connection.execute(sql, user=user).fetchone()
    return last_action
def db_get_text_link(page_url, link_num):
    """
    Return one link stored for the main text of a web page.

    The URL is passed through ``remove_scheme`` before it is matched with
    ``LIKE`` against ``text_links.page_url``.

    :param page_url: A string containing the URL of the web page containing
        the link.
    :param link_num: A number representing the index of the link among the
        links of the text.
    :return: A one-element row (link_url,) for the requested link, or None
        when no such link is stored.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT link_url FROM text_links WHERE page_url LIKE :page_url AND link_num = :link_num"
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        link = connection.execute(
            sql, page_url=page_url, link_num=link_num
        ).fetchone()
    return link
def get_links_in_list(domain):
    """
    Count the in-list links found during the crawl of a domain.

    Rows of ``page_links`` whose ``page_url`` contains ``domain`` and whose
    ``in_list`` flag is 1 are grouped by ``link_url``; the result is ordered
    by how often each link occurs, most frequent first.

    :param domain: A string containing the domain to analyse.
    :return: A list of rows (times, link_text, link_url, page_url, in_nav)
        ordered by times DESC.
    """
    # Wildcards on both sides: any page URL containing the domain matches.
    page_url = f"%{domain}%"
    # NOTE(review): link_text, page_url and in_nav are neither aggregated nor
    # part of GROUP BY, so which row of each group supplies them depends on
    # the database engine.
    sql = """
        SELECT COUNT(*) AS times, link_text, link_url, page_url, in_nav
        FROM page_links
        WHERE page_url LIKE :page_url AND in_list = 1
        GROUP BY link_url
        ORDER BY times DESC
    """
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        rows = connection.execute(sql, page_url=page_url).fetchall()
    return rows
def db_delete_all_page_links(url):
    """
    Delete every link stored in ``page_links`` for a web page.

    The URL is passed through ``remove_scheme`` before it is matched with
    ``LIKE`` against ``page_links.page_url``.

    :param url: A string containing the URL of the web page.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM page_links WHERE page_url LIKE :url;"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
def db_delete_website(domain):
    """
    Delete a website from the ``websites`` table of the database.

    :param domain: A string containing the domain of the website to delete.
    :return: None
    """
    # Leading wildcard only: every stored domain that ends with ``domain``
    # is deleted (e.g. one stored with an extra prefix in front of it).
    domain = f"%{domain}"
    sql = "DELETE FROM websites WHERE domain LIKE :domain;"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, domain=domain)
def db_get_forms(page_url):
    """
    Return the form fields stored in ``forms`` for a web page.

    The URL is passed through ``remove_scheme`` before it is matched with
    ``LIKE`` against ``forms.page_url``.

    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (page_url, form_num, method, action, input_num,
        input_name, input_text); empty when nothing is stored for the page.
    """
    page_url = remove_scheme(page_url)
    sql = """
        SELECT page_url, form_num, method, action, input_num, input_name, input_text
        FROM forms
        WHERE page_url LIKE :page_url
    """
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        rows = connection.execute(sql, page_url=page_url).fetchall()
    return rows
def db_get_bookmarks(user):
    """
    Return the bookmarks stored in ``bookmarks`` for a user.

    :param user: A string that represents the user name.
    :return: A list of rows (url, name, user); empty when the user has no
        bookmarks.
    """
    sql = """
        SELECT url, name, user
        FROM bookmarks
        WHERE user LIKE :user
    """
    # Bind ``user`` by name: the statement uses the named placeholder
    # ``:user``, so the value must be passed as a keyword like every other
    # query in this module rather than positionally.
    with engine.connect() as connection:
        rows = connection.execute(sql, user=user).fetchall()
    return rows
def db_delete_bookmark(url, user):
    """
    Delete a bookmark of a user from the ``bookmarks`` table.

    The URL is passed through ``remove_scheme`` before it is matched. Both
    the URL and the user are matched with ``LIKE``.

    :param url: A string containing the URL of the bookmark to delete.
    :param user: A string that represents the user name.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM bookmarks WHERE url LIKE :url AND user LIKE :user;"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, url=url, user=user)
def db_get_functionality(page_url):
    """
    Return the rows stored in ``functionality`` for a web page.

    :param page_url: A string containing the URL of the web page, matched
        with ``LIKE`` against ``functionality.page_url``.
    :return: A list of rows (page_url, type, name, link_url, score); empty
        when nothing is stored for the page.
    """
    # NOTE(review): page_url is used as given here, without going through
    # remove_scheme first -- confirm this matches how the table is populated.
    sql = """
        SELECT page_url, type, name, link_url, score
        FROM functionality
        WHERE page_url LIKE :page_url
    """
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        rows = connection.execute(sql, page_url=page_url).fetchall()
    return rows
def db_get_domain_links(domain):
    """
    Return every link stored in ``page_links`` for the pages of a domain.

    :param domain: A string containing the domain; any ``page_url`` that
        contains it matches.
    :return: A list of rows (link_text, link_url, y_position, in_list,
        in_nav); empty when nothing is stored for the domain.
    """
    # Wildcards on both sides: any page URL containing the domain matches.
    page_url = f"%{domain}%"
    sql = "SELECT link_text, link_url, y_position, in_list, in_nav FROM page_links WHERE page_url LIKE :page_url"
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        rows = connection.execute(sql, page_url=page_url).fetchall()
    return rows
def db_delete_last_action(user):
    """
    Delete the most recent ``history`` entry of a user.

    :param user: A string that represents the user name.
    :return: None
    """
    # The newest id must be looked up among this user's rows. Taking MAX(id)
    # over the whole table deletes nothing whenever another user performed
    # the most recent action.
    sql = (
        "DELETE FROM history "
        "WHERE id = (SELECT MAX(id) FROM history WHERE user LIKE :user) "
        "AND user LIKE :user"
    )
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, user=user)
def db_get_page_links(url):
    """
    Return every link stored in ``page_links`` for a web page.

    The URL is passed through ``remove_scheme`` before it is matched with
    ``LIKE`` against ``page_links.page_url``.

    :param url: A string containing the URL of the web page.
    :return: A list of rows (link_text, link_url, y_position, in_list,
        in_nav); empty when nothing is stored for the page.
    """
    page_url = remove_scheme(url)
    sql = "SELECT link_text, link_url, y_position, in_list, in_nav FROM page_links WHERE page_url LIKE :page_url"
    # Fetch inside the ``with`` block: the connection is closed on exit.
    with engine.connect() as connection:
        rows = connection.execute(sql, page_url=page_url).fetchall()
    return rows
def db_add_parsed_html_to_page(url, parsed_html):
    """
    Store the parsed HTML of a page in the ``pages`` table of the database.

    The URL is passed through ``remove_scheme`` before it is matched; every
    row whose ``url`` matches (``LIKE``) gets its ``parsed_html`` overwritten.

    :param url: A string representing the URL of the web page to update.
    :param parsed_html: The parsed HTML value to store for the page.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET parsed_html=:parsed_html WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, parsed_html=parsed_html, url=url)
def db_add_topic_to_page(url, topic):
    """
    Store the topic of a page in the ``pages`` table of the database.

    The URL is passed through ``remove_scheme`` before it is matched; every
    row whose ``url`` matches (``LIKE``) gets its ``topic`` overwritten.

    :param url: A string representing the URL of the web page to update.
    :param topic: The topic value to store for the page.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET topic=:topic WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, topic=topic, url=url)
def db_add_language_to_page(url, language):
    """
    Store the language of a page in the ``pages`` table of the database.

    The URL is passed through ``remove_scheme`` before it is matched; every
    row whose ``url`` matches (``LIKE``) gets its ``language`` overwritten.

    :param url: A string representing the URL of the web page to update.
    :param language: The language value to store for the page.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET language=:language WHERE url LIKE :url"
    # Close the connection deterministically instead of leaving it open.
    with engine.connect() as connection:
        connection.execute(sql, language=language, url=url)