def extract_forms(url):
    """
    This method searches the stored HTML of a web page for forms with text/search inputs.

    Every ``<input>`` of type ``search`` or ``text`` found inside a ``<form>`` is saved through
    ``db_insert_form`` together with the index, method and action of its form.
    :param url: A string containing the URL of the web page to analyse.
    :return: None. The inputs are written to the database, nothing is returned.
    """
    url = remove_scheme(url)
    page = db_get_page(url=url)
    if page is None:
        # db_get_page found no row for this URL, so there is no HTML to inspect.
        return
    # NOTE(review): column 4 is presumably the simplified HTML of the page - verify against the pages schema.
    webpage = BeautifulSoup(page[4], "lxml")
    for form_num, form in enumerate(webpage.find_all(name="form")):
        method = form.get("method")
        action = form.get("action")
        # Only free-text inputs are of interest; "field" avoids shadowing the builtin input().
        text_fields = [
            field
            for field in form.find_all(name="input")
            if field.get("type") in ("search", "text")
        ]
        for input_num, field in enumerate(text_fields):
            db_insert_form(
                page_url=url,
                form_num=form_num,
                method=method,
                action=action,
                input_num=input_num,
                input_name=field.get("name"),
                input_text=field.get("placeholder"),
            )
def db_insert_page_link(page_url, link_url, link_text, x_position, y_position, in_list, in_nav):
    """
    This method inserts a link of a web page into the database through the PageLink model.
    :param page_url: A string containing the URL of the web page containing the link.
    :param link_url: A string containing the URL the link points to.
    :param link_text: The text of the link.
    :param x_position: Value stored in the x_position field of the link.
    :param y_position: Value stored in the y_position field of the link.
    :param in_list: Value stored in the in_list field of the link.
    :param in_nav: Value stored in the in_nav field of the link.
    :return: None
    :raises Exception: Any error raised while adding or committing is re-raised after a rollback.
    """
    page_link = PageLink(
        page_url=remove_scheme(page_url),
        link_url=remove_scheme(link_url),
        link_text=link_text,
        x_position=x_position,
        y_position=y_position,
        in_list=in_list,
        in_nav=in_nav,
    )
    session = db_session()
    try:
        session.add(page_link)
        session.commit()
    except Exception:
        # Undo the failed transaction, but still report the error to the caller.
        session.rollback()
        raise
    finally:
        # Close the session on success and on failure.
        session.close()
def db_delete_page(url):
    """
    This method deletes a page from the pages table of the database.
    :param url: A string containing the URL of the web page to delete.
    :return: None
    """
    url = remove_scheme(url)
    # NOTE(review): LIKE treats "%" and "_" inside the URL as wildcards, so more than one row may be deleted.
    sql = "DELETE FROM pages WHERE url LIKE :url"
    # The context manager closes the connection when the block exits, even on error.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
def db_add_clean_text_to_page(url, clean_text):
    """
    This method updates the clean_text column of a page in the pages table of the database.
    :param url: A string representing the URL of the web page to update.
    :param clean_text: A string representing the clear main text of the web page to insert.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET clean_text=:clean_text WHERE url LIKE :url"
    # The context manager closes the connection when the block exits, even on error.
    with engine.connect() as connection:
        connection.execute(sql, clean_text=clean_text, url=url)
def db_get_text_links(page_url):
    """
    This method returns all the links contained in the main text of a web page.
    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (position, link_text), one for each link stored for the web page.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT position, link_text FROM text_links WHERE page_url LIKE :page_url"
    # Fetch everything before the context manager closes the connection.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
def db_get_page(url):
    """
    This method returns the row of the pages table holding the info about a web page.
    :param url: A string containing the URL of the web page.
    :return: A row (url, topic, summary, language, simple_html, parsed_html, clear_text, last_visit)
             or None if no page matches. The query is SELECT *, so the column order is the one
             of the table definition.
    """
    url = remove_scheme(url)
    sql = "SELECT * FROM pages WHERE url LIKE :url"
    # fetchone() may leave rows unread; closing the connection through the context manager
    # releases the cursor as well.
    with engine.connect() as connection:
        return connection.execute(sql, url=url).fetchone()
def read_links_article(url):
    """
    This method returns the links of a web page that look like links to articles.

    A link is kept when its text has 4 words or more and it is not contained in a list;
    duplicates are removed from the remaining links.
    :param url: A string containing the URL of the web page.
    :return: The filtered links, or the unchanged (empty) result if the page has no links.
    """
    url = remove_scheme(url)
    links = db_get_page_links(url=url)
    if not len(links) > 0:
        return links
    # Rows are (link_text, link_url, y_position, in_list, in_nav):
    # index 0 is the text of the link, index 3 tells if the link is inside a list.
    long_text_links = [row for row in links if len(extract_words(row[0])) > 3]
    outside_lists = [row for row in long_text_links if row[3] == 0]
    return remove_duplicate_links(outside_lists)
def process_item(self, item, spider):
    """
    This method saves a scraped link item as a PageLink and hands the item back.
    :param item: A mapping with the keys page_url, link_url, link_text, in_list and in_nav.
    :param spider: Not used by this method.
    :return: The item received, unchanged.
    :raises Exception: Any error raised while building or saving the link is re-raised after a rollback.
    """
    session = self.Session()
    try:
        link = PageLink()
        # The two URLs go through remove_scheme, the other fields are copied as they are.
        for field in ("page_url", "link_url"):
            setattr(link, field, remove_scheme(item[field]))
        for field in ("link_text", "in_list", "in_nav"):
            setattr(link, field, item[field])
        session.add(link)
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
    return item
def db_get_text_link(page_url, link_num):
    """
    This method returns a link contained in the main text of a web page.
    :param page_url: A string containing the URL of the web page containing the link.
    :param link_num: A number representing the index of the link to get between all the other links of the text.
    :return: A row (link_url) containing the URL of the link requested or None.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT link_url FROM text_links WHERE page_url LIKE :page_url AND link_num = :link_num"
    # fetchone() may leave rows unread; closing the connection through the context manager
    # releases the cursor as well.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url, link_num=link_num).fetchone()
def db_insert_bookmark(url, name, user):
    """
    This method inserts a bookmark into the database through the Bookmark model.
    :param url: A string containing the URL of the web page to bookmark.
    :param name: The name given to the bookmark.
    :param user: The user owning the bookmark.
    :return: None
    :raises BookmarkUrlTaken: If the message of the database error mentions "url".
    :raises BookmarkNameTaken: If the message of the database error mentions "name".
    :raises Exception: Any other error is re-raised unchanged instead of being silently dropped.
    """
    url = remove_scheme(url)
    bookmark = Bookmark(url=url, name=name, user=user)
    session = db_session()
    try:
        session.add(bookmark)
        session.commit()
    except Exception as e:
        # Leave the session in a clean state before reporting the failure.
        session.rollback()
        # Only the first argument is inspected: the full text of the error may also
        # include the SQL statement, which would mention both column names.
        message = str(e.args[0]) if e.args else ""
        if "url" in message:
            raise BookmarkUrlTaken from e
        if "name" in message:
            raise BookmarkNameTaken from e
        # Not a known conflict: do not pretend the bookmark was saved.
        raise
    finally:
        session.close()
def db_insert_action(action, url):
    """
    This method inserts an action performed by the user into the history table of the database.
    :param action: A string indicating the action performed by the user.
    :param url: The url of the web page related to the action performed by the user.
    :return: None
    :raises Exception: Any error raised while adding or committing is re-raised after a rollback.
    """
    url = remove_scheme(url)
    # The timestamp is the local time of the insertion, with a precision of one second.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    history = History(user="******", action=action, url=url, timestamp=timestamp)
    session = db_session()
    try:
        session.add(history)
        session.commit()
    except Exception:
        # Undo the failed transaction, but still report the error to the caller.
        session.rollback()
        raise
    finally:
        # Close the session on success and on failure.
        session.close()
def db_insert_functionality_link(page_url, name, link_url, score):
    """
    This method inserts a functionality of type "link" into the database through the Functionality model.

    The insertion is best-effort: if it fails (e.g. the link is already present) the
    transaction is rolled back and no error is reported.
    :param page_url: A string containing the URL of the web page offering the functionality.
    :param name: The name of the functionality.
    :param link_url: The URL of the link, stored as received.
    :param score: The score assigned to the functionality.
    :return: None
    """
    page_url = remove_scheme(page_url)
    session = db_session()
    fields = {
        "page_url": page_url,
        "type": "link",
        "name": name,
        "link_url": link_url,
        "score": score,
    }
    functionality = Functionality(**fields)
    try:
        session.add(functionality)
        session.commit()
    except Exception:
        # Raised if the link is already present: discard the insertion.
        session.rollback()
    finally:
        session.close()
def db_insert_page(url, simple_html):
    """
    This method inserts a web page in the pages table of the database.

    The page is stored with the current local time as last visit and with the
    placeholder "In progress." as parsed HTML. An IntegrityError raised by the
    insertion is rolled back and not reported.
    :param url: A string containing the URL of the web page to insert.
    :param simple_html: The value to store in the simple_html field of the page.
    :return: None
    """
    url = remove_scheme(url)
    visit_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_page = Page(
        url=url,
        simple_html=simple_html,
        last_visit=visit_time,
        parsed_html="In progress.",
    )
    session = db_session()
    try:
        session.add(new_page)
        session.commit()
    except IntegrityError:
        session.rollback()
    finally:
        session.close()
def db_insert_text_link(page_url, link_num, link):
    """
    This method inserts a link contained in the main text of a web page into the database
    through the TextLink model (presumably the text_links table - verify against the model).
    :param page_url: A string containing the URL of the web page containing the link.
    :param link_num: A number representing the index of the link to insert between all the other links of the text.
    :param link: A tuple (position, link_text, link_url) containing info about the link.
    :return: None.
    :raises Exception: Any error raised while adding or committing is re-raised after a rollback.
    """
    page_url = remove_scheme(page_url)
    position, link_text, link_url = link[0], link[1], link[2]
    text_link = TextLink(
        page_url=page_url,
        link_num=link_num,
        position=position,
        link_text=link_text,
        link_url=link_url,
    )
    session = db_session()
    try:
        session.add(text_link)
        session.commit()
    except Exception:
        # Undo the failed transaction, but still report the error to the caller.
        session.rollback()
        raise
    finally:
        # Close the session on success and on failure.
        session.close()
def db_insert_form(page_url, form_num, method, action, input_num, input_name, input_text):
    """
    This method inserts an input of a form of a web page into the database through the Form model.

    The insertion is best-effort: if it fails the transaction is rolled back and
    no error is reported.
    :param page_url: A string containing the URL of the web page containing the form.
    :param form_num: The index of the form between the forms of the page.
    :param method: The method attribute of the form.
    :param action: The action attribute of the form.
    :param input_num: The index of the input between the inputs of the form.
    :param input_name: The name of the input.
    :param input_text: The text describing the input.
    :return: None
    """
    record = Form(
        page_url=remove_scheme(page_url),
        form_num=form_num,
        method=method,
        action=action,
        input_num=input_num,
        input_name=input_name,
        input_text=input_text,
    )
    session = db_session()
    try:
        session.add(record)
        session.commit()
    except Exception:
        session.rollback()
    finally:
        session.close()
def db_delete_all_page_links(url):
    """
    This method deletes all the links of a web page from the page_links table of the database.
    :param url: A string containing the URL of the web page whose links are deleted.
    :return: None
    """
    url = remove_scheme(url)
    # NOTE(review): LIKE treats "%" and "_" inside the URL as wildcards, so links of other pages may match.
    sql = "DELETE FROM page_links WHERE page_url LIKE :url;"
    # The context manager closes the connection when the block exits, even on error.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
def db_add_parsed_html_to_page(url, parsed_html):
    """
    This method updates the parsed_html column of a page in the pages table of the database.
    :param url: A string representing the URL of the web page to update.
    :param parsed_html: The parsed HTML of the web page to store.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET parsed_html=:parsed_html WHERE url LIKE :url"
    # The context manager closes the connection when the block exits, even on error.
    with engine.connect() as connection:
        connection.execute(sql, parsed_html=parsed_html, url=url)
def db_get_forms(page_url):
    """
    This method returns all the form inputs stored for a web page.
    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (page_url, form_num, method, action, input_num, input_name, input_text).
    """
    page_url = remove_scheme(page_url)
    sql = """SELECT page_url, form_num, method, action, input_num, input_name, input_text FROM forms WHERE page_url LIKE :page_url"""
    # Fetch everything before the context manager closes the connection.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
def db_delete_bookmark(url, user):
    """
    This method deletes a bookmark of a user from the bookmarks table of the database.
    :param url: A string containing the URL of the bookmark to delete.
    :param user: The user owning the bookmark.
    :return: None
    """
    url = remove_scheme(url)
    # NOTE(review): LIKE treats "%" and "_" inside url and user as wildcards, so other bookmarks may match.
    sql = "DELETE FROM bookmarks WHERE url LIKE :url AND user LIKE :user;"
    # The context manager closes the connection when the block exits, even on error.
    with engine.connect() as connection:
        connection.execute(sql, url=url, user=user)
def db_add_topic_to_page(url, topic):
    """
    This method updates the topic column of a page in the pages table of the database.
    :param url: A string representing the URL of the web page to update.
    :param topic: The topic of the web page to store.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET topic=:topic WHERE url LIKE :url"
    # The context manager closes the connection when the block exits, even on error.
    with engine.connect() as connection:
        connection.execute(sql, topic=topic, url=url)
def read_links(url):
    """
    This method returns the links of a web page without duplicates.
    :param url: A string containing the URL of the web page.
    :return: The result of remove_duplicate_links applied to the links stored for the page.
    """
    stored_links = db_get_page_links(url=remove_scheme(url))
    return remove_duplicate_links(stored_links)
def db_add_language_to_page(url, language):
    """
    This method updates the language column of a page in the pages table of the database.
    :param url: A string representing the URL of the web page to update.
    :param language: The language of the web page to store.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET language=:language WHERE url LIKE :url"
    # The context manager closes the connection when the block exits, even on error.
    with engine.connect() as connection:
        connection.execute(sql, language=language, url=url)
def db_get_page_links(url):
    """
    This method returns all the links stored for a web page in the page_links table of the database.
    :param url: A string containing the URL of the web page.
    :return: A list of rows (link_text, link_url, y_position, in_list, in_nav).
    """
    page_url = remove_scheme(url)
    sql = "SELECT link_text, link_url, y_position, in_list, in_nav FROM page_links WHERE page_url LIKE :page_url"
    # Fetch everything before the context manager closes the connection.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()