def update_true_status_to_content(self, url):
    """Set ``status = TRUE`` on the ``content`` row(s) whose ``url`` matches.

    The URL is bound as a query parameter rather than formatted into the
    SQL text, so quotes or other SQL metacharacters inside a crawled URL
    can neither break the statement nor inject SQL.

    :param url: URL identifying the row(s) to update.
    :return: None. On any exception the error is logged as a warning and
        the connection is rolled back.
    """
    logger.info("update_true_status_to_content" + ' url:' + str(url))
    sql = "UPDATE content SET status = TRUE WHERE url = %s"
    logger.debug('sql:' + sql)
    try:
        # Execute the statement; the driver escapes the bound value.
        self.conn.cursor().execute(sql, (url,))
        # No commit is issued here (the commit call was already disabled);
        # presumably the transaction is committed elsewhere -- TODO confirm.
        logger.info("update_true_status_to_content success")
    except Exception as e:
        logger.warning("update_true_status_to_content fail: %s" % e)
        # Roll back when the statement fails.
        self.conn.rollback()
def insert_one_to_xpath(self, params):
    """Insert a single row into the ``xpath`` table.

    :param params: values for the columns ``(id, url, xpath, point_url)``,
        in that order; they are bound to the statement's placeholders.
    :return: None. On any exception the error is logged as a warning and
        the connection is rolled back.
    """
    logger.info("insert_one_to_xpath " + str(params))
    statement = "INSERT INTO xpath (id, url, xpath, point_url) VALUES(%s, %s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        cursor = self.conn.cursor()
        # Run the parameterized insert.
        cursor.execute(statement, params)
        # The commit call is disabled in this method; nothing is committed here.
        logger.info("insert_one_to_xpath success")
    except Exception as err:
        logger.warning("insert_one_to_xpath fail: %s" % err)
        # Undo the failed statement.
        self.conn.rollback()
def first_data(self, urls):
    """Seed the ``content`` table with the initial row.

    Inserts one row with ``layer_number = 0``, ``status = true`` and the
    given URL. The URL is bound as a query parameter rather than formatted
    into the SQL text, so quotes or SQL metacharacters in it can neither
    break the statement nor inject SQL.

    :param urls: the value stored in the ``url`` column; despite the
        plural name it is inserted as one value in one row.
    :return: None. On any exception the error is logged as a warning and
        the connection is rolled back.
    """
    logger.info("first_data")
    # The constants (layer 0, status true) stay in the statement text;
    # only the externally supplied URL is a bound parameter.
    sql = "INSERT INTO content (layer_number, url, status) VALUES (0, %s, true)"
    logger.debug('sql:' + sql)
    try:
        # Execute the statement; the driver escapes the bound value.
        self.conn.cursor().execute(sql, (urls,))
        # No commit is issued here (the commit call was already disabled);
        # presumably the transaction is committed elsewhere -- TODO confirm.
        logger.info("first_data success")
    except Exception as e:
        logger.warning("first_data fail: %s" % e)
        # Roll back when the statement fails.
        self.conn.rollback()
def select_url_from_content(self):
    """Return the ``url`` values of all ``content`` rows with ``status = false``.

    :return: list holding the first column of every fetched row. If an
        exception occurs it is logged as a warning, the connection is
        rolled back, and whatever was collected so far (possibly an empty
        list) is returned.
    """
    logger.info("select_url_from_content...")
    query = "select url from content WHERE status = false"
    logger.debug('sql:' + query)
    pending_urls = []
    try:
        cursor = self.conn.cursor()
        cursor.execute(query)
        # Each row is a sequence; only its first column (the url) is kept.
        pending_urls.extend(row[0] for row in cursor.fetchall())
        logger.info("select_url_from_content success")
    except Exception as err:
        logger.warning("select_url_from_content fail: %s" % err)
        # Roll back when the query fails.
        self.conn.rollback()
    return pending_urls
def insert_many_to_relation(self, params):
    """Insert several rows into the ``relation`` table in one call.

    :param params: sequence of value tuples for the columns
        ``(layer_number, id, title, text)``, one tuple per row.
    :return: None. On any exception the error is logged as a warning and
        the connection is rolled back.
    """
    logger.info("insert_many_to_relation " + str(params))
    statement = "INSERT INTO relation (layer_number, id, title, text) VALUES(%s, %s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        cursor = self.conn.cursor()
        # Run the parameterized insert once per tuple in ``params``.
        cursor.executemany(statement, params)
        # The commit call is disabled in this method; nothing is committed here.
        logger.info("insert_many_to_relation success")
    except Exception as err:
        logger.warning("insert_many_to_relation fail: %s" % err)
        # Undo the failed batch.
        self.conn.rollback()
def insert_many_to_content(self, params):
    """Insert several rows into the ``content`` table in one call.

    :param params: sequence of value tuples for the columns
        ``(url, father_url, layer_number)``, one tuple per row.
    :return: None. A ``mysql.connector.IntegrityError`` is logged and
        otherwise ignored (no rollback); any other
        ``mysql.connector.Error`` is logged and the connection is rolled
        back.
    """
    logger.info("insert_many_to_content " + str(params))
    statement = "INSERT INTO content (url, father_url, layer_number) VALUES(%s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        cursor = self.conn.cursor()
        # Run the parameterized insert once per tuple in ``params``.
        cursor.executemany(statement, params)
        # The commit call is disabled in this method; nothing is committed here.
        logger.info("insert_many_to_content success")
    except mysql.connector.IntegrityError as duplicate:
        # De-duplication relies on a uniqueness constraint: a violation is
        # only logged, and no rollback is performed in this branch.
        logger.warning("have same url, lose it: %s" % duplicate)
    except mysql.connector.Error as err:
        logger.warning("insert_many_to_content fail: %s" % err)
        # Undo the failed batch.
        self.conn.rollback()
def all_aa(self, url, layer_number):
    """Click every ``<a href>`` element found on ``url`` and record the results.

    Steps, in order:

    1. Open ``url``, parse the page and store its HTML via
       ``self.mysql.update_html_to_content``.
    2. For every XPath popped from ``self.Xpath_list`` (paired with the
       anchor text popped from the end of the text list), locate the
       element, click it, and if the browser URL changed, remember the new
       URL in ``self.new_url`` and return to the main window (closing any
       extra windows, or navigating back when there is only one).
    3. For each successful click that changed the URL, write one row to
       the ``xpath`` table and one to the ``relation`` table.
    4. Drop entries of ``self.new_url`` rejected by
       ``self._check_useful_url``, insert the rest into ``content`` with
       ``url`` as their parent, mark ``url`` as processed and call
       ``self.get_back_to_init()``.

    :param url: page to crawl; also stored as the parent URL of every
        newly discovered URL.
    :param layer_number: value written to the ``layer_number`` column of
        the ``relation`` and ``content`` rows created here.
    :return: None.
    """
    self._driver_open_url(url)
    page, html = self._parser_by_xml()
    self.mysql.update_html_to_content(url=url, html=html)
    tag = self._tag_a_has_href(page)
    text = [a.text for a in tag]
    main_handle = self.driver.current_window_handle
    # Presumably fills self.Xpath_list with one XPath per element of
    # `tag`, in the same order as `text` -- TODO confirm.
    self._get_xpath(page=page, tag=tag)

    while self.Xpath_list:
        xp = self.Xpath_list.pop()
        te = text.pop()
        logger.debug("len_of_list: " + str(len(self.Xpath_list)))
        logger.debug("xpath:" + str(xp))
        try:
            element = self.driver.find_element_by_xpath(xpath=xp)
        except exceptions.NoSuchElementException as e:
            logger.warning(str(e))
            continue

        if not element.is_enabled():
            logger.info('throw wrong xpath')
            continue

        try:
            element.click()
            landed_url = self.driver.current_url
            if landed_url == self.current_url:
                # The click did not navigate anywhere; nothing to record.
                continue
            self.new_url.append(landed_url)
            all_handles = self.driver.window_handles
            if len(all_handles) > 1:
                # The click opened extra windows: close all but the main one.
                for handle in all_handles:
                    if handle == main_handle:
                        continue
                    try:
                        self.driver.switch_to.window(handle)
                        self.driver.close()
                        logger.debug("close window: " + str(handle))
                    except exceptions.NoSuchWindowException as e:
                        logger.warning(
                            "selenium.common.exceptions.NoSuchWindowException: " + str(e))
                self.driver.switch_to.window(main_handle)
                logger.info("back to main_handle")
            elif len(all_handles) == 1:
                # Navigation happened in the main window: go back.
                self.driver.back()
            else:
                # Caught by the handler below, like any other click failure.
                raise ValueError("window_handle wrong")
        except Exception:
            # Deliberately best-effort: any failure while clicking or
            # restoring the window just skips this XPath. Unlike a bare
            # ``except:``, KeyboardInterrupt/SystemExit still propagate.
            logger.info('throw wrong xpath')
            continue

        uid = int(gen_rand_str(length=7, s_type='digit'))
        # NOTE(review): this is read after returning to the main window, so
        # the `url` and `point_url` columns always receive the same value;
        # confirm whether `point_url` was meant to be the clicked target.
        current_url = self.driver.current_url
        logger.debug(
            '(uid, self.driver.current_url, self.Xpath_list[i], self.driver.current_url):'
            + str(uid) + current_url + str(xp) + current_url)
        self.mysql.insert_one_to_xpath((uid, current_url, str(xp), current_url))
        self.mysql.insert_one_to_relation(
            (layer_number, uid, self.driver.title, te))

    # Filter in one pass and assign through a slice so the same list object
    # is kept. Deleting by index inside ``for i in range(len(...))`` skipped
    # the element after each deletion and raised IndexError afterwards.
    self.new_url[:] = [
        candidate for candidate in self.new_url
        if self._check_useful_url(candidate)
    ]
    params = [(new_url, url, layer_number) for new_url in self.new_url]
    logger.debug(self.new_url)
    self.mysql.insert_many_to_content(params=params)
    self.mysql.update_true_status_to_content(url=url)
    self.get_back_to_init()