class BoxDriver(object):
    """A thin convenience wrapper around a Selenium WebDriver.

    Elements are addressed with compact selector strings of the form
    ``"<strategy><by_char><value>"``, for example ``"i,username"`` or
    ``"x,//*[@id='langs']/button"``. A selector that does not contain the
    separator at all is treated as an element id. Supported strategies
    (short / long form): ``i``/``id``, ``n``/``name``, ``c``/``class_name``,
    ``l``/``link_text``, ``p``/``partial_link_text``, ``t``/``tag_name``,
    ``x``/``xpath``, ``s``/``css_selector``.

    Most actions end with ``forced_wait(self._wait_seconds)``, i.e. an
    unconditional sleep of the configured number of seconds.
    """

    # Private state. These class-level defaults are overwritten per instance
    # in __init__ / _set_selenium_driver.
    _web_driver = None
    _by_char = None
    _wait_seconds = None

    class DriverType(Enum):
        """Browsers that BoxDriver can launch."""

        # NOTE(review): the trailing commas make these values one-element
        # tuples, e.g. (1,), instead of ints. They are kept as written so
        # that ``.value`` and value-based lookups stay backward-compatible.
        CHROME = 1,
        FIREFOX = 2,
        IE = 3,
        SAFARI = 4,
        CHROME_HEADLESS = 5

    def __init__(self, driver_type: DriverType, by_char=_CHARACTER_COMMA,
                 wait_seconds=_WAIT_SECONDS, firefox_profile=None):
        """Create the wrapper and start the requested browser.

        :param driver_type: a ``DriverType`` member; ``None`` or ``""``
            selects Chrome
        :param by_char: separator between strategy and value in selectors
        :param wait_seconds: seconds slept by ``forced_wait`` after actions
        :param firefox_profile: path to a Firefox profile directory; only
            used for ``DriverType.FIREFOX`` and only if the path exists
        """
        self._by_char = by_char
        self._wait_seconds = wait_seconds

        if driver_type is None or driver_type == "":
            driver_type = self.DriverType.CHROME

        self._set_selenium_driver(driver_type, firefox_profile)

    def _set_selenium_driver(self, driver_type, firefox_profile):
        """Instantiate the Selenium driver matching ``driver_type``.

        An unrecognised ``driver_type`` falls back to Chrome and prints a
        warning instead of raising.
        """
        if driver_type == self.DriverType.CHROME:
            self._web_driver = Chrome()

        elif driver_type == self.DriverType.FIREFOX:
            if firefox_profile and os.path.exists(firefox_profile):
                profile = FirefoxProfile(firefox_profile)
                self._web_driver = Firefox(firefox_profile=profile)
            else:
                self._web_driver = Firefox()

        elif driver_type == self.DriverType.IE:
            self._web_driver = Ie()

        elif driver_type == self.DriverType.SAFARI:
            self._web_driver = Safari()

        elif driver_type == self.DriverType.CHROME_HEADLESS:
            options = ChromeOptions()
            options.add_argument('headless')
            options.add_experimental_option(
                "excludeSwitches", ["ignore-certificate-errors"])
            self._web_driver = Chrome(options=options)

        else:
            self._web_driver = Chrome()
            print("Invalid Driver Type filled: %r" % driver_type)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _convert_selector_to_locator(self, selector):
        """Translate a custom selector string into a Selenium locator.

        :param selector: selector string such as ``"i, xxx"``
        :return: ``(By.<strategy>, value)`` tuple
        :raises NameError: if the strategy prefix is not recognised
        """
        if self._by_char not in selector:
            return By.ID, selector

        # Split on the FIRST separator only, so values that themselves
        # contain the separator (e.g. XPath "contains(@class,'a')" when the
        # separator is a comma) are not truncated.
        selector_by, selector_value = (
            part.strip() for part in selector.split(self._by_char, 1)
        )

        # Built per call so that ``By`` is only needed at call time.
        strategies = {
            "i": By.ID, "id": By.ID,
            "n": By.NAME, "name": By.NAME,
            "c": By.CLASS_NAME, "class_name": By.CLASS_NAME,
            "l": By.LINK_TEXT, "link_text": By.LINK_TEXT,
            "p": By.PARTIAL_LINK_TEXT, "partial_link_text": By.PARTIAL_LINK_TEXT,
            "t": By.TAG_NAME, "tag_name": By.TAG_NAME,
            "x": By.XPATH, "xpath": By.XPATH,
            "s": By.CSS_SELECTOR, "css_selector": By.CSS_SELECTOR,
        }
        try:
            return strategies[selector_by], selector_value
        except KeyError:
            raise NameError(
                "Please enter a valid selector of targeting elements.")

    def _locate_element(self, selector):
        """Find the first element matching ``selector``.

        :param selector: e.g. ``"i,xxx"`` or ``"x,//*[@id='langs']/button"``
        :return: the element returned by the driver's ``find_element``
        """
        locator = self._convert_selector_to_locator(selector)
        return self._web_driver.find_element(*locator)

    def _locate_elements(self, selector):
        """Find all elements matching ``selector``.

        :param selector: e.g. ``"i,xxx"`` or ``"x,//*[@id='langs']/button"``
        :return: the result of the driver's ``find_elements``
        """
        locator = self._convert_selector_to_locator(selector)
        return self._web_driver.find_elements(*locator)

    # ------------------------------------------------------------------
    # Cookies
    # ------------------------------------------------------------------

    def clear_cookies(self):
        """Delete all cookies of the current session."""
        self._web_driver.delete_all_cookies()

    def add_cookies(self, cookies):
        """Add one cookie (a dict) or several cookies (an iterable of dicts).

        :param cookies: a single cookie dict, or an iterable of cookie dicts
        """
        if isinstance(cookies, dict):
            self._web_driver.add_cookie(cookie_dict=cookies)
            return
        for cookie in cookies:
            self._web_driver.add_cookie(cookie_dict=cookie)

    def add_cookie(self, cookie_dict):
        """Add a single cookie, replacing an existing one of the same name.

        :param cookie_dict: dict with at least the keys ``name`` and ``value``
        """
        cookie_name = cookie_dict["name"]
        if self._web_driver.get_cookie(cookie_name) is not None:
            self._web_driver.delete_cookie(cookie_name)

        self._web_driver.add_cookie(cookie_dict)

    def remove_cookie(self, name):
        """Delete the cookie called ``name`` if it exists.

        :param name: cookie name
        """
        if self._web_driver.get_cookie(name) is not None:
            self._web_driver.delete_cookie(name)

    # ------------------------------------------------------------------
    # Browser-level operations
    # ------------------------------------------------------------------

    def refresh(self, url=None):
        """Reload the current page, or load ``url`` when one is given.

        :param url: page to load; ``None`` refreshes the current page
        """
        if url is None:
            self._web_driver.refresh()
        else:
            self._web_driver.get(url)
        self.forced_wait(self._wait_seconds)

    def maximize_window(self):
        """Maximize the current browser window."""
        self._web_driver.maximize_window()

    def navigate(self, url):
        """Open ``url`` in the current window.

        :param url: address to load
        """
        self._web_driver.get(url)
        self.forced_wait(self._wait_seconds)

    def quit(self):
        """Quit the driver."""
        self._web_driver.quit()

    def close_browser(self):
        """Close the current browser window."""
        self._web_driver.close()

    # ------------------------------------------------------------------
    # Basic element operations
    # ------------------------------------------------------------------

    def type(self, selector, text):
        """Clear an input element and type ``text`` into it.

        Usage:
        driver.type("i,el","selenium")
        """
        el = self._locate_element(selector)
        el.clear()
        el.send_keys(text)

    def click(self, selector):
        """Click the element located by ``selector``.

        Usage:
        driver.click("i,el")
        """
        el = self._locate_element(selector)
        el.click()
        self.forced_wait(self._wait_seconds)

    def click_by_enter(self, selector):
        """Send the ENTER key to the element located by ``selector``.

        Usage:
        driver.click_by_enter("i,el")
        """
        el = self._locate_element(selector)
        el.send_keys(Keys.ENTER)
        self.forced_wait(self._wait_seconds)

    def click_by_text(self, text):
        """Click the element whose link text partially matches ``text``.

        Usage:
        driver.click_by_text("news")
        """
        self._locate_element('p%s' % self._by_char + text).click()
        self.forced_wait(self._wait_seconds)

    def submit(self, selector):
        """Submit the form that the located element belongs to.

        Usage:
        driver.submit("i,el")
        """
        el = self._locate_element(selector)
        el.submit()
        self.forced_wait(self._wait_seconds)

    def move_to(self, selector):
        """Move the mouse pointer onto the located element.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        ActionChains(self._web_driver).move_to_element(el).perform()
        self.forced_wait(self._wait_seconds)

    def right_click(self, selector):
        """Right-click (context-click) the located element.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        ActionChains(self._web_driver).context_click(el).perform()
        self.forced_wait(self._wait_seconds)

    def count_elements(self, selector):
        """Return how many elements match ``selector``.

        :param selector: selector string
        :return: number of matching elements
        """
        return len(self._locate_elements(selector))

    def drag_element(self, source, target):
        """Drag the ``source`` element and drop it onto ``target``.

        :param source: selector of the element to drag
        :param target: selector of the drop target
        """
        el_source = self._locate_element(source)
        el_target = self._locate_element(target)

        if self._web_driver.w3c:
            ActionChains(self._web_driver).drag_and_drop(
                el_source, el_target).perform()
        else:
            # Non-W3C drivers: perform the drag as three separate actions.
            ActionChains(self._web_driver).click_and_hold(el_source).perform()
            ActionChains(self._web_driver).move_to_element(el_target).perform()
            ActionChains(self._web_driver).release(el_target).perform()

        self.forced_wait(self._wait_seconds)

    def lost_focus(self):
        """Press and release TAB so the currently focused element loses focus."""
        ActionChains(self._web_driver).key_down(Keys.TAB).key_up(
            Keys.TAB).perform()
        self.forced_wait(self._wait_seconds)

    # ------------------------------------------------------------------
    # <select> elements
    # ------------------------------------------------------------------

    def select_by_index(self, selector, index):
        """Select the option at position ``index`` of a <select> element.

        Usage:
        driver.select_by_index("i,el", 0)
        """
        el = self._locate_element(selector)
        Select(el).select_by_index(index)
        self.forced_wait(self._wait_seconds)

    def get_selected_text(self, selector):
        """Return the text of the first selected option of a <select>.

        :param selector: selector string, e.g. ``"i, xxx"``
        :return: text of the selected option
        """
        el = self._locate_element(selector)
        # first_selected_option is a property, not a method; calling it
        # would try to call the returned WebElement and raise TypeError.
        selected_opt = Select(el).first_selected_option
        return selected_opt.text

    def select_by_visible_text(self, selector, text):
        """Select the option of a <select> whose visible text is ``text``.

        Usage:
        driver.select_by_visible_text("i,el", "Option A")
        """
        el = self._locate_element(selector)
        Select(el).select_by_visible_text(text)
        self.forced_wait(self._wait_seconds)

    def select_by_value(self, selector, value):
        """Select the option of a <select> whose ``value`` attribute is ``value``.

        Usage:
        driver.select_by_value("i,el", "a")
        """
        el = self._locate_element(selector)
        Select(el).select_by_value(value)
        self.forced_wait(self._wait_seconds)

    # ------------------------------------------------------------------
    # JavaScript
    # ------------------------------------------------------------------

    def execute_js(self, script):
        """Execute a JavaScript snippet in the current page.

        Usage:
        driver.execute_js("window.scrollTo(200,1000);")
        """
        self._web_driver.execute_script(script)
        self.forced_wait(self._wait_seconds)

    # ------------------------------------------------------------------
    # Element attributes
    # ------------------------------------------------------------------

    def get_value(self, selector):
        """Return the ``value`` attribute of the located element.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        return el.get_attribute("value")

    def get_attribute(self, selector, attribute):
        """Return the named attribute of the located element.

        Usage:
        driver.get_attribute("i,el","type")
        """
        el = self._locate_element(selector)
        return el.get_attribute(attribute)

    def get_text(self, selector):
        """Return the text of the located element.

        Usage:
        driver.get_text("i,el")
        """
        el = self._locate_element(selector)
        return el.text

    def get_displayed(self, selector):
        """Return whether the located element is displayed.

        Usage:
        driver.get_displayed("i,el")
        """
        el = self._locate_element(selector)
        return el.is_displayed()

    def get_selected(self, selector):
        """Return whether the located element is selected.

        :param selector: selector string
        :return: True or False
        """
        el = self._locate_element(selector)
        return el.is_selected()

    def get_text_list(self, selector):
        """Return the texts of all elements matching ``selector``.

        :param selector: selector string
        :return: list of element texts
        """
        return [el.text for el in self._locate_elements(selector)]

    # ------------------------------------------------------------------
    # Windows, frames and alerts
    # ------------------------------------------------------------------

    def accept_alert(self):
        """Accept the currently open alert.

        Usage:
        driver.accept_alert()
        """
        self._web_driver.switch_to.alert.accept()
        self.forced_wait(self._wait_seconds)

    def dismiss_alert(self):
        """Dismiss the currently open alert.

        Usage:
        driver.dismiss_alert()
        """
        self._web_driver.switch_to.alert.dismiss()
        self.forced_wait(self._wait_seconds)

    def switch_to_frame(self, selector):
        """Switch into the frame located by ``selector``.

        Usage:
        driver.switch_to_frame("i,el")
        """
        el = self._locate_element(selector)
        self._web_driver.switch_to.frame(el)
        self.forced_wait(self._wait_seconds)

    def switch_to_default(self):
        """Switch back to the top-level document (counterpart of
        ``switch_to_frame``).

        Usage:
        driver.switch_to_default()
        """
        self._web_driver.switch_to.default_content()
        self.forced_wait(self._wait_seconds)

    def switch_to_parent(self):
        """Switch to the parent frame of the current frame."""
        self._web_driver.switch_to.parent_frame()
        self.forced_wait(self._wait_seconds)

    def switch_to_window_by_title(self, title):
        """Switch to the first window whose title equals ``title``.

        Windows are visited in ``window_handles`` order; if none matches,
        the driver is left on the last window visited.
        """
        for handle in self._web_driver.window_handles:
            self._web_driver.switch_to.window(handle)
            if self._web_driver.title == title:
                break

            self._web_driver.switch_to.default_content()

        self.forced_wait(self._wait_seconds)

    def open_new_window(self, selector):
        """Click the located element and switch to a window other than the
        one that was current before the click.

        Usage:
        driver.open_new_window("i,el")
        """
        original_window = self._web_driver.current_window_handle
        el = self._locate_element(selector)
        el.click()
        for handle in self._web_driver.window_handles:
            if handle != original_window:
                self._web_driver.switch_to.window(handle)
                break

    def save_window_snapshot(self, file_name):
        """Save a screenshot of the current window to a file.

        :param file_name: the image file name and path
        """
        self._web_driver.save_screenshot(file_name)
        self.forced_wait(self._wait_seconds)

    def save_window_snapshot_by_png(self):
        """Return a screenshot of the current window as PNG data."""
        return self._web_driver.get_screenshot_as_png()

    def save_element_snapshot_by_png(self, selector):
        """Return a screenshot of the located element as PNG data.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        self.forced_wait(self._wait_seconds)
        return el.screenshot_as_png

    def save_window_snapshot_by_io(self):
        """Return a screenshot of the current window as a base64 string."""
        return self._web_driver.get_screenshot_as_base64()

    def save_element_snapshot_by_io(self, selector):
        """Return a screenshot of the located element as a base64 string.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        return el.screenshot_as_base64

    # ------------------------------------------------------------------
    # Waiting
    # ------------------------------------------------------------------

    @staticmethod
    def forced_wait(seconds):
        """Sleep unconditionally for ``seconds`` seconds.

        :param seconds: time to sleep, in seconds
        """
        time.sleep(seconds)

    def implicitly_wait(self, seconds):
        """Set the driver's implicit wait.

        :param seconds: wait time in seconds

        Usage:
        driver.implicitly_wait(10)
        """
        self._web_driver.implicitly_wait(seconds)

    def explicitly_wait(self, selector, seconds):
        """Wait until the element located by ``selector`` is present.

        :param selector: selector string
        :param seconds: maximum time to wait, in seconds
        """
        locator = self._convert_selector_to_locator(selector)

        WebDriverWait(self._web_driver, seconds).until(
            expected_conditions.presence_of_element_located(locator))

    def get_explicitly_wait_element_text(self, selector, seconds):
        """Wait for an element to be findable and return its text.

        :param selector: selector string
        :param seconds: maximum time to wait, in seconds
        :return: the element's text, or ``None`` if the awaited value is
            not a ``WebElement``
        """
        locator = self._convert_selector_to_locator(selector)

        el = WebDriverWait(self._web_driver, seconds).until(
            lambda d: d.find_element(*locator))
        if el and isinstance(el, WebElement):
            return el.text
        return None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def current_title(self):
        """Title of the current window.

        Usage:
        driver.current_title
        """
        return self._web_driver.title

    @property
    def current_url(self):
        """URL of the current page.

        Usage:
        driver.current_url
        """
        return self._web_driver.current_url
class YouTube_Crawler:
    """Collect YouTube channel and video metadata into a PostgreSQL database.

    Channel and playlist data are fetched from the YouTube Data API v3 with
    ``requests`` and written through ``pg2`` connections. A headless Chrome
    driver can optionally be started with ``make_driver_ready``.

    The data-collection methods return ``True`` on success and ``False`` on
    any failure; exceptions are deliberately not propagated to the caller.
    """

    api_key = None
    kwonjun_api_key = None
    kyungsu_api_key = None
    is_driver = False
    # NOTE(review): the connection settings were blank in the source
    # (presumably redacted), which is not valid Python. They are set to None
    # here and must be filled in before any database method is used.
    IP = None  # IP
    database = None  # database
    user = None  # user
    password = None  # password

    def __init__(self, api_key=None):
        """Store the default API key, if one is given.

        :param api_key: YouTube Data API key used when ``api_set == 0``
        """
        if api_key is not None:
            self.api_key = api_key

    def _select_api_key(self, api_set):
        """Return the API key for ``api_set``: 0 -> default, 1 -> kwonjun,
        anything else -> kyungsu."""
        if api_set == 0:
            return self.api_key
        if api_set == 1:
            return self.kwonjun_api_key
        return self.kyungsu_api_key

    def _connect(self):
        """Open a database connection with autocommit disabled."""
        conn = pg2.connect(
            database=self.database,
            user=self.user,
            password=self.password,
            host=self.IP,
            port="5432",
        )
        conn.autocommit = False
        return conn

    @staticmethod
    def _age_in_days(upload_time):
        """Return the age of ``upload_time`` (a ``struct_time``) in days.

        NOTE(review): the timestamps are parsed from 'Z'-suffixed strings
        but ``time.mktime`` interprets them as local time, so the result
        looks off by the local UTC offset. Kept as in the original; confirm
        whether that precision matters for the interval cut-off.
        """
        elapsed = time.mktime(time.localtime()) - time.mktime(upload_time)
        return elapsed / (60 * 60 * 24)

    def make_driver_ready(
        self, executable_path=r"/home/ubuntu/Crawler/chromedriver"
    ):
        """Start a headless Chrome driver with a US/English YouTube cookie.

        :param executable_path: path of the chromedriver binary
        """
        options = ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--enable-automation")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-features=VizDisplayCompositor")
        self.driver = Chrome(
            executable_path=executable_path,
            options=options,
        )
        self.driver.set_window_size(1920, 1080)
        # The page is loaded before the cookie is replaced and loaded again
        # afterwards.
        self.driver.get("https://www.youtube.com/")
        self.driver.implicitly_wait(5)
        self.driver.delete_cookie("PREF")
        self.driver.add_cookie(
            {
                "domain": ".youtube.com",
                "httpOnly": False,
                "name": "PREF",
                "value": "gl=US&hl=en",
                "path": "/",
            }
        )
        self.driver.get("https://www.youtube.com/")
        self.driver.implicitly_wait(5)
        self.is_driver = True

    def pre_process_sql(self, text):
        """Escape single quotes in ``text`` by doubling them.

        Kept for callers that still build SQL strings; the methods of this
        class use parameterized queries instead.
        """
        return text.replace("'", "''")

    def pre_process_comment(self, text):
        """Strip NUL characters from ``text`` and double its single quotes.

        The original called ``bytearray.replace`` and discarded the result,
        so NUL characters were never actually removed.
        """
        return text.replace("\x00", "").replace("'", "''")

    def update_video_and_comment(self, video_id):
        """Run ``New_YouTube_Crawler_Comment.main`` for ``video_id``.

        :return: ``True`` if the call returned a truthy value, else ``False``
        """
        return bool(New_YouTube_Crawler_Comment.main(video_id))

    def update_channel_info(self, channel_id, api_set=0):
        """Record the current subscriber and view counts of a channel.

        If the API response carries no subscriber count, the channel is
        flagged ``hidden_subscriber = true`` instead.

        :param channel_id: YouTube channel id
        :param api_set: which API key to use (0, 1, or anything else)
        :return: ``True`` on success, ``False`` on any failure
        """
        api_key = self._select_api_key(api_set)
        try:
            time.sleep(0.2)
            url = (
                "https://www.googleapis.com/youtube/v3/channels"
                f"?part=statistics&maxResults=50&id={channel_id}&key={api_key}"
            )
            response = requests.get(url)
            if response.status_code != 200:
                return False

            items = response.json()["items"]
            if len(items) != 1:
                return False
            item = dict(items[0])

            try:
                subscriber_count = item["statistics"]["subscriberCount"]
                hidden = False
            except (KeyError, TypeError):
                hidden = True

            if not hidden:
                view_count = item["statistics"]["viewCount"]
                now = time.time()

            conn = self._connect()
            try:
                cur = conn.cursor()
                if hidden:
                    cur.execute(
                        "UPDATE channel SET hidden_subscriber = true"
                        " WHERE channel_id = %s;",
                        (channel_id,),
                    )
                else:
                    cur.execute(
                        "INSERT INTO channel_subscriber"
                        " (channel_idx, subscriber_num, check_time)"
                        " VALUES ((SELECT idx from channel"
                        " where channel.channel_id=%s), %s, to_timestamp(%s));",
                        (channel_id, subscriber_count, now),
                    )
                    cur.execute(
                        "INSERT INTO channel_views"
                        " (channel_idx, view_count, check_time)"
                        " VALUES ((SELECT idx from channel"
                        " where channel.channel_id=%s), %s, to_timestamp(%s));",
                        (channel_id, view_count, now),
                    )
                conn.commit()
            finally:
                # Closing without a commit discards any uncommitted work.
                conn.close()
            return True
        except Exception:
            # Deliberate best-effort: failure is reported as False.
            return False

    def insert_channel_info(self, channel_id):
        """Fill in a channel's metadata and record its subscriber count.

        :param channel_id: YouTube channel id
        :return: ``True`` on success, ``False`` on any failure
        """
        try:
            url = (
                "https://www.googleapis.com/youtube/v3/channels"
                "?part=id,snippet,contentDetails,statistics,topicDetails"
                f"&maxResults=50&id={channel_id}&key={self.api_key}"
            )
            response = requests.get(url)
            if response.status_code != 200:
                return False

            items = response.json()["items"]
            if len(items) != 1:
                return False
            item = dict(items[0])

            # Extract every value up front so a missing key fails before
            # anything is written.
            snippet = item["snippet"]
            statistics = item["statistics"]
            update_params = (
                snippet["title"],
                snippet["description"],
                snippet["publishedAt"],
                item["contentDetails"]["relatedPlaylists"]["uploads"],
                statistics["hiddenSubscriberCount"],
                snippet["thumbnails"]["default"]["url"],
                channel_id,
            )
            insert_params = (
                channel_id,
                statistics["subscriberCount"],
                time.time(),
            )

            # The original passed bare names (database, user, password)
            # here, which raised NameError, so this method always failed.
            conn = self._connect()
            try:
                cur = conn.cursor()
                cur.execute(
                    "UPDATE channel SET channel_name = %s,"
                    " channel_description = %s,"
                    " channel_start_date = to_date(%s, 'YYYY-MM-DD'),"
                    " upload_id = %s,"
                    " hidden_subscriber = %s,"
                    " thumbnail_url = %s"
                    " WHERE channel_id = %s;",
                    update_params,
                )
                cur.execute(
                    "INSERT INTO channel_subscriber"
                    " (channel_idx, subscriber_num, check_time)"
                    " VALUES ((SELECT idx from channel"
                    " where channel.channel_id=%s), %s, to_timestamp(%s));",
                    insert_params,
                )
                conn.commit()
            finally:
                conn.close()
            return True
        except Exception:
            # Deliberate best-effort: failure is reported as False.
            return False

    def update_video_info(self, upload_id, interval_day=30, api_set=0):
        """Store the videos of an uploads playlist that are recent enough.

        Playlist pages are read until an item older than ``interval_day``
        days is met, ``insert_video`` reports failure, or there are no more
        pages. Items without a usable ``videoPublishedAt`` are inserted
        into ``video`` with ``status = FALSE``.

        :param upload_id: id of the channel's uploads playlist
        :param interval_day: maximum video age in days
        :param api_set: which API key to use (0, 1, or anything else)
        :return: ``True`` on success, ``False`` on any failure
        """
        api_key = self._select_api_key(api_set)
        time_format = "%Y-%m-%dT%H:%M:%SZ"
        base_url = (
            "https://www.googleapis.com/youtube/v3/playlistItems"
            "?part=id,snippet,contentDetails,status&maxResults=50"
        )
        try:
            conn = self._connect()
            try:
                cur = conn.cursor()
                next_page_token = None
                keep_going = True
                while keep_going:
                    if next_page_token is None:
                        url = (
                            f"{base_url}&playlistId={upload_id}&key={api_key}"
                        )
                    else:
                        url = (
                            f"{base_url}&pageToken={next_page_token}"
                            f"&playlistId={upload_id}&key={api_key}"
                        )
                    response = requests.get(url)
                    if response.status_code != 200:
                        # Nothing has been committed; the finally clause
                        # closes the connection.
                        return False
                    result = response.json()

                    next_page_token = result.get("nextPageToken")
                    if next_page_token is None:
                        keep_going = False

                    for raw_item in result["items"]:
                        item = dict(raw_item)
                        try:
                            upload_time = time.strptime(
                                item["contentDetails"]["videoPublishedAt"],
                                time_format,
                            )
                        except (KeyError, TypeError, ValueError):
                            # No usable publish time: fall back to the
                            # playlist timestamp and store the video with
                            # status FALSE.
                            upload_time = time.strptime(
                                item["snippet"]["publishedAt"], time_format
                            )
                            cur.execute(
                                "INSERT INTO video"
                                " (channel_idx, video_id, upload_time, status)"
                                " VALUES ((SELECT idx from channel"
                                " where upload_id = %s), %s,"
                                " to_timestamp(%s, 'YYYY-MM-DDTHH24:MI:SSZ'),"
                                " FALSE) ON CONFLICT DO NOTHING;",
                                (
                                    upload_id,
                                    item["contentDetails"]["videoId"],
                                    item["snippet"]["publishedAt"],
                                ),
                            )
                            if self._age_in_days(upload_time) > interval_day:
                                keep_going = False
                                break
                            continue

                        # Stop at the first video older than the interval.
                        if self._age_in_days(upload_time) > interval_day:
                            keep_going = False
                            break

                        cur.execute(
                            "SELECT insert_video(%s, %s, %s, %s, %s, %s)",
                            (
                                item["snippet"]["title"],
                                item["snippet"]["description"],
                                item["contentDetails"]["videoId"],
                                item["contentDetails"]["videoPublishedAt"],
                                upload_id,
                                item["snippet"]["thumbnails"]["high"]["url"],
                            ),
                        )
                        success = cur.fetchone()[0]
                        if not success:
                            keep_going = False
                            break

                conn.commit()
            finally:
                conn.close()
            return True
        except Exception:
            # Deliberate best-effort: failure is reported as False.
            return False

    def __del__(self):
        """Shut down the Chrome driver, if one was started."""
        if self.is_driver:
            try:
                # quit() ends the whole driver session; close() would only
                # close the current window.
                self.driver.quit()
            except Exception:
                # Never raise from a finalizer.
                pass