Пример #1
0
class BoxDriver(object):
    """Thin convenience wrapper around a Selenium WebDriver.

    Elements are addressed with short selector strings of the form
    ``"<by><sep><value>"`` (for example ``"i,kw"`` or
    ``"x,//*[@id='langs']/button"``), where ``<sep>`` is the separator
    given to the constructor. A selector that does not contain the
    separator is treated as an element id.

    Many interaction methods pause for ``wait_seconds`` afterwards via
    :meth:`forced_wait`.
    """

    # Private per-instance state; populated by __init__.
    _web_driver = None
    _by_char = None
    _wait_seconds = None

    class DriverType(Enum):
        """Browsers that BoxDriver knows how to start."""

        # Plain integers: the previous trailing commas silently turned
        # most of these values into one-element tuples.
        CHROME = 1
        FIREFOX = 2
        IE = 3
        SAFARI = 4
        CHROME_HEADLESS = 5

    def __init__(self,
                 driver_type: DriverType,
                 by_char=_CHARACTER_COMMA,
                 wait_seconds=_WAIT_SECONDS,
                 firefox_profile=None):
        """
        Create the wrapper and start the requested browser.

        :param driver_type: a ``DriverType`` member; ``None`` or ``""``
            selects Chrome
        :param by_char: separator between the strategy and the value in
            selector strings
        :param wait_seconds: pause applied after most actions, in seconds
        :param firefox_profile: path of a Firefox profile directory; only
            used for ``DriverType.FIREFOX``
        """
        self._by_char = by_char
        self._wait_seconds = wait_seconds

        if driver_type is None or driver_type == "":
            driver_type = self.DriverType.CHROME

        self._set_selenium_driver(driver_type, firefox_profile)

    def _set_selenium_driver(self, driver_type, firefox_profile):
        """
        Instantiate the Selenium driver matching ``driver_type``.

        An unrecognised type falls back to Chrome and prints a notice.

        :param driver_type: a ``DriverType`` member
        :param firefox_profile: optional Firefox profile path; ignored when
            it is empty or does not exist on disk
        """
        kinds = self.DriverType

        if driver_type == kinds.CHROME:
            self._web_driver = Chrome()

        elif driver_type == kinds.FIREFOX:
            if firefox_profile and os.path.exists(firefox_profile):
                profile = FirefoxProfile(firefox_profile)
                self._web_driver = Firefox(firefox_profile=profile)
            else:
                self._web_driver = Firefox()

        elif driver_type == kinds.IE:
            self._web_driver = Ie()

        elif driver_type == kinds.SAFARI:
            self._web_driver = Safari()

        elif driver_type == kinds.CHROME_HEADLESS:
            options = ChromeOptions()
            options.add_argument('headless')
            options.add_experimental_option("excludeSwitches",
                                            ["ignore-certificate-errors"])
            self._web_driver = Chrome(options=options)

        else:
            self._web_driver = Chrome()
            print("Invalid Driver Type filled: %r" % driver_type)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _convert_selector_to_locator(self, selector):
        """
        Translate a selector string into a Selenium ``(by, value)`` locator.

        :param selector: string such as ``"i, xxx"``; without the separator
            the whole string is used as an element id
        :return: tuple ``(By.<strategy>, value)``
        :raises NameError: if the strategy prefix is not recognised
        """
        if self._by_char not in selector:
            return By.ID, selector

        # Split on the FIRST separator only, so that values which contain
        # the separator themselves (e.g. "x,//a[contains(@class,'b')]")
        # are not truncated.
        selector_by, _, selector_value = selector.partition(self._by_char)
        selector_by = selector_by.strip()
        selector_value = selector_value.strip()

        # Built here rather than at class level so that ``By`` is only
        # needed when a selector is actually converted.
        strategies = {
            "i": By.ID, "id": By.ID,
            "n": By.NAME, "name": By.NAME,
            "c": By.CLASS_NAME, "class_name": By.CLASS_NAME,
            "l": By.LINK_TEXT, "link_text": By.LINK_TEXT,
            "p": By.PARTIAL_LINK_TEXT,
            "partial_link_text": By.PARTIAL_LINK_TEXT,
            "t": By.TAG_NAME, "tag_name": By.TAG_NAME,
            "x": By.XPATH, "xpath": By.XPATH,
            "s": By.CSS_SELECTOR, "css_selector": By.CSS_SELECTOR,
        }
        if selector_by not in strategies:
            raise NameError(
                "Please enter a valid selector of targeting elements.")

        return strategies[selector_by], selector_value

    def _locate_element(self, selector):
        """
        Find the first element matching ``selector``.

        :param selector: e.g. ``"i,xxx"`` or ``"x,//*[@id='langs']/button"``
        :return: the element returned by the driver's ``find_element``
        :raises NameError: if the selector strategy is not recognised
        """
        locator = self._convert_selector_to_locator(selector)
        return self._web_driver.find_element(*locator)

    def _locate_elements(self, selector):
        """
        Find every element matching ``selector``.

        :param selector: e.g. ``"i,xxx"`` or ``"x,//*[@id='langs']/button"``
        :return: the result of the driver's ``find_elements``
        :raises NameError: if the selector strategy is not recognised
        """
        locator = self._convert_selector_to_locator(selector)
        return self._web_driver.find_elements(*locator)

    # ------------------------------------------------------------------
    # Cookies
    # ------------------------------------------------------------------

    def clear_cookies(self):
        """Delete all cookies of the current session."""
        self._web_driver.delete_all_cookies()

    def add_cookies(self, cookies):
        """
        Add one cookie dict, or each dict of an iterable of cookie dicts.

        :param cookies: a single cookie dict (passed straight to the
            driver, as before) or an iterable of such dicts
        """
        if isinstance(cookies, dict):
            self._web_driver.add_cookie(cookie_dict=cookies)
            return

        for cookie in cookies:
            self._web_driver.add_cookie(cookie_dict=cookie)

    def add_cookie(self, cookie_dict):
        """
        Add a single cookie, replacing an existing one of the same name.

        :param cookie_dict: dict that must contain the key ``"name"``
            (and normally ``"value"``)
        """
        cookie_name = cookie_dict["name"]
        if self._web_driver.get_cookie(cookie_name) is not None:
            self._web_driver.delete_cookie(cookie_name)

        self._web_driver.add_cookie(cookie_dict)

    def remove_cookie(self, name):
        """
        Delete the cookie called ``name`` if it exists.

        :param name: cookie name
        """
        if self._web_driver.get_cookie(name) is not None:
            self._web_driver.delete_cookie(name)

    # ------------------------------------------------------------------
    # Browser-level operations
    # ------------------------------------------------------------------

    def refresh(self, url=None):
        """
        Reload the current page, or load ``url`` when one is given.

        :param url: optional URL; ``None`` refreshes the current page
        """
        if url is None:
            self._web_driver.refresh()
        else:
            self._web_driver.get(url)

        self.forced_wait(self._wait_seconds)

    def maximize_window(self):
        """Maximize the current browser window."""
        self._web_driver.maximize_window()

    def navigate(self, url):
        """
        Open ``url`` and then wait ``wait_seconds``.

        :param url: address to load
        """
        self._web_driver.get(url)
        self.forced_wait(self._wait_seconds)

    def quit(self):
        """Quit the driver."""
        self._web_driver.quit()

    def close_browser(self):
        """Close the current browser window."""
        self._web_driver.close()

    # ------------------------------------------------------------------
    # Basic element interaction
    # ------------------------------------------------------------------

    def type(self, selector, text):
        """
        Clear an input element and type ``text`` into it.

        Usage:
        driver.type("i,el","selenium")
        """
        el = self._locate_element(selector)
        el.clear()
        el.send_keys(text)

    def click(self, selector):
        """
        Click the element located by ``selector``, then wait.

        Usage:
        driver.click("i,el")
        """
        self._locate_element(selector).click()
        self.forced_wait(self._wait_seconds)

    def click_by_enter(self, selector):
        """
        Send the ENTER key to the element located by ``selector``, then wait.

        Usage:
        driver.click_by_enter("i,el")
        """
        self._locate_element(selector).send_keys(Keys.ENTER)
        self.forced_wait(self._wait_seconds)

    def click_by_text(self, text):
        """
        Click the first link whose text contains ``text``, then wait.

        Usage:
        driver.click_by_text("news")
        """
        # "p" selects the partial-link-text strategy.
        self._locate_element('p%s' % self._by_char + text).click()
        self.forced_wait(self._wait_seconds)

    def submit(self, selector):
        """
        Submit the form the located element belongs to, then wait.

        Usage:
        driver.submit("i,el")
        """
        self._locate_element(selector).submit()
        self.forced_wait(self._wait_seconds)

    def move_to(self, selector):
        """
        Move the mouse pointer onto the located element, then wait.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        ActionChains(self._web_driver).move_to_element(el).perform()
        self.forced_wait(self._wait_seconds)

    def right_click(self, selector):
        """
        Right-click (context-click) the located element, then wait.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        ActionChains(self._web_driver).context_click(el).perform()
        self.forced_wait(self._wait_seconds)

    def count_elements(self, selector):
        """
        Count the elements matching ``selector``.

        :param selector: selector string
        :return: number of matching elements
        """
        return len(self._locate_elements(selector))

    def drag_element(self, source, target):
        """
        Drag the ``source`` element onto the ``target`` element, then wait.

        :param source: selector of the element to drag
        :param target: selector of the drop target
        """
        el_source = self._locate_element(source)
        el_target = self._locate_element(target)
        driver = self._web_driver

        if driver.w3c:
            ActionChains(driver).drag_and_drop(el_source,
                                               el_target).perform()
        else:
            # Drivers without W3C actions: perform the drag step by step.
            ActionChains(driver).click_and_hold(el_source).perform()
            ActionChains(driver).move_to_element(el_target).perform()
            ActionChains(driver).release(el_target).perform()

        self.forced_wait(self._wait_seconds)

    def lost_focus(self):
        """Press and release TAB to move focus off the current element."""
        ActionChains(self._web_driver).key_down(Keys.TAB).key_up(
            Keys.TAB).perform()
        self.forced_wait(self._wait_seconds)

    # ------------------------------------------------------------------
    # <select> elements
    # ------------------------------------------------------------------

    def select_by_index(self, selector, index):
        """
        Select the option at position ``index`` of a <select>, then wait.

        Usage:
        driver.select_by_index("i,el", 1)
        """
        el = self._locate_element(selector)
        Select(el).select_by_index(index)
        self.forced_wait(self._wait_seconds)

    def get_selected_text(self, selector):
        """
        Return the text of the first selected option of a <select>.

        :param selector: selector string, e.g. ``"i, xxx"``
        :return: text of the selected option
        """
        el = self._locate_element(selector)
        # ``first_selected_option`` is a property of Selenium's Select,
        # not a method; calling it would try to call the WebElement.
        return Select(el).first_selected_option.text

    def select_by_visible_text(self, selector, text):
        """
        Select the option of a <select> by its visible text, then wait.

        Usage:
        driver.select_by_visible_text("i,el", "Some option")
        """
        el = self._locate_element(selector)
        Select(el).select_by_visible_text(text)
        self.forced_wait(self._wait_seconds)

    def select_by_value(self, selector, value):
        """
        Select the option of a <select> by its ``value``, then wait.

        Usage:
        driver.select_by_value("i,el", "opt1")
        """
        el = self._locate_element(selector)
        Select(el).select_by_value(value)
        self.forced_wait(self._wait_seconds)

    # ------------------------------------------------------------------
    # JavaScript
    # ------------------------------------------------------------------

    def execute_js(self, script):
        """
        Execute a JavaScript snippet in the page, then wait.

        Usage:
        driver.execute_js("window.scrollTo(200,1000);")

        :return: whatever the driver's ``execute_script`` returned
        """
        result = self._web_driver.execute_script(script)
        self.forced_wait(self._wait_seconds)
        return result

    # ------------------------------------------------------------------
    # Element attributes
    # ------------------------------------------------------------------

    def get_value(self, selector):
        """
        Return the ``value`` attribute of the located element.

        :param selector: selector string
        """
        return self._locate_element(selector).get_attribute("value")

    def get_attribute(self, selector, attribute):
        """
        Return the named attribute of the located element.

        Usage:
        driver.get_attribute("i,el","type")
        """
        return self._locate_element(selector).get_attribute(attribute)

    def get_text(self, selector):
        """
        Return the text of the located element.

        Usage:
        driver.get_text("i,el")
        """
        return self._locate_element(selector).text

    def get_displayed(self, selector):
        """
        Return whether the located element is displayed.

        Usage:
        driver.get_displayed("i,el")
        """
        return self._locate_element(selector).is_displayed()

    def get_selected(self, selector):
        """
        Return whether the located element is selected.

        :param selector: selector string
        :return: True or False
        """
        return self._locate_element(selector).is_selected()

    def get_text_list(self, selector):
        """
        Return the text of every element matching ``selector``.

        :param selector: selector string
        :return: list of element texts
        """
        return [el.text for el in self._locate_elements(selector)]

    # ------------------------------------------------------------------
    # Alerts, frames and windows
    # ------------------------------------------------------------------

    def accept_alert(self):
        """
        Accept the currently open alert, then wait.

        Usage:
        driver.accept_alert()
        """
        self._web_driver.switch_to.alert.accept()
        self.forced_wait(self._wait_seconds)

    def dismiss_alert(self):
        """
        Dismiss the currently open alert, then wait.

        Usage:
        driver.dismiss_alert()
        """
        self._web_driver.switch_to.alert.dismiss()
        self.forced_wait(self._wait_seconds)

    def switch_to_frame(self, selector):
        """
        Switch into the frame located by ``selector``, then wait.

        Usage:
        driver.switch_to_frame("i,el")
        """
        el = self._locate_element(selector)
        self._web_driver.switch_to.frame(el)
        self.forced_wait(self._wait_seconds)

    def switch_to_default(self):
        """
        Switch back to the top-level document, then wait.

        Usage:
        driver.switch_to_default()
        """
        self._web_driver.switch_to.default_content()
        self.forced_wait(self._wait_seconds)

    def switch_to_parent(self):
        """Switch to the parent frame of the current frame, then wait."""
        self._web_driver.switch_to.parent_frame()
        self.forced_wait(self._wait_seconds)

    def switch_to_window_by_title(self, title):
        """
        Visit the open windows in order and stop on the first whose title
        equals ``title``.

        After every non-matching window the driver is switched to that
        window's default content and ``wait_seconds`` elapse. If no window
        matches, the driver is left on the last window visited.

        :param title: exact window title to look for
        """
        driver = self._web_driver
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            if driver.title == title:
                break

            driver.switch_to.default_content()
            self.forced_wait(self._wait_seconds)

    def open_new_window(self, selector):
        """
        Click the located element and switch to the first window whose
        handle differs from the window that was current before the click.

        Usage:
        driver.open_new_window("i,el")
        """
        driver = self._web_driver
        original_window = driver.current_window_handle
        self._locate_element(selector).click()

        for handle in driver.window_handles:
            if handle != original_window:
                driver.switch_to.window(handle)
                break

    def save_window_snapshot(self, file_name):
        """
        Save a screenshot of the current window to ``file_name``, then wait.

        :param file_name: path of the image file to write
        """
        self._web_driver.save_screenshot(file_name)
        self.forced_wait(self._wait_seconds)

    def save_window_snapshot_by_png(self):
        """Return the driver's PNG screenshot of the current window."""
        return self._web_driver.get_screenshot_as_png()

    def save_element_snapshot_by_png(self, selector):
        """
        Wait, then return the PNG screenshot of the located element.

        :param selector: selector string
        """
        el = self._locate_element(selector)
        self.forced_wait(self._wait_seconds)
        return el.screenshot_as_png

    def save_window_snapshot_by_io(self):
        """Return the base64 screenshot of the current window."""
        return self._web_driver.get_screenshot_as_base64()

    def save_element_snapshot_by_io(self, selector):
        """
        Return the base64 screenshot of the located element.

        :param selector: selector string
        """
        return self._locate_element(selector).screenshot_as_base64

    # ------------------------------------------------------------------
    # Waiting
    # ------------------------------------------------------------------

    @staticmethod
    def forced_wait(seconds):
        """
        Sleep unconditionally.

        :param seconds: time to sleep, in seconds
        """
        time.sleep(seconds)

    def implicitly_wait(self, seconds):
        """
        Set the driver's implicit wait.

        :param seconds: timeout in seconds

        Usage:
        driver.implicitly_wait(10)
        """
        self._web_driver.implicitly_wait(seconds)

    def explicitly_wait(self, selector, seconds):
        """
        Wait until the element is present, up to ``seconds``.

        :param selector: selector string
        :param seconds: maximum time to wait, in seconds
        """
        locator = self._convert_selector_to_locator(selector)
        WebDriverWait(self._web_driver, seconds).until(
            expected_conditions.presence_of_element_located(locator))

    def get_explicitly_wait_element_text(self, selector, seconds):
        """
        Wait up to ``seconds`` for the element and return its text.

        :param selector: selector string
        :param seconds: maximum time to wait, in seconds
        :return: the element text, or ``None`` if the wait did not yield a
            ``WebElement``
        """
        locator = self._convert_selector_to_locator(selector)

        el = WebDriverWait(self._web_driver, seconds).until(
            lambda d: d.find_element(*locator))
        if el and isinstance(el, WebElement):
            return el.text

        return None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def current_title(self):
        """
        Title of the current window.

        Usage:
        driver.current_title
        """
        return self._web_driver.title

    @property
    def current_url(self):
        """
        URL of the current page.

        Usage:
        driver.current_url
        """
        return self._web_driver.current_url
Пример #2
0
class YouTube_Crawler:
    """Fetch channel and video metadata from the YouTube Data API v3 and
    store it in a PostgreSQL database.

    SQL statements use ``%s`` placeholders, which assumes ``pg2`` is
    psycopg2 (DB-API "pyformat" parameter style) -- TODO confirm.
    """

    api_key = None
    kwonjun_api_key = None
    kyungsu_api_key = None
    is_driver = False

    # Database connection settings. The original assignments had no
    # right-hand side (only "#IP", "#database", ... comments), which is a
    # syntax error; fill these in before connecting.
    IP = None  # IP
    database = None  # database
    user = None  # user
    password = None  # password

    _API_ROOT = "https://www.googleapis.com/youtube/v3"
    _API_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self, api_key=None):
        """
        :param api_key: API key for the default key slot; when ``None`` the
            class-level ``api_key`` stays in effect
        """
        if api_key is not None:
            self.api_key = api_key

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _select_api_key(self, api_set):
        """Return the API key for slot ``api_set`` (0, 1, anything else)."""
        if api_set == 0:
            return self.api_key
        if api_set == 1:
            return self.kwonjun_api_key
        return self.kyungsu_api_key

    def _connect(self):
        """Open a non-autocommit database connection on port 5432."""
        conn = pg2.connect(
            database=self.database,
            user=self.user,
            password=self.password,
            host=self.IP,
            port="5432",
        )
        conn.autocommit = False
        return conn

    @staticmethod
    def _log_failure(action):
        """Log the exception currently being handled instead of dropping it."""
        import logging

        logging.getLogger(__name__).exception("%s failed", action)

    @staticmethod
    def _age_in_days(utc_struct_time):
        """Return how many days ago the given UTC ``struct_time`` was."""
        import calendar

        # The API timestamps are UTC ("...Z"); time.mktime would treat them
        # as local time and skew the age by the local UTC offset.
        return (time.time() - calendar.timegm(utc_struct_time)) / (60 * 60 * 24)

    def _store_playlist_item(self, cur, item, upload_id, interval_day):
        """Store one playlist item; return False when paging should stop."""
        details = item["contentDetails"]
        snippet = item["snippet"]

        try:
            published_at = details["videoPublishedAt"]
            upload_time = time.strptime(published_at, self._API_TIME_FORMAT)
        except (KeyError, TypeError, ValueError):
            # No usable videoPublishedAt: record the video with
            # status FALSE, using the playlist item's own publish time.
            upload_time = time.strptime(
                snippet["publishedAt"], self._API_TIME_FORMAT
            )
            cur.execute(
                """INSERT INTO video (channel_idx, video_id, upload_time, status)
                   VALUES ((SELECT idx from channel where upload_id = %s),
                           %s, to_timestamp(%s, 'YYYY-MM-DDTHH24:MI:SSZ'), FALSE)
                   ON CONFLICT DO NOTHING;""",
                (upload_id, details["videoId"], snippet["publishedAt"]),
            )
            return self._age_in_days(upload_time) <= interval_day

        # Only videos uploaded within the last ``interval_day`` days.
        if self._age_in_days(upload_time) > interval_day:
            return False

        cur.execute(
            "SELECT insert_video(%s, %s, %s, %s, %s, %s)",
            (
                snippet["title"],
                snippet["description"],
                details["videoId"],
                published_at,
                upload_id,
                snippet["thumbnails"]["high"]["url"],
            ),
        )
        return bool(cur.fetchone()[0])

    def _store_recent_uploads(self, cur, upload_id, interval_day, api_key):
        """Page through the uploads playlist until an item says to stop."""
        next_page_token = None
        while True:
            params = {
                "part": "id,snippet,contentDetails,status",
                "maxResults": 50,
                "playlistId": upload_id,
                "key": api_key,
            }
            if next_page_token is not None:
                params["pageToken"] = next_page_token

            response = requests.get(f"{self._API_ROOT}/playlistItems", params=params)
            if response.status_code != 200:
                # Previously the status was ignored and the failure only
                # surfaced later as a missing "items" key.
                raise RuntimeError(
                    f"playlistItems request failed with HTTP {response.status_code}"
                )
            result = response.json()
            next_page_token = result.get("nextPageToken")

            for raw_item in result["items"]:
                if not self._store_playlist_item(
                    cur, dict(raw_item), upload_id, interval_day
                ):
                    return

            if next_page_token is None:
                return

    # ------------------------------------------------------------------
    # Browser
    # ------------------------------------------------------------------

    def make_driver_ready(self):
        """Start headless Chrome, open youtube.com and set the ``PREF``
        cookie to ``gl=US&hl=en`` before reloading the page."""
        options = ChromeOptions()
        for argument in (
            "--headless",
            "--no-sandbox",
            "--enable-automation",
            "--disable-gpu",
            "--disable-features=VizDisplayCompositor",
        ):
            options.add_argument(argument)

        self.driver = Chrome(
            executable_path=r"/home/ubuntu/Crawler/chromedriver",
            options=options,
        )
        self.driver.set_window_size(1920, 1080)
        self.driver.get("https://www.youtube.com/")
        self.driver.implicitly_wait(5)
        self.driver.delete_cookie("PREF")
        self.driver.add_cookie(
            {
                "domain": ".youtube.com",
                "httpOnly": False,
                "name": "PREF",
                "value": "gl=US&hl=en",
                "path": "/",
            }
        )
        self.driver.get("https://www.youtube.com/")
        self.driver.implicitly_wait(5)
        self.is_driver = True

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------

    def pre_process_sql(self, text):
        """Return ``text`` with every single quote doubled.

        Kept for callers that still build SQL strings; the queries in this
        class pass values as parameters and do not need it.
        """
        return text.replace("'", "''")

    def pre_process_comment(self, text):
        """Return ``text`` without NUL characters and with single quotes
        doubled."""
        # bytearray.replace returns a new object; the original discarded
        # it, so NUL bytes were never actually removed.
        cleaned = text.encode("UTF-8").replace(b"\x00", b"")
        return cleaned.decode("utf-8", "ignore").replace("'", "''")

    # ------------------------------------------------------------------
    # Crawling
    # ------------------------------------------------------------------

    def update_video_and_comment(self, video_id):
        """Run ``New_YouTube_Crawler_Comment.main`` for ``video_id`` and
        return its result as a bool."""
        return bool(New_YouTube_Crawler_Comment.main(video_id))

    def update_channel_info(self, channel_id, api_set=0):
        """
        Record the current subscriber and view counts of a channel.

        When the API response carries no subscriber count, the channel is
        flagged ``hidden_subscriber`` instead.

        :param channel_id: YouTube channel id
        :param api_set: which API key to use (0, 1, or anything else)
        :return: True on success; False on a non-200 response, when the API
            returns no item for the channel, or on any error
        """
        api_key = self._select_api_key(api_set)

        try:
            time.sleep(0.2)
            response = requests.get(
                f"{self._API_ROOT}/channels",
                params={
                    "part": "statistics",
                    "maxResults": 50,
                    "id": channel_id,
                    "key": api_key,
                },
            )
            if response.status_code != 200:
                return False

            items = response.json()["items"]
            if not items:
                # Unknown channel: do not flag it as hidden-subscriber.
                return False
            item = dict(items[0])

            try:
                subscriber_count = item["statistics"]["subscriberCount"]
            except (KeyError, TypeError):
                subscriber_count = None

            conn = self._connect()
            try:
                cur = conn.cursor()
                if subscriber_count is None:
                    cur.execute(
                        "UPDATE channel SET hidden_subscriber = true WHERE channel_id = %s;",
                        (channel_id,),
                    )
                else:
                    view_count = item["statistics"]["viewCount"]
                    now = time.time()
                    cur.execute(
                        """INSERT INTO channel_subscriber (channel_idx, subscriber_num, check_time)
                           VALUES ((SELECT idx from channel where channel.channel_id=%s), %s, to_timestamp(%s));
                           INSERT INTO channel_views (channel_idx, view_count, check_time)
                           VALUES ((SELECT idx from channel where channel.channel_id=%s), %s, to_timestamp(%s));""",
                        (channel_id, subscriber_count, now,
                         channel_id, view_count, now),
                    )
                conn.commit()
            finally:
                conn.close()

            return True
        except Exception:
            self._log_failure("update_channel_info")
            return False

    def insert_channel_info(self, channel_id):
        """
        Fill in the descriptive columns of a channel row and record its
        subscriber count.

        :param channel_id: YouTube channel id
        :return: True on success; False on a non-200 response, when the API
            returns no item for the channel, or on any error
        """
        try:
            response = requests.get(
                f"{self._API_ROOT}/channels",
                params={
                    "part": "id,snippet,contentDetails,statistics,topicDetails",
                    "maxResults": 50,
                    "id": channel_id,
                    "key": self.api_key,
                },
            )
            if response.status_code != 200:
                return False

            items = response.json()["items"]
            if not items:
                return False
            item = dict(items[0])
            snippet = item["snippet"]
            statistics = item["statistics"]

            params = (
                snippet["title"],
                snippet["description"],
                snippet["publishedAt"],
                item["contentDetails"]["relatedPlaylists"]["uploads"],
                statistics["hiddenSubscriberCount"],
                snippet["thumbnails"]["default"]["url"],
                channel_id,
                channel_id,
                statistics["subscriberCount"],
                time.time(),
            )

            # The original referenced bare ``database``/``user``/``password``
            # here, so this method always failed with a NameError.
            conn = self._connect()
            try:
                cur = conn.cursor()
                cur.execute(
                    """UPDATE channel
                          SET channel_name        = %s,
                              channel_description = %s,
                              channel_start_date  = to_date(%s, 'YYYY-MM-DD'),
                              upload_id = %s,
                              hidden_subscriber = %s,
                              thumbnail_url = %s
                        WHERE channel_id = %s;

                       INSERT INTO channel_subscriber (channel_idx, subscriber_num, check_time)
                       VALUES ((SELECT idx from channel where channel.channel_id=%s), %s, to_timestamp(%s));""",
                    params,
                )
                conn.commit()
            finally:
                conn.close()

            return True
        except Exception:
            self._log_failure("insert_channel_info")
            return False

    def update_video_info(self, upload_id, interval_day=30, api_set=0):
        """
        Store the videos of an uploads playlist that are at most
        ``interval_day`` days old.

        Paging stops at the first older video, when ``insert_video``
        reports a falsy result, or when there are no more pages. Nothing is
        committed if any step fails.

        :param upload_id: id of the channel's uploads playlist
        :param interval_day: maximum video age in days
        :param api_set: which API key to use (0, 1, or anything else)
        :return: True on success, False on any error
        """
        api_key = self._select_api_key(api_set)

        try:
            conn = self._connect()
            try:
                cur = conn.cursor()
                self._store_recent_uploads(cur, upload_id, interval_day, api_key)
                conn.commit()
            finally:
                conn.close()

            return True
        except Exception:
            self._log_failure("update_video_info")
            return False

    def __del__(self):
        """Shut the browser down if ``make_driver_ready`` started one."""
        if self.is_driver:
            try:
                # quit() ends the whole driver session; close() would only
                # close the current window.
                self.driver.quit()
            except Exception:
                # Best effort only: raising from __del__ is pointless.
                pass