Example #1
    def start_driver(self, key, msg=None, captcha=False):
        
        if msg:
            self.logger.info(f"{msg} ffprofile[{key}] driver init")
        opts = Options()
        opts.headless = not captcha  # show the browser only when a captcha has to be solved
            
        prof_ff = FirefoxProfile(self._FF_PROF[key])
        
        driver = Firefox(options=opts, firefox_profile=prof_ff)
        #driver.maximize_window()
        time.sleep(5)
        try:
            if not captcha: driver.install_addon("/Users/antoniotorres/projects/comic_getter/myaddon/web-ext-artifacts/myaddon-1.0.zip", temporary=True)
            driver.uninstall_addon('@VPNetworksLLC')
        except Exception as e:
            self.logger.warning(f"[start_driver][{key}] {e}")
            

        _title = driver.title
        driver.get("https://readcomiconline.li")

        # wait until the title changes from the startup page's, i.e. the site has actually loaded
        WebDriverWait(driver, 30).until_not(ec.title_is(_title))
        driver.add_cookie({'name': 'rco_quality','value': 'hq', 'domain': 'readcomiconline.li', 'path' : '/'})
        driver.add_cookie({'name': 'rco_readType','value': '1', 'domain': 'readcomiconline.li', 'path' : '/'})
        driver.refresh()
        self.logger.debug(f"driver[{key}] cookies: {driver.get_cookies()}")
        return driver
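A note on ordering in the snippet above: Selenium's add_cookie only accepts cookies for the domain of the page currently loaded, which is why the rco_* cookies are set only after driver.get() has navigated to the site. A minimal sketch of that constraint, assuming a plain Firefox driver with no profile or add-ons:

# Minimal sketch: navigate to the target domain before setting its cookies,
# then refresh so the site actually sees them.
from selenium.webdriver import Firefox

driver = Firefox()
driver.get("https://readcomiconline.li")
driver.add_cookie({'name': 'rco_quality', 'value': 'hq'})
driver.refresh()
driver.quit()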
Example #2
def load_cookies(victim, driver: webdriver.Firefox):
    sessions = len(victim)
    selection = next(iter(victim.keys()))
    if sessions > 1:
        selection = load_selection_screen(sessions, victim)
    if selection == "s":
        print("[+] Skipping...")
        return False
    for domain in victim[selection]:
        for cookie in domain:
            cookie_obj = {
                "name": cookie.name,
                "value": cookie.value,
                "domain": cookie.domain
            }
            driver.add_cookie(cookie_obj)
    print("[+] Cookies loaded successfully")
Example #3
 def load_cookie(self, session_id):
     with open(self.sessions_file) as f:
         sessions = json.load(f)
     cookie_path = sessions[str(session_id)]["session_path"]
     url = sessions[str(session_id)]["web_url"]
     # Set the user agent to the same one the session was saved with
     useragent = sessions[str(session_id)]["useragent"]
     profile = FirefoxProfile()
     profile.set_preference("general.useragent.override", useragent)
     with open(cookie_path, "rb") as f:
         cookies = pickle.load(f)
     try:
         browser = Firefox(profile)
     except Exception:
         error("Couldn't open browser to view session!")
         return
     browser.get(url)
     browser.delete_all_cookies()
     browser.execute_script("window.localStorage.clear()") # clear the current localStorage
     for cookie in cookies:
         browser.add_cookie(cookie)
     status(f"Session {session_id} loaded")
     browser.refresh()
     self.browsers.append(browser)
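load_cookie assumes a pickle that was produced earlier from a driver on the same site; a minimal sketch of that save side, with the file path as a placeholder (the sessions-file bookkeeping is omitted):

# Minimal save-side sketch (placeholder path): capture the cookies that
# load_cookie later restores.
import pickle
from selenium.webdriver import Firefox

browser = Firefox()
browser.get("https://example.com")
# ... log in here ...
with open("session_path.pkl", "wb") as f:
    pickle.dump(browser.get_cookies(), f)
browser.quit()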
Example #4
class AutoPostBot:
    def __init__(self, pagelink):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = Firefox(
            executable_path='/home/friend/TF2alertbot/geckodriver')
        self.driver.get('https://mbasic.facebook.com')
        self.cookies = pickle.load(open("cookies.pl", "rb"))
        for cookie in self.cookies:
            self.driver.add_cookie(cookie)
        self.driver.get(pagelink)

    def post_image(self, file_path, message):
        time.sleep(1)
        self.driver.find_element_by_name('view_photo').click()
        self.driver.find_element_by_name('file1').send_keys(file_path)
        self.driver.find_element_by_name('add_photo_done').click()
        time.sleep(1)
        for msg in message:
            self.driver.find_element_by_name('xc_message').send_keys(
                msg, Keys.ENTER)
        self.driver.find_element_by_name('view_post').click()
        self.driver.quit()
        self.display.stop()
Example #5
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        ua = UserAgent().random
        request.headers.setdefault('User-Agent', ua)
        # pass

        # chrome_options = Options()
        # chrome_options.add_argument('--headless')  # headless Chrome mode
        # chrome_options.add_argument('--disable-gpu')
        # chrome_options.add_argument('--no-sandbox')
        # # path to the Chrome driver
        # self.driver = webdriver.Chrome(chrome_options=chrome_options,
        #                                executable_path='D://BDCloundDown//chromedriver')
        # print("request.url:", request.url)
        # print("request.method:", request.method)
        # if request.method != 'POST':
        #     self.driver.get(request.url)
        #     time.sleep(1)
        #     html = self.driver.page_source
        #     self.driver.quit()
        #     return scrapy.http.HtmlResponse(url=request.url, body=html.encode('utf-8'), encoding='utf-8',
        #                                     request=request)
        # from_data = {
        #     'UserNameInput': 'LarvwuAX3KTyBFXtXkCcjcFRLzpSb/Ft6P1r29CxZcOlpn9Le8Q+LCQ3iTeXnW2ZdCKJsmA0tOyn4wF4C92vjs1Tg11lGxaroeAgGmSgZvBqyLQha2UNOM/MDHMroF1m9W5j92oe2jg2QPS4rTsCVRsnMcZCCd3y2iY/2/PBtx0=',
        #     'undefined': '0',
        #     'PasswordInput': ' dAaIbU2BmGOtFUVYm/gEM5yaZojqmtjifUJP2N+gkamNFyBqwec5ETXZFcji8orszLywEZPaJ1fQHOvZidQKhWLNtKDqBObcbrXlwgsQuX7ePqYBtP6qAc5JIQ/tfPcPYT6S0s4cCdAWGzyitt/L0jqf27XCael00UjFFLDswAU=',
        #     'IsKeepLoginState': 'true',
        #     'loginType': 'PC',
        # }
        #
        # return scrapy.FormRequest(url='https://passport.ch.com/zh_cn/Login/DoLogin',
        #                           formdata=from_data,
        #                           callback=ChunqiuSpider.islogin)

        if spider.name == 'chunqiu':
            if 'AirFlights/ActivitiesSecondKill' in request.url:
                options = Options()
                options.add_argument('-headless')

                # geckodriver has to be downloaded manually
                driver = Firefox(executable_path=
                                 'C:\\Users\\win7\\Desktop\\temp\\geckodriver',
                                 firefox_options=options)
                # driver.manage().addCookie(new Cookie("Name", "value", "domain", "/", expiry, false, false));

                driver.get(request.url)

                time.sleep(3)

                # driver.add_cookie(
                #     self.ReadTxtName('./cookie.txt')
                # )
                driver.add_cookie({
                    'name': 'limeimei',
                    'value': '222',
                    'domain': 'ch.com'
                })
                driver.add_cookie({
                    'name': 'limeimei',
                    'value': '222',
                    'domain': 'pages.ch.com'
                })

                print(driver.get_cookies())
                time.sleep(30)
                driver.get(request.url)
                print(driver.get_cookies())
                # searchText = driver.find_element_by_xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a')[0].text
                #  search_results = WebDriverWait(driver,10).until(
                #      # lambda d: d.find_elements_by_xpath('//h3[@class="t c-title-en"] | //h3[@class="t"]')
                #      lambda e: e.find_elements_by_xpath('//div[contains(@class,"threadlist_title pull_left j_th_tit ")]/a[1]')
                #  )
                #  for item in search_results:
                #      print(item.text)  # print each search result's title

                html = driver.page_source
                driver.quit()

                # build a response and hand it back to the spider engine
                return HtmlResponse(url=request.url,
                                    body=html,
                                    request=request,
                                    encoding='utf-8')
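For this middleware to run at all, it has to be registered in the Scrapy project's settings; a minimal sketch, with the module path and class name as placeholders for wherever this class actually lives:

# settings.py sketch -- module path and class name are placeholders.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}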
Example #6
class Instagram():
    def __init__(self):
        self.opts = Options()
        self.opts.set_headless()
        assert self.opts.headless
        self.browser = Firefox(options=self.opts)
        self.url = "https://instagram.com"

        self.username = ""
        self.password = ""

        self.collected_info = {}
        self.collected_info['username'] = []
        self.collected_info['biography'] = []
        self.collected_info['external_url'] = []
        self.collected_info['followers'] = []
        self.collected_info['followed'] = []
        self.collected_info['full_name'] = []
        self.collected_info['profile_pic'] = []
        self.collected_info['fbid'] = []
        self.collected_info['is_private'] = []

        self.followers_c = 0
        self.followers_list = []
        self.all_followers = []
        self.followers_count = 0

        self.following_count = 0
        self.following_c = 0
        self.following_list = []
        self.all_following = []

        self.variables = {}
        self.variables["first"] = 50
        self.variables["id"] = 0

        self.graphql_endpoint = "view-source:https://www.instagram.com/graphql/query/"
        self.graphql_followers = (
            self.graphql_endpoint +
            "?query_hash=37479f2b8209594dde7facb0d904896a")
        self.graphql_following = (
            self.graphql_endpoint +
            "?query_hash=58712303d941c6855d4e888c5f0cd22f")

        self.login()

    def login(self):
        print("[INFO] Autorization")
        self.browser.get(self.url)
        try:
            for cookie in pickle.load(open("Session.pkl", "rb")):
                self.browser.add_cookie(cookie)
            print("[INFO] Session has been restored")
        except:
            print("[WARNING] Session cant be restored")

            self.username = input("Please enter username: "******"Please enter password: "******"input[name='username']")))
            password = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "input[name='password']")))
            try:
                try:
                    username.clear()
                    username.send_keys(self.username)
                    print("[INFO]Username successfuly entered")
                except:
                    print("[ERROR] Username error")
                try:
                    password.clear()
                    password.send_keys(self.password)
                    print("[INFO] Password successfuly entered")
                except:
                    print("[ERROR] Password error")
                try:
                    Login_button = WebDriverWait(self.browser, 2).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR,
                             "button[type='submit']"))).click()
                    print("[INFO]Successfuly logged in")
                except:
                    print("[ERROR] Login error")
                try:
                    not_now = WebDriverWait(self.browser, 10).until(
                        EC.element_to_be_clickable((
                            By.XPATH,
                            "//button[contains(text(), 'Not Now')]"))).click()
                    print("[INFO] Login info dont saved")
                except:
                    print("[ERROR] Login info popup error")
                try:
                    not_now2 = WebDriverWait(self.browser, 10).until(
                        EC.element_to_be_clickable((
                            By.XPATH,
                            "//button[contains(text(), 'Not Now')]"))).click()
                    print("[INFO] Notifications dont turn on")
                except:
                    print("[ERROR] Notifications popup error")
                try:
                    pickle.dump(self.browser.get_cookies(),
                                open("Session.pkl", "wb"))
                    print("[INFO] Session has been saved")
                except:
                    print("[WARNING] Cant save session")
            except NoSuchElementException:
                print("[WARNING] Wrongusername or password")

    def get_id(self):
        try:
            user_id = self.browser.execute_script(
                "return window.__additionalData[Object.keys(window.__additionalData)[0]].data.graphql.user.id"
            )
        except WebDriverException:
            user_id = self.browser.execute_script(
                "return window._sharedData.entry_data.ProfilePage[0].graphql.user.id"
            )
        return user_id

    def progress(self, count, total, status=''):
        bar_len = 60
        filled_len = int(round(bar_len * count / float(total)))

        percents = round(100.0 * count / float(total), 1)
        bar = '=' * filled_len + '-' * (bar_len - filled_len)

        sys.stdout.write('[%s] %s%s Collecting %s\r' %
                         (bar, percents, '%', status))
        sys.stdout.flush()

    def collect_info(self, username):
        print("[INFO] Collecting information")
        try:
            self.browser.get("view-source:https://www.instagram.com/" +
                             username + "/?__a=1")
        except:
            print("[ERROR] Information error with open page")
        try:
            pre = self.browser.find_element_by_tag_name("pre").text
        except:
            print("[ERROR] with Information find element pre")
        try:
            self.data = json.loads(pre)["graphql"]
        except:
            print("[ERROR]", 'graphql')
        try:
            biography = self.data["user"]["biography"]
        except:
            print("[ERROR]", 'biography')
        try:
            external_url = self.data["user"]["external_url"]
        except:
            print("[ERROR]", 'external_url')
        try:
            followers = self.data["user"]["edge_followed_by"]['count']
        except:
            print("[ERROR]", 'followers')
        try:
            followed = self.data["user"]["edge_follow"]['count']
        except:
            print("[ERROR]", 'followed')
        try:
            full_name = self.data["user"]["full_name"]
        except:
            print("[ERROR]", 'full name')
        try:
            profile_pic = self.data["user"]["profile_pic_url_hd"]
        except:
            print("[ERROR]", 'profile pic')
        try:
            fbid = self.data["user"]["fbid"]
        except:
            print("[ERROR]", 'fbid')
        try:
            is_private = self.data["user"]["is_private"]
        except:
            print("[ERROR]", 'is_private')
        try:
            self.collected_info['username'] = username
        except:
            print("[ERROR]", 'append username')
        try:
            self.collected_info['biography'] = biography
        except:
            print("[ERROR]", '= biography')
        try:
            self.collected_info['external_url'] = external_url
        except:
            print("[ERROR]", '= external url')
        try:
            self.collected_info['followers'] = followers
        except:
            print("[ERROR]", '= followers')
        try:
            self.collected_info['followed'] = followed
        except:
            print("[ERROR]", '= followed')
        try:
            self.collected_info['full_name'] = full_name
        except:
            print("[ERROR]", '= full name')
        try:
            self.collected_info['profile_pic'] = profile_pic
        except:
            print("[ERROR]", '= profile pic')
        try:
            self.collected_info['fbid'] = fbid
        except:
            print("[ERROR]", "fbid")
        try:
            self.collected_info['is_private'] = is_private
        except:
            print("[ERROR]", "append is_private")
        # print(self.collected_info)
        return self.collected_info

    def next_page_followers(self, id):
        try:
            time.sleep(5)
            pre = self.browser.find_element_by_tag_name("pre").text
            self.data = json.loads(pre)["data"]
            page_info = self.data["user"]["edge_followed_by"]["page_info"]
            self.variables["id"] = id
            self.variables["after"] = page_info["end_cursor"]
            url = "{}&variables={}".format(self.graphql_followers,
                                           str(json.dumps(self.variables)))
            self.browser.get(url)
        except:
            print('[ERROR] With open next page followers')

    def get_followers(self):
        try:
            pre = self.browser.find_element_by_tag_name("pre").text
            data = json.loads(pre)["data"]
            # print(pre)
            page_info = data["user"]["edge_followed_by"]["page_info"]
            edges = data["user"]["edge_followed_by"]["edges"]
            self.followers_count = data["user"]["edge_followed_by"]["count"]
            all_followers = []
            for user in edges:
                self.followers_c += 1
                all_followers.append(user["node"]["username"])
            return all_followers
        except:
            print('[ERROR] With get followers')

    def collect_followers(self, username):
        try:
            self.browser.get(self.url + '/' + username)
        except:
            print("[ERROR] With opening page of", username)
        try:
            followers_list = []
            all_followers = []
            self.variables["id"] = self.get_id()
            self.i = 0
            self.sc_rolled = 0
            self.variables["first"] = 50
        except:
            print("[ERROR] with set vars of followers")
        print("[INFO] Collecting followers of", username)

        try:
            followers_url = "{}&variables={}".format(
                self.graphql_followers, str(json.dumps(self.variables)))
            self.browser.get(followers_url)
        except:
            print("[ERROR] With opening followers page")

        pre = self.browser.find_element_by_tag_name("pre").text
        self.data = json.loads(pre)["data"]
        self.followers_count = self.data["user"]["edge_followed_by"]["count"]
        self.flag = True
        i = 1
        try:
            print("[INFO] Followers count:", self.followers_count)
            while self.flag:
                users = self.get_followers()
                self.next_page = self.data["user"]["edge_followed_by"][
                    "page_info"]["has_next_page"]
                followers_list.append(users)
                self.progress(self.followers_c, self.followers_count,
                              "Followers, page:" + str(i))
                iter = self.followers_count / self.variables["first"]
                if i < iter:
                    if self.sc_rolled > 10:
                        print("[INFO] Queried too much! ~ sleeping a bit :>")
                        time.sleep(60)
                        self.sc_rolled = 0
                    self.sc_rolled += 1
                    i += 1
                    self.next_page_followers(self.variables["id"])
                else:
                    self.flag = False
        except:
            print("[ERROR] With np")
        print("[INFO] followerscount:", self.followers_count)
        print("[INFO] followers saved:", self.followers_c)
        if (self.followers_count != self.followers_c):
            print("[WARNING] Not all followers have been saved")
        for foll in followers_list:
            for f in foll:
                all_followers.append(f)
        self.followers_c = 0
        return all_followers

    def next_page_following(self, id):
        try:
            time.sleep(5)
            pre = self.browser.find_element_by_tag_name("pre").text
            self.data = json.loads(pre)["data"]
            page_info = self.data["user"]["edge_follow"]["page_info"]
            self.variables["id"] = id
            self.variables["after"] = page_info["end_cursor"]
            url = "{}&variables={}".format(self.graphql_following,
                                           str(json.dumps(self.variables)))
            self.browser.get(url)
        except:
            print('[ERROR] With next page following')

    def get_following(self):
        try:
            pre = self.browser.find_element_by_tag_name("pre").text
            data = json.loads(pre)["data"]
            # print(data)
            page_info = data["user"]["edge_follow"]["page_info"]
            edges = data["user"]["edge_follow"]["edges"]
            self.following_count = data["user"]["edge_follow"]["count"]
            all_following = []
            for user in edges:
                self.following_c += 1
                all_following.append(user["node"]["username"])
            return all_following
        except:
            print('[ERROR] With get following')

    def collect_following(self, username):
        try:
            self.browser.get(self.url + '/' + username)
        except:
            print("[ERROR] With opening page of", username)
        try:
            following_list = []
            all_following = []
            self.variables["id"] = self.get_id()
            self.i = 0
            self.sc_rolled = 0
        except:
            print("[ERROR] With following vars")
        print("[INFO] Collecting following of", username)

        try:
            following_url = "{}&variables={}".format(
                self.graphql_following, str(json.dumps(self.variables)))
            self.browser.get(following_url)
        except:
            print("[ERROR] with opening following page")

        pre = self.browser.find_element_by_tag_name("pre").text
        self.data = json.loads(pre)["data"]
        self.following_count = self.data["user"]["edge_follow"]["count"]
        self.flag = True
        i = 1
        try:
            print("[INFO] Following by:", self.following_count)
            while self.flag:
                users = self.get_following()
                self.next_page = self.data["user"]["edge_follow"]["page_info"][
                    "has_next_page"]
                following_list.append(users)
                self.progress(self.following_c, self.following_count,
                              "Followings, page:" + str(i))
                iter = self.following_count / self.variables["first"]
                if i < iter:
                    if self.sc_rolled > 10:
                        print("[INFO] Queried too much! ~ sleeping a bit :>")
                        time.sleep(60)
                        self.sc_rolled = 0
                    self.sc_rolled += 1
                    i += 1
                    self.next_page_following(self.variables["id"])
                else:
                    self.flag = False
        except:
            print("[ERROR] with np following")
        print("[INFO] following count:", self.following_count)
        print("[INFO] following saved:", self.following_c)
        if (self.following_count != self.following_c):
            print("[WARNING] Not all following have been saved")
        for foll in following_list:
            for f in foll:
                all_following.append(f)
        self.following_c = 0
        return all_following
Example #7
class GetCompanyInfo(object):
    """
    Scrape company information from Tianyancha (天眼查)
    """
    def __init__(self):
        """
        Initialize the crawler agent; uses Firefox for access
        """
        self.username = ''
        self.password = ''
        self.options = webdriver.FirefoxOptions()
        self.options.add_argument('-headless')  # headless flag
        self.geckodriver = r'geckodriver'
        self.driver = Firefox(executable_path=self.geckodriver,
                              firefox_options=self.options)

        self.start_url = 'https://www.tianyancha.com'

    def test(self):
        """
        For debugging only
        :return:
        """
        start_url = ''
        self.driver.get(start_url)

        # NOTE: `cookies` here is assumed to be a module-level dict of name -> value
        for k, v in cookies.items():
            self.driver.add_cookie({'name': k, 'value': v})
        time.sleep(1)
        print(self.driver.page_source)
        self.driver.close()

    def login(self):
        """
        Log in and check the login status
        :return:
        """
        try:
            self.driver.get(self.start_url)

            print(self.driver.get_cookies())

            username = self.index_login()
            username_pattern = username[:3] + ' **** ' + username[-4:]
            print(username_pattern)
            page = self.driver.page_source
            is_login = page.find(username_pattern)

            print(is_login)
            if is_login != -1:
                print('Login successful')
        except Exception as e:
            print(e)

    def index_login(self):
        """
        Login flow from the home page
        :return:
        """
        get_login = self.driver.find_elements_by_xpath(
            '//a[@class="media_port"]')[0]  # 登录/注册
        print(get_login.text)
        # the link leads to the login input form
        get_login.click()
        login_by_pwd = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div[2]/div')  # 切换到手机登录
        print(login_by_pwd.text)
        login_by_pwd.click()
        input1 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[2]/input')  # 手机号码

        input2 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[3]/input')  # 密码
        print(input1.get_attribute('placeholder'))
        print(input2.get_attribute('placeholder'))

        username, password = self._check_user_pass()
        input1.send_keys(username)
        input2.send_keys(password)

        login_button = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[5]')  # 点击登录
        print(login_button.text)
        time.sleep(1)  # must wait here, otherwise the site flags us as a crawler
        login_button.click()
        return username

    def _check_user_pass(self):
        """
        Check whether a username and password are already set
        :return:
        """
        if self.username and self.password:
            return self.username, self.password
        else:
            username = input('Enter your phone number\n')
            password = input('Enter your password\n')
            return username, password

    def login_page_login(self):
        """
        url:www.tianyancha.com/login
        Login flow for this URL
        :return:
        """
        input1 = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[2]/input')

        input2 = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[3]/input')
        print(input1.get_attribute('placeholder'))
        input1.send_keys("")
        print(input2.get_attribute('placeholder'))
        input2.send_keys('')

        login_button = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[5]')

        print(login_button.text)
        time.sleep(1)
        login_button.click()

    def get_company_info(self, company_name, company_owner):
        """
        Fetch the desired company information
        :param company_name:
        :param company_owner:
        :return:
        """
        try:
            time.sleep(1)
            index_input_company = self.driver.find_element_by_xpath(
                '//input[@id="home-main-search"]')  # 主页搜索框

            index_input_company.send_keys(company_name)
            self.driver.find_element_by_xpath(
                '//div[contains(@class, "input-group-addon")'
                ' and contains(@class, "search_button")'
                ' and contains(@class, " white-btn")'
                ']').click()  # click search
            # button_name = find_company_button.find_element_by_xpath('//span').text    # the span text should read "天眼一下"
            # print(button_name)

            # time.sleep(1)
            company_list = self.driver.find_elements_by_xpath(
                '//div[contains(@class, "b-c-white")'
                ' and contains(@class, "search_result_container")'
                ']/div')  # the divs of every company on the current page
            company_info = list()
            for each_company in company_list:
                company_name_from_web = each_company.find_element_by_tag_name(
                    'img').get_attribute('alt')
                company_url = each_company.find_element_by_tag_name(
                    'a').get_attribute('href')
                company_reg_money = each_company.\
                    find_element_by_css_selector('div .search_row_new.pt20 div div:nth-child(2) span').text
                company_reg_time = each_company.\
                    find_element_by_css_selector('div .search_row_new.pt20 div div:nth-child(3) span').text
                company_score = each_company.find_element_by_css_selector(
                    '.c9.f20').text
                company_info.append([
                    company_name_from_web, company_url, company_reg_money,
                    company_reg_time, company_score + '分'
                ])  # collect the URL and fields
                print(company_info[-1])

            print('Number of matching companies:', len(company_info))
            if company_info:
                for each_list in company_info:
                    if each_list[0] == company_name:
                        return 'Crawl succeeded: ' + str(each_list)
                        # self.driver.get(each_list[1])     # open the company detail page
                        # score = self.driver.find_element_by_class_name('td-score-img').get_attribute('alt')
                        # print(score)
                return 'Crawl succeeded'
            else:
                return 'Crawl failed'
        except Exception as e:
            print(e)

    def main(self):

        self.login()
        msg = self.get_company_info('*****软件有限公司', '')
        print(msg)
        print('crawl finish...')

        self.driver.close()
Example #8
class SeleniumClass(object):
    def __init__(self,
                 v_bShow=False,
                 v_WorkingPath='',
                 v_UseCookies=False,
                 v_Engine='Chrome',
                 v_EngineDriver=None):
        LOG_FORMAT = ('%(levelname) -5s %(asctime)s %(name) -20s %(funcName) '
                      '-25s %(lineno) -5d: %(message)s')

        logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

        self.StepTimeout = 15
        self.WaitTimeout = 2
        self.CookiesFile = ''
        self.UseCookies = v_UseCookies
        self.Engine = v_Engine
        self.EngineDriver = v_EngineDriver
        self.EngineDriverPath = ''

        if (v_WorkingPath != ''):
            LOGGER.info('WorkingPath: ' + os.path.realpath(v_WorkingPath))
            if (self.Engine == 'IE'):
                self.CookiesFile = os.path.realpath(
                    v_WorkingPath) + '\\cookies.pkl'
            else:
                self.CookiesFile = os.path.realpath(
                    v_WorkingPath) + '/cookies.pkl'

        if (self.Engine == 'Chrome'):
            self.EngineDriver = '/usr/local/sbin/chromedriver'
            opts = ChOptions()
            opts.add_argument("binary_location = '/usr/bin/'")
        elif (self.Engine == 'IE'):
            opts = IeOptions()
            opts.add_argument(
                "binary_location = 'C:\\Program Files (x86)\\Internet Explorer'"
            )
        elif (self.Engine == 'Firefox'):
            self.EngineDriver = '/usr/local/sbin/geckodriver'
            opts = GkOptions()
            opts.add_argument("binary_location = '/usr/bin/'")

        LOGGER.info('Engine: ' + self.Engine)
        LOGGER.info('EngineDriver: ' + self.EngineDriver)

        self.EngineDriverPath = os.path.dirname(
            os.path.abspath(self.EngineDriver))
        sys.path.insert(0, self.EngineDriverPath)

        opts.add_argument("--start-maximized")
        opts.add_argument("--enable-automation")
        opts.add_argument("--log-level=3")
        opts.add_argument("--silent")
        opts.add_argument("--disable-infobars")
        opts.add_argument("--disable-dev-shm-usage")
        opts.add_argument("--disable-browser-side-navigation")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
        opts.add_argument("--no-zygote")
        if (not v_bShow):
            LOGGER.info('Headless Operation')
            opts.add_argument("--headless")
            opts.add_argument("--disable-setuid-sandbox")

        if (self.Engine == 'Chrome'):
            self.Browser = ChWebBrowser(self.EngineDriver, options=opts)
        elif (self.Engine == 'IE'):
            self.Browser = IeWebBrowser(self.EngineDriver, options=opts)
        elif (self.Engine == 'Firefox'):
            self.Browser = GkWebBrowser(self.EngineDriver, options=opts)

        if (self.UseCookies):
            try:
                if ((self.CookiesFile != '')
                        and (os.path.isfile(self.CookiesFile))):
                    for cookie in pickle.load(open(self.CookiesFile, "rb")):
                        self.Browser.add_cookie(cookie)
                    LOGGER.info('Cookies Loaded')
            except Exception as Exc:
                LOGGER.info('Could Not Load Cookies ' + self.CookiesFile +
                            ' - (' + str(Exc).strip() + ')')

        self.Browser.set_window_size(1920, 1080)
        self.Browser.set_window_position(0, 0)

    def SwitchContext(self,
                      v_ObjIdentity,
                      v_TypeOfIdentity='ID',
                      v_TimeOut=None):
        if (v_TimeOut is None):
            v_TimeOut = self.StepTimeout

        LOGGER.info("ObjIdentity: " + str(v_ObjIdentity) + " (" +
                    str(v_TypeOfIdentity).strip().upper() + ")" +
                    "; TimeOut: " + str(v_TimeOut))

        CtxReturn = False

        try:
            if (type(v_ObjIdentity) == int):
                self.Browser.switch_to.frame(v_ObjIdentity)
                CtxReturn = True
            else:
                if ((str(v_ObjIdentity).strip().upper() == 'DEFAULT')
                        or (str(v_ObjIdentity).strip().upper() == 'MAIN')):
                    self.Browser.switch_to.default_content()
                    CtxReturn = True
                else:
                    frmElm = self.GetElement(v_ObjIdentity, v_TypeOfIdentity,
                                             v_TimeOut)
                    if (frmElm is False):
                        LOGGER.info('iFrame "' + v_ObjIdentity +
                                    '" not founded')
                        CtxReturn = False
                        raise TimeoutException
                    else:
                        self.Browser.switch_to.frame(frmElm)
                        CtxReturn = True
        except Exception as Exc:
            LOGGER.info('Could not switch to iFrame (' + str(Exc).strip() + ')')
            CtxReturn = False

        return CtxReturn

    def FindElement(self,
                    v_ObjIdentity,
                    v_TypeOfIdentity='ID',
                    v_TimeOut=None):
        if (v_TimeOut is None):
            v_TimeOut = self.StepTimeout

        bReturn = False

        LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" +
                    v_TypeOfIdentity.strip().upper() + ")" + "; TimeOut: " +
                    str(v_TimeOut))

        wait = WebDriverWait(self.Browser, v_TimeOut)

        if (v_TypeOfIdentity.strip().upper() == 'JSID'):
            try:
                for IdTime in range(v_TimeOut - 1):
                    JsCode = ("document.getElementById('" + v_ObjIdentity +
                              "');")
                    JsReturn = self.ExecJsScript(JsCode, True)
                    if 'webdriver.remote.webelement.WebElement' in JsReturn:
                        break
                    time.sleep(1)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True  # only report success when no exception occurred
        elif (v_TypeOfIdentity.strip().upper() == 'JSXPATH'):
            try:
                for IdTime in range(v_TimeOut - 1):
                    JsCode = ("document.evaluate(" + v_ObjIdentity + ", " +
                              "document, null, " +
                              "XPathResult.FIRST_ORDERED_NODE_TYPE, null" +
                              ");")
                    JsReturn = self.ExecJsScript(JsCode, True)
                    if 'webdriver.remote.webelement.WebElement' in JsReturn:
                        break
                    time.sleep(1)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'JSCLASS'):
            try:
                for IdTime in range(v_TimeOut - 1):
                    JsCode = ("document.getElementsByClassName('" +
                              v_ObjIdentity + "')[0];")
                    JsReturn = self.ExecJsScript(JsCode, True)
                    if 'webdriver.remote.webelement.WebElement' in JsReturn:
                        break
                    time.sleep(1)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'JSNAME'):
            try:
                for IdTime in range(v_TimeOut - 1):
                    JsCode = ("document.getElementsByName('" + v_ObjIdentity +
                              "')[0];")
                    JsReturn = self.ExecJsScript(JsCode, True)
                    if 'webdriver.remote.webelement.WebElement' in JsReturn:
                        break
                    time.sleep(1)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'JSCSS'):
            try:
                for IdTime in range(v_TimeOut - 1):
                    JsCode = ("document.querySelectorAll('" + v_ObjIdentity +
                              "')[0];")
                    JsReturn = self.ExecJsScript(JsCode, True)
                    if 'webdriver.remote.webelement.WebElement' in JsReturn:
                        break
                    time.sleep(1)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'ID'):
            try:
                wait.until(
                    expected_conditions.visibility_of_element_located(
                        (By.ID, v_ObjIdentity)))
                self.Browser.find_element_by_id(v_ObjIdentity)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'XPATH'):
            try:
                wait.until(
                    expected_conditions.visibility_of_element_located(
                        (By.XPATH, v_ObjIdentity)))
                self.Browser.find_element_by_xpath(v_ObjIdentity)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'CLASS'):
            try:
                wait.until(
                    expected_conditions.visibility_of_element_located(
                        (By.CLASS_NAME, v_ObjIdentity)))
                self.Browser.find_element_by_class_name(v_ObjIdentity)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'NAME'):
            try:
                wait.until(
                    expected_conditions.visibility_of_element_located(
                        (By.NAME, v_ObjIdentity)))
                self.Browser.find_element_by_name(v_ObjIdentity)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True
        elif (v_TypeOfIdentity.strip().upper() == 'CSS'):
            try:
                wait.until(
                    expected_conditions.visibility_of_element_located(
                        (By.CSS_SELECTOR, v_ObjIdentity)))
                self.Browser.find_element_by_css_selector(v_ObjIdentity)
            except NoSuchElementException:
                bReturn = False
            except TimeoutException:
                bReturn = False
            else:
                bReturn = True

        if (bReturn):
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" +
                        v_TypeOfIdentity.strip().upper() + "); " + "FOUND")
        else:
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" +
                        v_TypeOfIdentity.strip().upper() + "); " +
                        "NOT FOUND")

        return bReturn

    def GetElement(self, v_ObjIdentity, v_TypeOfIdentity='ID', v_TimeOut=None):
        if (v_TimeOut is None):
            v_TimeOut = self.StepTimeout

        LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" +
                    v_TypeOfIdentity.strip().upper() + "); " + "TimeOut: " +
                    str(v_TimeOut))

        ElmReturn = False

        try:
            if (self.FindElement(v_ObjIdentity, v_TypeOfIdentity, v_TimeOut)):
                if ((v_TypeOfIdentity.strip().upper() == 'ID')
                        or (v_TypeOfIdentity.strip().upper() == 'JSID')):
                    ElmReturn = self.Browser.find_element_by_id(v_ObjIdentity)
                elif (v_TypeOfIdentity.strip().upper() == 'XPATH'):
                    ElmReturn = self.Browser.find_element_by_xpath(
                        v_ObjIdentity)
                elif (v_TypeOfIdentity.strip().upper() == 'CLASS'):
                    ElmReturn = self.Browser.find_element_by_class_name(
                        v_ObjIdentity)
                elif (v_TypeOfIdentity.strip().upper() == 'NAME'):
                    ElmReturn = self.Browser.find_element_by_name(
                        v_ObjIdentity)
                elif (v_TypeOfIdentity.strip().upper() == 'CSS'):
                    ElmReturn = self.Browser.find_element_by_css_selector(
                        v_ObjIdentity)
            else:
                ElmReturn = False
        except NoSuchElementException:
            ElmReturn = False
        except TimeoutException:
            ElmReturn = False

        if (ElmReturn):
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" +
                        v_TypeOfIdentity.strip().upper() + "); " +
                        "ELEMENT OK")
        else:
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" +
                        v_TypeOfIdentity.strip().upper() + "); " +
                        "ELEMENT NOT OK")

        return ElmReturn

    def ExecJsScript(self, v_JsScript, v_ReturnValue=True, v_Verbose=False):
        if (v_ReturnValue):
            JsScript = 'return ' + v_JsScript
        else:
            JsScript = v_JsScript

        if (v_Verbose):
            LOGGER.info("ExecJsScript: " + JsScript)

        JsReturn = str(self.Browser.execute_script(JsScript))

        if (v_ReturnValue):
            if (v_Verbose):
                LOGGER.info(JsReturn)

            return JsReturn
示例#10
0
class Browser:
    max_wait = 10

    def __init__(self, name, headless=False):
        self.name = name
        self.headless = headless
        self.username = None
        self.start()

    def start(self):
        self.log('starting')
        options = Options()
        if self.headless:
            options.add_argument('--headless')
        self.driver = Firefox(options=options)
        self.elem = None
        self.log('started')

    def get(self, url):
        self.driver.get(url)

    def maximize(self):
        self.driver.maximize_window()
        self.log('maximize')

    def js(self, js):
        out = self.driver.execute_script(js)
        self.log('js', out=out)

    def bottom(self):
        self.js('window.scrollTo(0, document.body.scrollHeight);')

    def size(self, width=800, height=600):
        self.driver.set_window_size(width, height)
        self.log(f'width: {width}, height: {height}')

    def user(self):
        self.username = input('username: ')
        self.password = input('password: ')

    def save_cookies(self):
        cookies = self.driver.get_cookies()
        with open(f'{self.name}.pkl', 'wb') as f:
            pickle.dump(cookies, f)
        self.log('cookies saved')

    def load_cookies(self):
        with open(f'{self.name}.pkl', 'rb') as f:
            cookies = pickle.load(f)
            for cookie in cookies:
                self.driver.add_cookie(cookie)
        self.log('cookies loaded')

    def log(self, message, **kwargs):
        print(f'browser: {message}', kwargs)

    def html(self):
        html = self.driver.page_source
        self.log(html)

    def done(self):
        self.log('closing')
        self.elem = None
        self.username = None
        self.password = None
        self.driver.close()
        self.log('done')

    def pause(self, seconds):
        self.log('sleep', seconds=seconds)
        time.sleep(seconds)

    def find(self, selector):
        self.log('finding', selector=selector)
        wait = WebDriverWait(self.driver, self.max_wait)
        self.elem = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
        self.log('found', elem=self.elem)

    def type(self, value):
        self.elem.send_keys(value)
        if value == self.password:
            self.log('type password')
        else:
            self.log(f'type: {value}')

    def click(self):
        self.elem.click()
        self.log('click')

    def enter(self):
        self.type(Keys.ENTER)

    def screenshot(self, name, show=False):
        image = Image.open(BytesIO(self.elem.screenshot_as_png))
        fname = f'./{name}.png'
        image.save(fname)
        self.log(fname)
        if show:
            image.show()
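A hypothetical driving script for the Browser wrapper above (the URL and selector are assumptions):

# Hypothetical usage sketch of the Browser wrapper.
b = Browser('demo', headless=True)
b.get('https://example.com/login')
b.user()                          # prompt for username and password
b.find("input[name='username']")
b.type(b.username)
b.enter()
b.save_cookies()
b.done()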
Example #10
def copy_cookies(fromd: webdriver.Firefox,
                 tod: webdriver.Firefox,
                 clear=False):
    if clear: tod.delete_all_cookies()
    for cookie in fromd.get_cookies():
        tod.add_cookie(cookie)
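Note that add_cookie rejects cookies whose domain does not match the page currently loaded, so copy_cookies only works once the destination driver is already on the relevant site. A minimal usage sketch:

# Minimal usage sketch: both drivers must have the target domain loaded.
src = webdriver.Firefox()
dst = webdriver.Firefox()
src.get("https://example.com")   # ... log in on src ...
dst.get("https://example.com")
copy_cookies(src, dst, clear=True)
dst.refresh()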
Example #11
class Account():
    def __init__(self, signal, lock, name, people, headless=True, debug=False):
        # Variables
        self.start = time()
        self.convLists = ['Conversation List',
                          'Λίστα συζητήσεων']  # Add your language here
        self.name = name
        self.people = people
        self.lock = lock
        self.signal = signal
        self.path = dirname(realpath(__file__))
        self.cookies = f'{self.path}/cookies/{self.name}_cookies.pkl'
        self.mediapath = f'{self.path}/media/{self.name}'
        self.logpath = f'{self.path}/media/{self.name}/last.log'
        self.debug = debug
        self.logFile = f'{self.path}/logs/{self.name}_{strftime("%d-%m-%y-%H.%M.%S")}.log'
        self.url_messages = 'https://www.facebook.com/messages/t/'
        self.url_home = 'https://facebook.com'
        self.timeout = 1
        self.iter_L = 60
        self.iter = self.iter_L
        self.maxBack = 30
        self.faults = 0
        self.faultThreshold = 3

        # Initialize Gecko driver
        agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.9 Safari/537.36'
        profile = FirefoxProfile()
        profile.set_preference('general.useragent.override', agent)
        options = Options()
        options.headless = headless
        self.driver = Firefox(profile, options=options)

        # Make folders
        self.makeFolder(f'{self.path}/logs')
        self.makeFolder(self.mediapath)
        for person in self.people:
            self.makeFolder(f'{self.mediapath}/{person}')

    def getToMessages(self):
        # Points driver to url_messages
        if self.url_messages != self.driver.current_url:
            self.driver.get(self.url_messages)
            self.waitU('_1enh')

    def getToHome(self):
        # Points driver to url_home
        self.driver.get(self.url_home)
        self.waitU('facebook')

    def waitU(self, element_id):
        # Waits until the element element_id is available
        try:
            element_present = EC.presence_of_element_located(
                (By.ID, element_id))
            WebDriverWait(self.driver, self.timeout).until(element_present)
        except TimeoutException:
            pass

    def getConversations(self, people):
        # Locates element of conversation
        self.logIt(f'Get to conversation with list {people}')
        for convList in self.convLists:
            if convList in self.driver.page_source:
                conversations = self.driver.find_elements_by_xpath(
                    f'//ul[@aria-label="{convList}"]//li[@tabindex="-1"]')
        conv = []
        for i in conversations:
            person = findall(
                'data-href="https://www.facebook.com/messages/t/(.*?)"',
                i.get_attribute('innerHTML'))[0]
            if person in people:
                conv.append(i)
        return conv

    def getToThread(self, num):
        # Wrapper to download media
        self.logIt(f'Getting to thread, media {num}')
        self.waitU('_3m31')
        threads = self.driver.find_elements_by_xpath('//a[@class="_3m31"]')
        thread = threads[num]
        if self.existsMedia(thread.get_attribute('innerHTML')):
            return
        thread.click()
        self.waitU('_4-od')
        self.getMedia()
        thread.send_keys(Keys.ESCAPE)
        return len(threads)

    def login(self, username, password):
        # Logs in to account, USE WITH CAUTION
        self.logIt(
            'Trying to login, USE WITH CAUTION!! (sleeping for 10 secs)')
        sleep(10)
        driver = self.driver
        self.getToHome()
        elem = driver.find_element_by_id('email')
        elem.send_keys(username)
        elem = driver.find_element_by_id('pass')
        elem.send_keys(password)
        elem.send_keys(Keys.RETURN)
        self.waitU('userNav')
        self.saveCookies()

    def saveCookies(self):
        # Saves the cookies
        self.logIt(f'Saving cookies')
        self.makeFolder(f'{self.path}/cookies/')
        if Path(self.cookies).is_file():
            move(self.cookies, f'{self.cookies}.bak')
        with open(self.cookies, 'wb') as filehandler:
            dump(self.driver.get_cookies(), filehandler)

    def loadCookies(self):
        # Loads the cookies
        if exists(self.cookies):
            self.getToHome()
            self.logIt(f'Loading cookies')
            with open(self.cookies, 'rb') as cookiesfile:
                cookies = load(cookiesfile)
                for cookie in cookies:
                    self.driver.add_cookie(cookie)
        else:
            raise ValueError('Cookies file not found!')

    def isRead(self, person, elem):
        # Checks if the conversation is read
        if '_1ht3' in elem.get_attribute('outerHTML'):
            self.logIt(f'Conversation with {person} is not read')
            self.iter = 2
            return False
        else:
            self.logIt(f'Conversation with {person} is read')
            return True

    def getPerson(self, person, override):
        # Checks if the conversation is read and if so it gets in
        self.person = person
        try:
            elem = self.getConversations(person)[0]
            if self.isRead(person, elem) or override:
                elem.click()
                self.faults = 0
                return True
        except Exception as e:
            self.logIt(f'{person} not reachable ({e})')
            if self.pressEscape():
                self.logIt('Pressed escape!')
            else:
                self.faults += 1
        return False

    def downloadMedia(self, link):
        # Downloads the media
        self.logIt(f'Downloading media with person {self.person}')
        file = findall(r'/(\d+_\d+_\d+_\w\.\w{3,4})\?', link)[0]
        if not Path(f'{self.mediapath}/{self.person}/{file}').is_file():
            call([
                'curl', '-s', link, '-o',
                f'{self.mediapath}/{self.person}/{file}'
            ])
            return False
        else:
            self.iter = self.iter_L
            return True

    def getMedia(self):
        # Finds the media inside the conversation
        self.logIt(f'Trying to get media')
        photo = self.driver.find_element_by_class_name('_4-od')
        image = findall('src="(.*?)"', photo.get_attribute('innerHTML'))[0]
        return self.downloadMedia(image.replace('amp;', ''))

    def existsMedia(self, media):
        # Checks if the media exists
        medianame = findall(r'/(\d+_\d+_\d+_\w\.\w{3,4})\?', media)[0]
        if not Path(self.logpath).is_file():
            with open(self.logpath, 'w') as f:
                f.write('---\n')
        with open(self.logpath, 'r+') as f:
            if medianame not in f.read():
                f.write(f'{medianame}\n')
                self.logIt('Media does not exist, fetching it')
                return False
            else:
                self.logIt('Media exists')
                return True

    def manageThread(self, person, override):
        # Wrapper
        length = 0
        now = 0
        while self.getPerson(person, override):
            try:
                length = self.getToThread(now)
                now += 1
                if now == length or now > self.maxBack:
                    return
            except:
                return
        return

    def manageThreads(self, override):
        # Main wrapper function
        for person in self.people:
            self.logIt(f'Now on person {person}')
            self.manageThread(person, override)
        self.logIt(f'Sleeping for {self.iter} secs')
        sleep(self.iter)
        return self.faults < self.faultThreshold

    def pressEscape(self):
        # Presses escape
        elements = ['_4-od', '_3m31', '_1enh']
        for element in elements:
            try:
                elem = self.driver.find_element_by_class_name(element)
                elem.send_keys(Keys.ESCAPE)
                return True
            except:
                pass
        return False

    def makeFolder(self, path):
        # Makes folder if it doesn't exist
        if not exists(path):
            makedirs(path)
            self.logIt(f'Making a new folder in {path}')

    def logIt(self, message):
        # Logs events and prints on screen debugging information
        self.signal.set()
        with open(self.logFile, 'a') as log:
            log.write(
                f'[{self.name}]: {message}, t+{round(time()-self.start)}s\n')
        if self.debug:
            with self.lock:
                print(
                    f'[{self.name}]: {message}, t+{round(time()-self.start)}s')
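A minimal usage sketch (hypothetical, since the class name and constructor are not part of this excerpt): the main loop simply keeps calling manageThreads until the fault threshold is tripped.

# Sketch, assuming `bot` is an already-constructed instance of the class above.
# manageThreads() sleeps for bot.iter seconds per pass and returns False once
# bot.faults reaches bot.faultThreashold, which ends the loop.
while bot.manageThreads(override=False):
    pass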
示例#13
0
class InstagramComments(object):
    """
    Instagram scraper
    """
    def __init__(self):
        """
        Initializes the instance of the webdriver.
        Keeps all new opening links in one window.
        """
        self.firefox_options = Options()
        self.browser = Firefox(options=self.firefox_options)

    def login(self):
        """
        Login functionality
        Requires the login information to be stored in the additional file: login_data
        """

        self.browser.get(
            'https://www.instagram.com/accounts/login/?source=auth_switcher')
        print('Opening the page')
        time.sleep(8)
        self.usernameInput = self.browser.find_elements_by_css_selector(
            'form input')[0]
        self.usernameInput.send_keys(login_data.USERNAME_INSTAGRAM)
        print('Input username')
        time.sleep(3)
        self.passwordInput = self.browser.find_elements_by_css_selector(
            'form input')[1]
        self.passwordInput.send_keys(login_data.PASSWORD_INSTAGRAM)
        print('Input password')

        try:
            self.button_login = self.browser.find_element_by_class_name(
                'button')
        except NoSuchElementException:
            self.button_login = self.browser.find_element_by_xpath(
                '/html/body/div[1]/section/main/div/article/div/div[1]/div/form/div[4]/button/div'
            )

        time.sleep(4)

        self.button_login.click()
        print('Logged in')
        time.sleep(5)

        # Save the session cookies for later reuse by the crawler methods
        with open('cookies.pkl', 'wb') as cookie_file:
            pickle.dump(self.browser.get_cookies(), cookie_file)

        try:
            self.notnow = self.browser.find_element_by_css_selector(
                'body > div.RnEpo.Yx5HN > div > div > div.mt3GC > button.aOOlW.HoLwm'
            )
            self.notnow.click()
        except NoSuchElementException:
            pass

    def get_post_links(self, username, post_count):
        """
        Crawler to get a list of links to the posts starting from the chronologically most recent
        :param username: str, the username of the account
        :param post_count: int, number of links to save
        :return: a list of links to the posts of the specific user
        """

        self.post_links = []
        self.browser.get(
            'https://www.instagram.com/accounts/login/?source=auth_switcher')
        self.cookies = pickle.load(open('cookies.pkl', 'rb'))
        for cookie in self.cookies:
            self.browser.add_cookie(cookie)
        self.username = username
        self.url = 'https://www.instagram.com/' + username + '/'
        self.browser.get(self.url)
        time.sleep(10)
        self.post = 'https://www.instagram.com/p/'
        while len(self.post_links) < post_count:
            self.links = [
                a.get_attribute('href')
                for a in self.browser.find_elements_by_tag_name('a')
            ]
            for link in self.links:
                # Guard against anchors without an href (get_attribute -> None)
                if link and self.post in link and link not in self.post_links:
                    self.post_links.append(link)
            self.scroll_down = 'window.scrollTo(0, document.body.scrollHeight);'
            self.browser.execute_script(self.scroll_down)
            time.sleep(10)
        print(self.post_links)
        print(len(self.post_links))
        return self.post_links[:post_count]

    def get_post_details(self, url):
        """
        Saves the information about the instagram post: number of likes,
        type of the post (photo or video), caption, timestamp with timezone
        of when it was posted, username of the author
        :param url: str, link to the post
        :return: passes the collected fields to the database insert
        """

        self.browser.get(url)
        try:
            self.likes = self.browser.find_element_by_xpath(
                """//*[@id="react-root"]/section/main/div/div/
                    article/div[2]/section[2]/div/div/button/span""").text
            self.post_type = 'photo'
        except NoSuchElementException:
            self.likes = self.browser.find_element_by_xpath(
                """//*[@id="react-root"]/section/main/div/div/
                    article/div[2]/section[2]/div/span""").text.split()[0]
            self.post_type = 'video'
        self.time_posted = self.browser.find_element_by_xpath(
            '//a/time').get_attribute("datetime")
        try:
            self.caption = self.browser.find_element_by_xpath(
                """/html/body/div[1]/section/main/div/div/article/div[2]/div[1]/ul/div/li/div/div/div[2]/span"""
            ).text
        except NoSuchElementException as e:
            self.caption = ""
        try:
            return DatabaseFunctionality.execute_insert_post_details(
                url, self.post_type, self.likes, self.time_posted,
                self.caption)
        except psycopg2.DatabaseError:
            pass
        time.sleep(8)

    def get_comments(self, url):
        """
        Saves the comments of the post: username of the comment's author,
        the comment itself, and a timestamp with timezone
        :param url: link to the post
        :return:
        """

        self.browser.get(
            'https://www.instagram.com/accounts/login/?source=auth_switcher')
        self.cookies = pickle.load(open('cookies.pkl', 'rb'))
        for cookie in self.cookies:
            self.browser.add_cookie(cookie)
        self.browser.get(url)
        time.sleep(5)

        try:
            # Compound class names are not valid with find_element_by_class_name,
            # so locate the "load more comments" button with a CSS selector
            self.load_more_comments = self.browser.find_element_by_css_selector(
                '.glyphsSpriteCircle_add__outline__24__grey_9.u-__7')
            self.action = ActionChains(self.browser)
            self.action.move_to_element(self.load_more_comments)
            self.load_more_comments.click()
            time.sleep(4)
            self.action.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
        except Exception:
            pass

        time.sleep(5)
        comment = self.browser.find_elements_by_class_name('gElp9')
        for c in comment:
            container = c.find_element_by_class_name('C4VMK')
            name = container.find_element_by_class_name('_6lAjh').text
            content = container.find_element_by_tag_name('span').text
            content = content.replace('\n', ' ').strip().rstrip()
            time_of_post = self.browser.find_element_by_xpath(
                '//a/time').get_attribute("datetime")
            comment_details = {
                'url_post': url,
                'profile name': name,
                'comment': content,
                'time': time_of_post
            }
            print(comment_details)
            try:
                # Insert every comment instead of returning after the first one
                DatabaseFunctionality.execute_insert_comment_details(
                    url, name, content, time_of_post)
            except psycopg2.DatabaseError:
                pass
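A short end-to-end sketch of how InstagramComments is presumably driven; the username and post count below are placeholders.

# Hypothetical usage of the class above.
scraper = InstagramComments()
scraper.login()                                   # also writes cookies.pkl
for link in scraper.get_post_links('instagram', post_count=5):
    scraper.get_post_details(link)                # store post metadata
    scraper.get_comments(link)                    # store the comments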
示例#14
0
class crawlerAliexpress():
    #global numberItem
    def __init__(self, searchName, numberPage=1):
        # Initialize the browser.
        opts = Options()
        #opts.set_headless()
        self.browser = Firefox(options=opts)
        self.searchName = searchName
        self.numberPage = numberPage
        self.listDetailsProducts = []

        currentUrl = f'https://aliexpress.ru/wholesale?SearchText={self.searchName}&page={self.numberPage}'
        self.browser.get(currentUrl)
        cookie = {
            'name': 'aep_usuc_f',
            'value': 'isfm=y&site=rus&c_tp=RUB&isb=y&region=RU&b_locale=ru_RU',
            'domain': '.aliexpress.ru'
        }
        self.browser.add_cookie(cookie)
        self.browser.get(currentUrl)
        sleep(1)

    def scroll_down_page(self, speed=8):
        current_scroll_position, new_height = 0, 1
        while current_scroll_position <= new_height:
            current_scroll_position += speed
            self.browser.execute_script(
                "window.scrollTo(0, {});".format(current_scroll_position))
            new_height = self.browser.execute_script(
                "return document.body.scrollHeight")

    def getProductsDetail(self, countPage):
        #scrollPauseTime = 2
        self.countPage = countPage
        self.scroll_down_page()
        title = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='item-title-wrap']//a[@class='item-title']"
        )
        price = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//div[@class='item-price-row']"
        )
        rating = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//span[@class='rating-value']"
        )
        numberOfSales = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//a[@class='sale-value-link']"
        )
        seller = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//a[@class='store-name']"
        )
        itemsTitle = len(title)
        itemsPrice = len(price)
        itemsRating = len(rating)
        itemsNumberOfSales = len(numberOfSales)
        itemsSeller = len(seller)

        for i in range(
                min(itemsTitle, itemsPrice, itemsRating, itemsNumberOfSales,
                    itemsSeller)):
            itemProduct = [
                title[i].text, price[i].text, rating[i].text,
                numberOfSales[i].text, seller[i].text
            ]
            self.listDetailsProducts.append(itemProduct)

        self.paginator(self.countPage)

    def saveInCsv(self, nameFile):
        self.nameFile = nameFile
        with open(self.nameFile, "w", newline='') as out_file:
            writer = csv.writer(out_file)
            writer.writerows(self.listDetailsProducts)

    def check_exists_by_xpath(self, xpath):
        try:
            self.browser.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True

    def paginator(self, countPage):
        sleep(2)
        isPaginator = self.check_exists_by_xpath(
            "//button[@class='next-btn next-medium next-btn-normal next-pagination-item next-next' and not(@disabled)]"
        )
        self.numberPage += 1
        currentUrl = f'https://aliexpress.ru/wholesale?SearchText={self.searchName}&page={self.numberPage}'
        if isPaginator and (countPage > 1):
            self.browser.get(currentUrl)
            self.getProductsDetail(countPage - 1)
        else:
            print('\nAll pages processed')
            self.browser.close()
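A minimal usage sketch; the search term, page count and file name are placeholders.

# Hypothetical usage of the class above: getProductsDetail() walks through the
# result pages via paginator() and closes the browser when done.
crawler = crawlerAliexpress('smartphone')
crawler.getProductsDetail(countPage=3)
crawler.saveInCsv('products.csv')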
示例#15
0
def set_login_data(driver: webdriver.Firefox, login_data: LoginData):
    """ Goes to 404 page of each site in order to set authentication data """
    driver.get(login_data.link)
    for cookie in login_data.cookies:
        driver.add_cookie(cookie)
    set_localstorage(driver, login_data.localstorage)
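set_localstorage is referenced above but not defined in this snippet; a plausible minimal sketch, assuming login_data.localstorage is a dict of key/value strings:

def set_localstorage(driver, localstorage):
    # Hypothetical helper: write each saved key/value pair back into the
    # page's localStorage, mirroring how add_cookie restores the cookies.
    for key, value in localstorage.items():
        driver.execute_script(
            "window.localStorage.setItem(arguments[0], arguments[1]);",
            key, value)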
# driver.find_element_by_xpath("//div[@id='u1']//a[text()='登录']").click()
# sleep(1)
# driver.find_element_by_xpath("//p[text()='用户名登录']").click()
# sleep(2)
# # 输入用户名
# driver.find_element_by_xpath(
#     "//input[@name='userName']").send_keys('13541781424')
# # 输入密码
# driver.find_element_by_xpath("//input[@name='password']").send_keys('19931025')
# # 点击登录
# driver.find_element_by_xpath("//input[@id='TANGRAM__PSP_10__submit']").click()
# cookie=driver.get_cookies()
# print(cookie)
cook = {
    'domain': '.baidu.com',
    'name': 'BDUSS',
    'path': '/',
    'value': 'WhSMUpIOXhkOXBrZTF4TmRKMG1YYUkyb0JscGdNRE9kZH40WmpBaUZES35YVGhkSVFBQUFBJCQAAAAAAAAAAAEAAAD4pMmGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL~QEF2~0BBdQ'
}

# Open the page (this excerpt assumes `driver` is a Firefox webdriver created earlier)
driver.get('https://www.baidu.com')
# Add the cookie
driver.add_cookie(cook)
sleep(3)
# Navigate to the logged-in page
driver.get('http://i.baidu.com')