예제 #1
0
class GoogleImages():
    def __init__( self ):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.base_url = 'https://www.google.com/search?q=%s&tbm=isch'
        self.path_to_chromedriver = './chromedriver'
        self.browser = webdriver.Chrome(executable_path = self.path_to_chromedriver)
        self.browser = webdriver.Chrome()

    def crawl(self, qry ):
        url = self.base_url % ( '+'.join(qry) )
        self.browser.get(url)
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        pages = self.browser.page_source
        soup = BeautifulSoup(pages,'lxml')
        x = soup.findAll( 'div', 'rg_di rg_el ivg-i' )
        print len(x)
        imgs = [ y.findAll('a')[0]['href'] for y in x ]
        imgurls = [ ( x.split('imgurl=')[1].split('&')[0],\
                    x.split('imgurl=')[1].split('&')[1].replace('imgrefurl=','') )\
                    for x in imgs ]
        return imgurls

    def stop():
        browser.quit()
        display.stop()
예제 #2
0
    def run(self):
        """Run the SelScraper: start a virtual display, acquire a
        webdriver, position its window, and execute the search."""

        # Headless Xvfb display so the browser can run on a server.
        display = Display(visible=0, size=(800, 600))
        display.start()
        #self._set_xvfb_display()

        # raise_or_log() may raise SeleniumMisconfigurationError or just
        # log, depending on configuration elsewhere.
        if not self._get_webdriver():
            raise_or_log('{}: Aborting due to no available selenium webdriver.'.format(self.name),
                         exception_obj=SeleniumMisconfigurationError)

        try:
            # Tile browser windows in a 4-wide grid by browser number.
            self.webdriver.set_window_size(400, 400)
            self.webdriver.set_window_position(400 * (self.browser_num % 4), 400 * (math.floor(self.browser_num // 4)))
        except WebDriverException as e:
            out('Cannot set window size: {}'.format(e), lvl=4)

        super().before_search()

        if self.startable:
            self.build_search()
            self.search()

        # NOTE(review): close() only closes the current window; quit()
        # would end the whole session — confirm which is intended here.
        if self.webdriver:
            self.webdriver.close()
예제 #3
0
파일: utils.py 프로젝트: Cruel/Anondex
def webthumb(url, filename, is_flash=False):
    script = """
        var s = document.createElement('script');
        s.src = 'http://cruels.net/sb/flashfix.js';
        document.body.appendChild(s);
    """
    print "webthumb(%s, %s)" % (url, filename)
    display = Display(visible=0, size=(1200, 900))
    display.start()
    browser = webdriver.Firefox()
    browser.get(url)
    if is_flash:
        time.sleep(1)
    else:
        browser.execute_script(script)
        time.sleep(6)
    tmpfile = "%s.tmp" % filename
    browser.get_screenshot_as_file(tmpfile)
    img = pil.open(tmpfile)
    width, height = img.size
    if is_flash:
        resized = img.resize((LIBRARYFILE_THUMB_WIDTH, LIBRARYFILE_THUMB_HEIGHT), pil.ANTIALIAS)
    else:
        ratio = float(width) / float(height)
        resized = img.resize((LIBRARYFILE_THUMB_WIDTH, int(LIBRARYFILE_THUMB_WIDTH / ratio)), pil.ANTIALIAS)
    resized.save(filename)
    os.remove(tmpfile)
    print "Saved %s." % filename
    browser.quit()
    display.stop()
    return True
def get_driver(browser, display):
    """Return a selenium webdriver with a randomized user agent.

    *browser* selects Chrome when it contains 'chrome'
    (case-insensitive), otherwise Firefox. When *display* == 0 an Xvfb
    virtual display is started first so the browser runs headless.
    """
    if display == 0:
        display = Display(visible=0, size=(800, 600))
        display.start()

    if browser and 'chrome' in browser.lower():
        options = webdriver.ChromeOptions()
        #prefs = {"download.default_directory" : folder}
        options.add_argument('--user-agent={}'.format(random.choice(USER_AGENTS)))
        # BUG FIX: the original passed the undefined name ``prefs`` to
        # options.add_experimental_option (its definition is commented
        # out above), raising NameError on every Chrome request. The
        # always-true ``dricve == 1`` flag and its unreachable PhantomJS
        # branch are removed as well.
        return webdriver.Chrome(chrome_options=options)
    else:
        profile = webdriver.FirefoxProfile()
        profile.set_preference('general.useragent.override', random.choice(USER_AGENTS))
        #profile.set_preference("browser.download.folderList",2);
        #profile.set_preference("browser.download.manager.showWhenStarting",false);
        #profile.set_preference("browser.download.dir",folder);
        profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
        return webdriver.Firefox(profile)
예제 #5
0
class UITestCase(LiveServerTestCase):
    """Live-server UI test base: uses a local browser when one is
    available, falling back to an Xvfb virtual display otherwise."""

    def use_xvfb(self):
        """Start a virtual display and retry creating the webdriver."""
        from pyvirtualdisplay import Display
        self.display = Display('xvfb',
                               visible=1,
                               size=(1280, 1024))
        self.display.start()
        self.driver = WebDriver()

    def setUp(self):
        # Try a direct webdriver first; if the environment has no UI
        # (WebDriverException), fall back to Xvfb.
        try:
            self.driver = WebDriver()
            ui_is_not_available = False
        except WebDriverException:
            ui_is_not_available = True

        if ui_is_not_available:
            self.use_xvfb()

        self.driver.implicitly_wait(10)
        super(UITestCase, self).setUp()

    def tearDown(self):
        self.driver.quit()
        # self.display only exists when the Xvfb fallback was taken.
        if hasattr(self, 'display'):
            self.display.stop()

        super(UITestCase, self).tearDown()
예제 #6
0
def main():
    '''business logic for when running this module as the primary one!'''
    display = Display(visible=0, size=(1024, 768))
    display.start()

    fresh_cl_post = find_cl_post()
    prev_cl_post = {"title":"","link":""}
    old_cl_post = {"title":"","link":""}
    
    # find_cl_post()
    while True:
        # print "TEST" + str(datetime.date.today())
        fresh_cl_post = find_cl_post()
        
        try:
            if fresh_cl_post['title'] != prev_cl_post['title']:
            
                old_cl_post = prev_cl_post
                prev_cl_post = fresh_cl_post
            
                send_cl_email(fresh_cl_post)

        except:
            print "Failed to test & send mail at: "+str(datetime.datetime.now())

        gc.collect()
        time.sleep(SLEEP_SECONDS)
        
    
    
    display.stop()
예제 #7
0
class Xvfb(object):
    """Context manager that runs code under an Xvfb virtual display."""

    def __init__(self, width=1366, height=768, visible=0):
        self.__virtual_display = None  # lazily created Display
        self.width = width
        self.height = height
        self.visible = visible

    def __init_display(self):
        # Create and start the display on first use only.
        if self.__virtual_display is None:
            self.__virtual_display = Display(visible=self.visible, size=(self.width, self.height))
            self.__virtual_display.start()

    def __enter__(self):
        self.__init_display()
        # Returning self enables ``with Xvfb() as xvfb:``; the original
        # returned None, so existing ``with runner:`` uses still work.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._close_display()

    def _close_display(self):
        if self.__virtual_display:
            try:
                # BUG FIX: pyvirtualdisplay's Display exposes stop(),
                # not close(); the old close() call raised
                # AttributeError which the bare except swallowed, so the
                # Xvfb process was never actually shut down.
                self.__virtual_display.stop()
            except Exception:
                pass
        self.__virtual_display = None

    @staticmethod
    def run(func, *args, **kwargs):
        """Call ``func(*args, **kwargs)`` inside a fresh virtual display."""
        runner = Xvfb()
        with runner:
            return func(*args, **kwargs)
예제 #8
0
class BCCVLTestCase(unittest.TestCase):
    """Selenium test base for the BCCVL site.

    Credentials, URL, implicit wait and headless mode are all taken
    from BCCVL_TEST_* environment variables with dev-env defaults.
    """

    def setUp(self):
        # acquire URL, username and password from environment variables, or use default values for dev env.
        self.username = os.getenv("BCCVL_TEST_USERNAME", "admin")
        self.password = os.getenv("BCCVL_TEST_PASSWORD", "admin")
        self.url = os.getenv("BCCVL_TEST_URL", "https://192.168.100.200/")

        # The amount of time selenium will potentially wait in searching for elements. This is blocking.
        implicit_wait = int(os.getenv("BCCVL_TEST_IMPLICIT_WAIT", "15"))

        # Run tests in a virtual display (xvfb)
        virtual_display = os.getenv("BCCVL_TEST_VIRTUAL_DISPLAY", "false") == "true"

        # Setup the virtual display
        if virtual_display:
            self.display = Display(visible=0, size=(1920, 1080))
            self.display.start()
        else:
            self.display = None

        # Setup the Firefox Profile and webdriver
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(implicit_wait)

        # Maximize the window
        # self.driver.maximize_window()
        self.driver.set_window_size(1200, 800)

        # Go to the bccvl homepage
        self.driver.get(self.url)

    def tearDown(self):
        # NOTE(review): the display is stopped before the browser quits;
        # quitting the driver first is usually safer under Xvfb — verify.
        if self.display:
            self.display.stop()
        self.driver.quit()
예제 #9
0
def getupc(data, sleeptime):
    """Look up UPC codes on Google for each item in *data*.

    *data* is a list of dicts with a 'name' key; each dict gains a
    'upc' key when a upcitemdb.com result is found. Sleeps *sleeptime*
    seconds between page interactions. Returns the mutated list.
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    a = webdriver.Firefox()
    # /ncr avoids Google redirecting to a country-specific domain.
    a.get('https://www.google.com/ncr')
    time.sleep(sleeptime)
    search = WebDriverWait(a, 5).until(EC.element_to_be_clickable((By.XPATH, "//input[@type='text']")))
    for i in data:
        ActionChains(a).move_to_element(search).click(search).send_keys(i['name'] + ' upc', Keys.ENTER).perform()
        time.sleep(sleeptime)
        contents = WebDriverWait(a, 5).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='g']")))
        try:
            # First result linking to upcitemdb.com; the UPC is the last
            # path segment of that link.
            upc = next(
                    (re.split(r'/', href.find_element_by_tag_name('a').get_attribute('href'))[-1] for
                     href in contents if
                     href.find_element_by_tag_name('a').get_attribute('href').startswith(
                             'http://www.upcitemdb.com/upc')))
            i['upc'] = upc
        except StopIteration:
            # No upcitemdb result for this item; leave it without 'upc'.
            pass

        # Re-locate the search box (the page changed) and clear it.
        search = WebDriverWait(a, 5).until(EC.element_to_be_clickable((By.XPATH, "//input[@type='text']")))
        search.clear()
    a.close()
    display.stop()
    return data
예제 #10
0
def get_screenshot(site_id, update_id):
    """
    Create a screenshot and save it to the database
    """
    # Get the objects we're working with
    site = Site.objects.get(id=site_id)
    update = Update.objects.get(id=update_id)
    
    # Fire up a headless display to work in
    display = Display(visible=0, size=(1680, 1050))
    display.start()
    
    # Fire up a Selenium browsers
    browser = webdriver.Firefox()
    
    # Set a timeout for the pageload
    seconds = 15
    # Register a raw selenium command for the page-load timeout and
    # invoke it directly on the remote session.
    browser.command_executor._commands['setPageLoadTimeout'] = (
        'POST', '/session/$sessionId/timeouts'
    )
    browser.execute("setPageLoadTimeout", {
        'ms': 1000*seconds,
        'type':'page load'
    })
    
    # Snap a screenshot of the target site
    logger.debug("Opening %s" % site.url)
    timestamp = timezone.now()
    try:
        # Cache-bust with a random query-string parameter.
        browser.get(site.url + "?x=" + get_random_string())
        logger.debug("Response received for %s" % site.url)
    # NOTE(review): Python 2 except syntax; browser/display teardown is
    # not visible here — the function may continue beyond this excerpt.
    except TimeoutException, e:
        logger.error("Request for %s timed out" % site.url)
        pass
예제 #11
0
 def load(self):
     min_time = 3600 # 1 hour in seconds
     max_time = 7179 # 2 hours in seconds (less 21)
     tasktime = randint(min_time, max_time)
     threading.Timer(tasktime, self.load).start()
     tasktime_m , tasktime_s = divmod( tasktime , 60)
     tasktime_h , tasktime_m = divmod( tasktime_m , 60) 
     output_content = "Load execution - waiting %dh %02dmin %02dsec for the next time." % (tasktime_h, tasktime_m, tasktime_s)
     print "[KeepUp]" , output_content
     
     from selenium import webdriver
     from selenium.webdriver.common.by import By
     from selenium.webdriver.support.ui import WebDriverWait
     from selenium.webdriver.support import expected_conditions as ec
     from selenium.webdriver.common.keys import Keys
     from pyvirtualdisplay import Display
     
     # Initial
     display = Display(visible=0, size=(1600, 900))
     display.start()
     profile = webdriver.FirefoxProfile()
     profile.set_preference("browser.cache.disk.enable", False)
     profile.set_preference("browser.cache.memory.enable", False)
     profile.set_preference("browser.cache.offline.enable", False)
     profile.set_preference("network.http.use-cache", False)
     driver = webdriver.Firefox()
     driver.get("https://c9.io/dashboard.html")
     driver.save_screenshot(self.directory_img + 'login.png')
     
     #Username
     username = driver.find_element_by_id("id-username")
     username.click()
     username.clear()
     username.send_keys(self.user, Keys.ARROW_DOWN)
     
     #Password
     password = driver.find_element_by_id("id-password")
     password.click()
     password.clear()
     password.send_keys(self.password, Keys.ARROW_DOWN)
     
     #Submit
     submit_button = driver.find_element_by_css_selector("button[type=submit]")
     # print submit_button.text
     
     # Click submition
     submit_button.click();
     time.sleep(5)
     driver.save_screenshot(self.directory_img + 'user_profile.png')
     
     # Target dir
     driver.get(self.target_workspace)
     time.sleep(10)
     
     self.log({'log_html': driver.page_source, 'log_file': output_content}) #make log
     driver.save_screenshot(self.directory_img + 'final_workspace.png')
     
     # End
     driver.quit()
     display.stop()
예제 #12
0
def main(param):
    """Screenshot a list of target URLs into the configured directory.

    *param* is (paths, targets): paths is a config object whose
    'output.shotsdir' option gives the output dir; targets is a list of
    (url, name) pairs, each saved as <shotsdir>/<name>.png.
    """

    # Validate the (paths, targets) argument shape before doing work.
    if len(param) != 2:
        sys.exit(-9)
    if len(param[1]) <= 0:
        sys.exit(-8)
    paths = param[0]
    # Strip surrounding double quotes the config value may carry.
    shotsdir = paths.get('path', 'output.shotsdir').lstrip('"').rstrip('"')
    targets = param[1]

    display = Display(visible=0, size=(800, 600))
    display.start()

    # Use the Firefox build installed at /opt/firefox.
    binary = FirefoxBinary('/opt/firefox/firefox')
    browser = webdriver.Firefox(firefox_binary=binary)

    tgt_len = len(targets)
    for i, tgt in enumerate(targets):
        browser.get(tgt[0])
        browser.save_screenshot(shotsdir+'/'+tgt[1]+'.png')
        print '( %3d / %3d ) Took %s.png' % (i+1, tgt_len, tgt[1])

    browser.quit()

    display.stop()
예제 #13
0
파일: rzhd_3.py 프로젝트: dmitrybezb/rzd
def rzhd():
    directions=[create_url(),]

    while raw_input('Want to add more directions? y/n ')=='y':
        directions.append(create_url())
        print "------------------"
    # n=raw_input('Check tickets every ...(seconds)? ')
    n = 60

    place=choose_place()
    i = 0
    display = Display(visible=0, size=(5, 5))
    display.start() # Запускаем вирутальный дисплей
    while len(directions)!=0:
        i+=1
        print
        print "----------------->Searching for PLATSKART<-----------------"

        print "try #",i
        print time.asctime()
        print

        for url in directions:
            if find_train(url, place)==True:
                send_email('*****@*****.**', url)
                if raw_input('Did you buy ticket? y/n ')=='y':
                    directions.remove(url)
                    if len(directions) == 0:
                        print "Successfully bought all tickets!"
                        return True                
            print str(n)+" seconds until next try..."
            time.sleep(float(n)) # Дадим браузеру корректно завершиться
    display.stop() # Закрываем виртуальный дисплей
예제 #14
0
def loadSite(url):
    """Fetch *url* through an HTTP proxy in headless Firefox and print
    the text of every cell in its "network-info" table. Returns 1."""
    profile = webdriver.FirefoxProfile()
    # Manual proxy configuration (type 1) through a fixed HTTP proxy.
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.http", "74.84.131.34")
    profile.set_preference("network.proxy.http_port", int('80'))
    profile.update_preferences()
    #
    display = Display(visible=0, size=(800, 600))
    display.start()
    # NOTE(review): despite the variable name, this path points at a
    # geckodriver binary (Firefox), not chromedriver.
    path_to_chromedriver = '/home/alexandr/www/html/python/prs/files/geckodriver'
    browser = webdriver.Firefox(firefox_profile = profile, executable_path = path_to_chromedriver)
    #
    browser.delete_all_cookies()
    browser.get(url)
    #print(browser.page_source)
    #print(browser.page_source)
    # Parse the rendered page before tearing the browser down.
    tree = etree.HTML( browser.page_source)
    #
    browser.close()
    display.stop()
    #
    nodes = tree.xpath('//table[@class="network-info"]//tr/td')
    for node in nodes:
        print(node.text)
    return 1
    def process_install_form (self):
        if (self.args.xvfb):
	    print "Omeka is being installed in: " + self.folder_name
            display = Display(visible=0, size=(800, 600))
            display.start()
        driver = webdriver.Firefox()
        driver.get("http://localhost/omeka/" + self.folder_name + "/install")
        inputElement = driver.find_element_by_name("username")
        inputElement.send_keys(self.omeka_user)
        inputElement = driver.find_element_by_name("password")
        inputElement.send_keys(self.omeka_passwd)
        inputElement = driver.find_element_by_name("password_confirm")
        inputElement.send_keys(self.omeka_passwd)
        inputElement = driver.find_element_by_name("super_email")
        inputElement.send_keys("*****@*****.**")
        inputElement = driver.find_element_by_name("administrator_email")
        inputElement.send_keys("*****@*****.**")
        inputElement = driver.find_element_by_name("site_title")
        inputElement.send_keys(self.omeka_title)
        inputElement.submit()
        try:
            WebDriverWait(driver, 10).until(
                lambda driver : driver.find_element_by_partial_link_text("Tableau"))
        finally:
            driver.quit()
예제 #16
0
class TestCase(unittest.TestCase):
    """Flask app test case with an in-memory DB and a headless Firefox."""

    def setUp(self):
        # Fresh in-memory database per test.
        app.config['TESTING'] = True
        app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
        self.app = app.test_client()
        db.create_all()

        # Headless browser for the selenium-backed tests.
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Firefox()

    def tearDown(self):
        db.session.remove()
        db.drop_all()

        self.driver.quit()
        self.display.stop()

    def test_extract_funds(self):
        """extract_funds() should scrape well over 100 funds."""
        funds = extract_funds(
            # some javascript going on that I can't figure out how to mock
            #'file:///%s/t/test_files/list_mutual_funds.html' % basedir,

            self.driver
        )

        self.assertTrue(len(funds) > 110)
예제 #17
0
def main(args):
    """Smoke-test the PATRIC web UI: log in, screenshot the home page,
    and load the user's workspace twice (it is often empty on first
    load). Returns 0 on completion."""
    parser = argparse.ArgumentParser(description="Program for running tests on the PATRIC web interface.")
    parser.add_argument("user", metavar="user", help="Patric login username.")
    parser.add_argument("passwd", metavar="passwd", help="Patric login password.")
    parser.add_argument("--firebug", action="store_true", help="Open Firebug during test.")
    # NOTE(review): the ``args`` parameter is shadowed here — parse_args()
    # always reads sys.argv, ignoring whatever the caller passed in.
    args = parser.parse_args()

    fp = webdriver.FirefoxProfile()
    if args.firebug:
        fp.add_extension(extension='extras/firebug-2.0.9.xpi')
        fp.set_preference("extensions.firebug.currentVersion", "2.0.9") #Avoid startup screen
        fp.set_preference("extensions.firebug.console.enableSites", "true")
        fp.set_preference("extensions.firebug.net.enableSites", "true")
        fp.set_preference("extensions.firebug.script.enableSites", "true")
        fp.set_preference("extensions.firebug.allPagesActivation", "on")

    # Create virtual display
    display = Display(visible=0, size=(1400, 950))
    display.start()

    # Create webdriver and retrieve url
    driver = webdriver.Firefox(firefox_profile=fp)
    driver.get(SITE_URL + '/login')

    # Wait for username input box to appear
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(EC.presence_of_element_located((By.ID, "dijit_form_TextBox_0")))

    # Set username and password, click login button
    userElement = driver.find_element_by_id("dijit_form_TextBox_0")
    pwdElement = driver.find_element_by_id("dijit_form_TextBox_1")
    userElement.send_keys(args.user)
    pwdElement.send_keys(args.passwd)
    loginElement = driver.find_element_by_id("dijit_form_Button_1")
    loginElement.click()
    time.sleep(3)

    # Retrieve home page, wait for an expected page element to load, take a screenshot
    driver.get(SITE_URL + '/portal/portal/patric/Home')
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(EC.presence_of_element_located((By.ID, "cart")))
    driver.set_window_size(1400, 950)
    driver.execute_script("window.scrollTo(0,0);")
    driver.get_screenshot_as_file("homepage_after_login.jpg")
    print "Saved screenshot to: homepage_after_login.jpg\n"

    # Retrieve ws url, wait for create folder button to appear
    ws_url = SITE_URL + '/workspace/' + args.user + '@patricbrc.org/home'
    driver.get(ws_url)
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(EC.presence_of_element_located((By.CLASS_NAME, "ActionButtonContainer")))
    time.sleep(5)

    # Have to reload page, because often time the workspace is empty on first load
    driver.get(ws_url)
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(EC.presence_of_element_located((By.CLASS_NAME, "ActionButtonContainer")))
#    createFolderButton = driver.find_element_by_class_name("ActionButton fa icon-folder-plus fa-2x")
#    createFolderButton.click()
    time.sleep(30)

    driver.quit()
    display.stop()
    return 0
예제 #18
0
파일: f.py 프로젝트: kamekame/alpha
def get_news():
    """Scrape headline texts from deutschlandfunk.de.

    Returns a list of <h3> headline strings when online; prints an
    error and returns None when there is no network connection.
    """
    if check_wlan():
        from pyvirtualdisplay import Display
        import re

        display = Display(visible=0, size=(800, 600))
        display.start()

        driver = webdriver.Firefox()
        url = "http://www.deutschlandfunk.de/"
        driver.get(url)
        # Grab the inner HTML of the news section container.
        source = driver.find_element_by_xpath('//*[@id="wrapper"]/div/section[2]/div[1]').get_attribute('innerHTML')

        n_articles = source.count('<article')
        print(str(n_articles) + " articles found.")

        # Headlines live in <h3> tags inside the section markup.
        lst = re.findall('<h3>(.+)</h3>', source)
        result = lst

        driver.close()

        display.stop()
        return result
    else:
        print("Error: Not connected to the internet")
예제 #19
0
class FunctionalTest(StaticLiveServerTestCase):
    """Functional test base: runs against a staging server when a
    ``liveserver=<host>`` argument is given, otherwise against Django's
    own live test server under headless Firefox."""

    @classmethod
    def setUpClass(cls):
        # If a liveserver argument was supplied, target that host and
        # skip starting the local live server entirely.
        for arg in sys.argv:
            if 'liveserver' in arg:
                cls.server_url = 'http://' + arg.split('=')[1]
                return
        super().setUpClass()
        cls.server_url = cls.live_server_url

    @classmethod
    def tearDownClass(cls):
        # Only tear down the local server if we actually started it.
        if cls.server_url == cls.live_server_url:
            super().tearDownClass()

    def setUp(self):
        self.display = Display(visible=0, size=(1024, 768))
        self.display.start()
        self.browser = webdriver.Firefox()
        # self.browser.implicitly_wait(3)

    def tearDown(self):
        self.browser.quit()
        self.display.stop()

    def check_for_row_in_list_table(self, row_text):
        """Assert that *row_text* appears in some row of #id_list_table."""
        table = self.browser.find_element_by_id('id_list_table')
        rows = table.find_elements_by_tag_name('tr')
        self.assertIn(row_text, [row.text for row in rows])
예제 #20
0
class AdminTestCase(LiveServerTestCase):
    """End-to-end payment flow test under headless Firefox."""

    def setUp(self):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()

        self.selenium = webdriver.Firefox()

        super(AdminTestCase, self).setUp()

    def tearDown(self):
        self.selenium.quit()
        self.display.stop()
        super(AdminTestCase, self).tearDown()

    def test_payment(self):
        """
        Submitting an amount on /pay and returning from the gateway
        should land on a page containing "successful".
        """

        self.selenium.get("%s/pay" % self.live_server_url)
        self.selenium.implicitly_wait(20)
        self.selenium.maximize_window()

        self.selenium.find_element_by_name("amount").send_keys("100000")

        pay_button = self.selenium \
            .find_element_by_xpath('//input[@value="pay"]')
        pay_button.click()

        # The gateway page exposes a "return to site" button (#btn3).
        return_to_site_button = self.selenium.find_element_by_id("btn3")

        return_to_site_button.click()

        self.assertIn("successful", self.selenium.page_source)
def work():
    """Take the weekly Tower screenshot using configured credentials.

    Starts an Xvfb display when DISPLAY is unset, reads USER credentials
    from the config file, and runs the screenshot job. Returns True on
    success, False on missing config or job failure.
    """
    logging.info("start weeklys screenshot work")
    print ("start ... ")

    display = None
    if not DISPLAY:
        print ("hide display ... ")
        display = Display(visible=0, size=(1366, 768))
        display.start()

    # BUG FIX: the early returns below used to skip display.stop(),
    # leaking the Xvfb process; try/finally guarantees cleanup.
    try:
        config = getConfigObj()
        if config is None:  # was ``== None``
            return False
        userName = config.get("USER", "UserName")
        userPWD = config.get("USER", "userPWD")

        ret = getTowerWeeklyScreenshot(userName, userPWD, DEFAULT_SAVE_PATH)

        if not ret:
            print ('Error, abort. Please check the log file "%s"' % LOG_FILE)
            return False

        logging.info("finish all work, exit.")
        return True
    finally:
        if display is not None:
            display.stop()
예제 #22
0
	def get_image(self):
		"""Scrape self.scrape_site for a fresh, suitable image.

		Sets self.img_id / self.description and saves the HD image on
		success; raises when no acceptable image is found.
		"""
		## Uses supplied scrape site to find new pictures
		url = self.scrape_site
		# virtual display for headless runs
		display = Display(visible=0, size=(800, 600))
		display.start()

		# closing() guarantees the Firefox instance is shut down.
		with closing(Firefox()) as browser:
			browser.get(url)
			time.sleep(5) # TODO: fix with something less static, but still
			# multipurpose considering scrape_site as a db var
			imgs = browser.find_elements_by_tag_name('img')
			# TODO: fix this temporary workaround that prevents ad server data
			# from reaching the image checks
			no_ad_imgs = [i for i in imgs if 'adsrvr' not in \
				i.get_attribute('src')]
			for img in no_ad_imgs:
				src = img.get_attribute('src')
				alt = img.get_attribute('alt')
				# Image id is the path segment after "/photo/" in the src.
				image_id = re.findall("/photo/(.+?)/", src)[0]
				if(self._check_id(image_id) and self._check_ratios(src)):
					self.img_id = image_id
					self.description = alt
					self._save_hd_image()
					break
		display.stop()
		if (self.img_id):
			return
		raise Exception('Failed to find a suitable image: all out or bugged')
예제 #23
0
def get_all_items():
    """Scrape every report form's metadata from the Federal Reserve
    report-forms site.

    Iterates the report-form dropdown, submits each option, and
    collects Description/OMB/Background/RespondentPanel/Frequency/
    PublicRelease plus the form number into one dict per form.
    Returns the list of dicts.
    """
    #list to store alll scraped data
    all_items = list()

    #Display - read about pyvirtualdisplay
    display = Display(visible=0, size=(1024, 768))
    display.start()
    #webdriver - read about selenium.webdriver
    driver = webdriver.Firefox()
    
    #this is a starting page we are scraping
    driver.get("http://www.federalreserve.gov/apps/reportforms/default.aspx")
    #Every element on the HTML page can be located using CSS selectors.
    #Opening the starting page in Chrome, right click on the drop-down menu, click "Inspect" we see a tag on the right highlighted, we copy it's id - MainContent_ddl_ReportForms
    #Knowing the id of dropdown menu, we can locate it with Selenium like this
    main_menu = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR,"#MainContent_ddl_ReportForms")))
    #Drop down menu is an HTML table of options which can be verified in Chrome browser (Developer Tools, that pop up when you right click and press "Inspect" on an element)
    #Following returns all of the options - rows in that table
    form_options = main_menu.find_elements_by_tag_name("option")
    #We count them
    option_count = len(form_options)
    #Next, we loop over all of them - essentially like we scrolling down the drop down menu and clicking on each every form 
    for form_i in xrange(1,option_count):
        #Get web element corresponding to a form
        form = form_options[form_i]
        #Click as a mouse click-action in browser 
        form.click()
        #Get text, because we need to store the form number
        form_id = form.text
        #Locate a web element corresponding to the submit button. By CSS selector which we found by inspection in Chrome browser (same logic as above)
        submit_button = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CSS_SELECTOR,"#MainContent_btn_GetForm")))
        #Click as a mouse click-action in browser 
        submit_button.click()      
        #Prepare data structures to store all the info we want to scrape
        a = dict.fromkeys(['Description','OMB','Background','RespondentPanel','Frequency','PublicRelease'])
        #We are on a web page after submit-click, following will search for all items of interest. Or for corresponding
        #web-elements 
        for el in a.keys():
            try:
                item = driver.find_element_by_css_selector("#MainContent_lbl_"+el+"_data") 
                #Once found it will store them in our dictionary, if not it will proceed to "except" section and do nothing
                a[el] = item.text 
            except: 
                #case when there is no such field
                pass
        #we need form number as well
        a['FormNumber'] = form_id
        #keeping them all in one list, which will have a dictionary per Form Number - and later, a row in your excel file per Form number
        all_items.append(a)
    
        #Ok, that part bothers me a little: it looks like I have to refresh "form_options" each time... 
        #Otherwise I get following exception: selenium.common.exceptions.StaleElementReferenceException: Message: Element not found in the cache - perhaps the page has changed since it was looked up
        driver.get("http://www.federalreserve.gov/apps/reportforms/default.aspx")
        main_menu = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR,"#MainContent_ddl_ReportForms")))
        form_options = main_menu.find_elements_by_tag_name("option")

    driver.close()
    display.stop()

    return all_items
예제 #24
0
def openurl(companyname=first_arg):
    """Google "<companyname> crunchbase", open the top hit, and archive
    its HTML to a file; failed lookups are appended to missedname.txt.
    Returns the page HTML, or 'none' on failure."""
    display = Display(visible=0, size=(1024, 768))
    display.start()
    browser = webdriver.Firefox()
    # Randomized sleeps throughout to look less like a bot.
    time.sleep(randint(8,10))
    try:
        browser.get('http://www.google.com')
        time.sleep(5)
        search = browser.find_element_by_name('q')
        input_text = companyname + str(" crunchbase")
        search.send_keys(input_text)
        time.sleep(randint(10,15))
        search.send_keys(Keys.RETURN)
        time.sleep(randint(10,15))
        # First result heading; derive the output filename from it.
        gn = browser.find_element_by_tag_name('h3').text
        gnc = str(gn).split(' | ')[0].replace(" ","")
        output_file = '0515' + gnc + '.html'
        browser.find_element_by_link_text(gn).click()
        time.sleep(randint(55,60))
        company_html = browser.page_source
        time.sleep(randint(5,10))
        with open("smallname.txt", 'a') as myfile:
            json.dump(output_file,myfile)
        with open(output_file, 'a+') as myfile:
            myfile.write(company_html)
    # NOTE(review): bare except — any failure (including typos above)
    # is silently recorded as a missed name.
    except:
        company_html = 'none'        
        with open("missedname.txt", "a") as myfile:
            json.dump(companyname,myfile)            
    time.sleep(1)
    browser.close()
    time.sleep(1)
    display.stop()
    return company_html
예제 #25
0
파일: screenshot.py 프로젝트: Aypak/ka-lite
def process_screenshots(app, env):
    """Sphinx hook: run the external screenshot command for every
    screenshot registered in *env*, optionally under an Xvfb display
    (when SPHINX_SS_USE_PVD=true)."""
    if not hasattr(env, 'screenshot_all_screenshots'):
        return

    if not app.config['screenshots_create']:
        print("Not doing screenshots on maggies farm no more")
        return

    # Don't bother building screenshots if we're just collecting messages.
    # Just checks if we invoked the build command with "gettext" in there somewhere
    # BUG FIX: this check used to run *after* the virtual display was
    # started, leaking the Xvfb process on every gettext build; bail out
    # before allocating any resources.
    if "gettext" in sys.argv:
        return

    if 'SPHINX_SS_USE_PVD' in os.environ.keys() and os.environ['SPHINX_SS_USE_PVD'] == "true":
        from pyvirtualdisplay import Display
        # Start a virtual headless display
        display = Display(visible=0, size=(1024, 768))
        display.start()
    else:
        display = None

    all_args = map(lambda x: x['from_str_arg'], env.screenshot_all_screenshots)
    # If building in a different language, start the server in a different language
    command = SCREENSHOT_COMMAND + SCREENSHOT_COMMAND_OPTS + \
              [re.sub(r"\s", r"", "--from-str={0}".format(json.dumps(all_args)))]
    language = env.config.language
    if language:
        command += ["--lang={0}".format(language)]
    subprocess = Popen(command)
    subprocess.wait()
    try:
        if subprocess.returncode:
            raise Exception("Screenshot process had nonzero return code: {0}".format(subprocess.returncode))
    finally:
        if display:
            display.stop()
class BrowserManager:
	"""Owns a single splinter Browser behind an Xvfb display, handed out
	one caller at a time via a cooperative lock."""
	def __init__(self):
		self._lock = False  # True while some caller holds the browser
	def bootup(self):
		"""Start the virtual display and create the Browser, applying
		HTTP_PROXY from the environment to both http and https."""
		self._display = Display(visible=0, size=(1024, 768))
		self._display.start()
		profile = {}
		if 'HTTP_PROXY' in os.environ:
			# Split "http://host:port" into host and port pieces.
			proxy_url = os.environ['HTTP_PROXY']
			proxy_server = proxy_url.split(':')[1][2:]
			proxy_port = proxy_url.split(':')[-1]
			profile['network.proxy.type'] = 1
			profile['network.proxy.http'] = proxy_server
			profile['network.proxy.http_port'] = proxy_port
			profile['network.proxy.https'] = proxy_server
			profile['network.proxy.https_port'] = proxy_port
		self.browser = Browser(profile_preferences=profile)
	def obtain(self,background):
		"""Block (via background.wait) until the browser is free, then
		claim it. NOTE(review): this flag is not thread-safe — two
		waiters can both see _lock False; confirm single-consumer use."""
		while self._lock:
			background.wait('Browser lock', 15)
		self._lock = True
		return self.browser
	def release(self,background):
		"""Give the browser back so the next obtain() caller proceeds."""
		self._lock = False
	def shutdown(self):
		"""Quit the browser and stop the virtual display."""
		self.browser.quit()
		self._display.stop()
예제 #27
0
파일: runner.py 프로젝트: benvand/profiler
class SeleniumRunner(object):
    """Context manager / decorator that supplies a Chrome driver on a
    virtual display for the duration of a single call."""

    def __call__(self, f):
        @functools.wraps(f)
        def decorated(_self, *args, **kwargs):
            # Bring the display/driver up just for this invocation and pass
            # the live driver as the first argument after self.
            with self as driver:
                return f(_self, driver, *args, **kwargs)
        return decorated

    def __enter__(self):
        # Virtual X display so Chrome can run on a headless host.
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome()
        return self.driver

    def __exit__(self, *args, **kwargs):
        # Best-effort teardown: a missing attribute (someone already tore
        # down the driver or display) is silently tolerated.
        for attr, method in (("driver", "quit"), ("display", "stop")):
            try:
                getattr(getattr(self, attr), method)()
            except AttributeError:
                pass
예제 #28
0
class Spider(scrapy.Spider):
    """Scrapes Taiwanese mayoral election results from www.cec.gov.tw.

    The index page builds its county links client-side, so it is rendered
    with a real Chrome via Selenium before scrapy follows the links.

    NOTE: uses Python 2 print-statement syntax (``print county``); this
    snippet requires a Python 2 runtime.
    """
    name = "mayors"
    allowed_domains = ["www.cec.gov.tw"]
    start_urls = ["https://www.cec.gov.tw/pc/zh_TW/IDX/indexC.html",]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        # One headless X display and one Chrome instance for the whole crawl.
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")

    def spider_closed(self, spider):
        # NOTE(review): pyvirtualdisplay's Display exposes stop(), not
        # close() -- confirm this call works; also the Chrome driver is
        # never quit here, which leaks the browser process.
        self.display.close()

    def parse(self, response):
        # Render the index with Selenium, then follow each county link.
        self.driver.get(response.url)
        nodes = scrapy.Selector(text=self.driver.page_source).xpath('//a[@target="_top"]')
        for node in nodes:
            county = node.xpath('text()').extract_first()
            print county
            yield response.follow(node, callback=self.parse_list, meta={'meta': county})

    def parse_list(self, response):
        # Each table row holds one candidate's result for the county
        # carried over in response.meta.
        for tr in response.css(u'table.tableT tr.trT'):
            d = {}
            d['type'] = 'mayors'
            d['county'] = response.meta['meta']
            d['constituency'] = 0
            d['elected'] = tr.xpath('td[1]/text()').extract_first().strip()
            d['number'] = int(tr.xpath('td[2]/text()').extract_first())
            d['votes'] = int(re.sub('\D', '', tr.xpath('td[5]/text()').extract_first()))
            d['votes_percentage'] = tr.xpath('td[6]/text()').extract_first()
            yield d
예제 #29
0
def virtual_display_if_enabled(enabled):
    """Return a started virtual display, or a no-op stand-in when disabled."""
    if not enabled:
        # Caller still receives an object with the Display-like interface.
        return NoopDisplay()
    virtual = Display(visible=0, size=(800, 600))
    virtual.start()
    return virtual
def run_selenium(landmark):
    """Fetch http://www.<landmark> in Firefox while capturing a tcpdump trace.

    Side effects: writes a pcap and a pickled window.performance.timing
    dict into EXP_DIR, and appends progress messages to TEST_LOG.

    FIX: the original body mixed tab and space indentation (a TabError on
    Python 3); indentation is now uniform.  The interface file and the
    pickle output file are closed via context managers instead of leaked.
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    logTo(TEST_LOG, 'Selenium : Starting Selenium  for ' + landmark, 'INFO', 'a')
    # First line of iface.txt names the capture interface.
    with open(HOME_DIR + '/Desktop/one-time-test-suite/iface.txt', 'r') as interFace:
        tmp = interFace.readlines()
    iface = tmp[0].split('\n')[0]
    tmpstmp = datetime.now().strftime("%s")
    profile = webdriver.FirefoxProfile()
    profile.update_preferences()
    browser = webdriver.Firefox(firefox_profile=profile)  # assign profile to browser
    browser.delete_all_cookies()
    logTo(TEST_LOG, ' Selenium : Starting tcpdump .. ', 'INFO', 'a')
    tcpcmd = ('tcpdump -i ' + iface + ' -w ' + EXP_DIR + '/' + 'tcpdump_'
              + landmark.split('.')[0] + '_' + tmpstmp)
    args = shlex.split(tcpcmd)
    ptcpdmp = sub.Popen((args))
    time.sleep(10)  # give tcpdump time to start capturing
    logTo(TEST_LOG, ' Selenium : Starting get ' + landmark, 'INFO', 'a')
    browser.get('http://www.' + landmark)
    time.sleep(5)  # let the page settle before sampling timings
    perfData = browser.execute_script('return window.performance.timing')
    fname = EXP_DIR + '/' + 'perfdata_' + landmark.split('/')[0]
    fname = fname.replace('.', '-')
    with open(fname, 'wb') as out:
        pickle.dump(perfData, out)
    logTo(TEST_LOG, 'Selenium : Writing done to ' + EXP_DIR + '/perfdata_' + landmark, 'INFO', 'a')
    browser.quit()
    display.stop()
    ptcpdmp.terminate()
    logTo(TEST_LOG, 'Finished Selenium for ' + landmark, 'INFO', 'a')
예제 #31
0
class Order:
    """Automates signing in to Amazon and placing an order for self.url."""

    def __init__(self, username, password, url):
        self.username = username
        self.password = password
        self.url = url
        # Headless virtual display + Chrome; the implicit wait covers
        # slow page loads throughout the flow.
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.browser = webdriver.Chrome()
        self.browser.implicitly_wait(15)

    def goToPage(self):
        """Open the product page and echo its title."""
        self.browser.get(self.url)
        print(self.browser.title)

    def login(self):
        """Sign in via the account nav link unless already authenticated."""
        account_link = self.browser.find_element_by_css_selector(
            "a#nav-link-yourAccount span.nav-line-1")
        print(account_link.text)
        if account_link.text != "Hello. Sign in":
            print("Already logged in.")
        else:
            account_link.click()
            email_field = self.browser.find_element_by_id("ap_email")
            password_field = self.browser.find_element_by_id("ap_password")
            email_field.clear()
            password_field.clear()
            email_field.send_keys(self.username)
            password_field.send_keys(self.password)
            self.browser.find_element_by_id("signInSubmit").click()

        # Re-read the nav link to show the (possibly changed) login state.
        account_link = self.browser.find_element_by_css_selector(
            "a#nav-link-yourAccount span.nav-line-1")
        print(account_link.text)

    def placeOrder(self):
        """Walk through add-to-cart, checkout, and final order placement."""
        print(self.browser.title)
        print("Placing order.")
        waiter = WebDriverWait(self.browser, 10)
        self.browser.find_element_by_css_selector(
            "input#add-to-cart-button").click()
        time.sleep(10)
        print(self.browser.title)
        waiter.until(EC.title_contains('Amazon.com Shopping Cart'))
        self.browser.find_element_by_css_selector(
            "a#hlb-ptc-btn-native").click()
        time.sleep(10)
        print(self.browser.title)
        waiter.until(EC.title_contains('Amazon.com Checkout'))
        self.browser.find_element_by_name("placeYourOrder1").click()
        time.sleep(20)
        print(self.browser.title)
        waiter.until(EC.title_contains('Amazon.com Thanks You'))

    def kill(self):
        """Release browser and display resources."""
        self.browser.close()
        self.display.stop()

    def start(self):
        """Run the full flow, always cleaning up, re-raising any failure."""
        try:
            self.goToPage()
            self.login()
            self.placeOrder()
        except Exception:
            print("Exception Raised")
            raise
        finally:
            self.kill()
예제 #32
0
class Scraper():
    """Scraper parent class, child classes are media streaming sites.

    Child classes are expected to set ``self.source`` (the source dict
    written alongside each media record).
    """

    def __init__(self):
        """Sets creds for each instance from creds.json in the working dir."""
        with open('creds.json', 'r') as f:
            self.creds = json.loads(f.read())

    def start_driver(self, window_size='--window-size=1920,1080'):
        """Starts headless chrome browser/driver."""
        logging.info('starting driver')
        self.display = Display(visible=0)
        # self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()

        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')  # likely necessary
        options.add_argument(window_size)
        self.driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
        self.driver.implicitly_wait(10)  # seconds

    def stop_driver(self):
        """Stops headless browser/driver.

        FIX: quit the driver while its display still exists, then stop the
        display.  The original stopped the display first, which can orphan
        the browser process or make quit() hang.
        """
        logging.info('stopping driver')
        self.driver.quit()
        self.display.stop()

    def lookup_and_write_medias(self, medias, mtype):
        """Takes list of movies or shows, searches themoviedb,
           creates object to write to database, then inserts if new
           or updates timestamp if not new.
        """
        logging.info('len(medias) before take unique: {}'.format(len(medias)))
        # get unique: list of dict into list of tuples, set, back to dict
        medias = [dict(t) for t in set([tuple(d.items()) for d in medias])]
        logging.info('len(medias) after take unique: {}'.format(len(medias)))

        for m in medias:
            # NOTE(review): self.source must be set by the child class.
            source_to_write = dict(self.source)

            # if media link exists, set source link, try link db lookup / update
            if 'link' in m.keys():
                source_to_write['link'] = m['link']
                full_media = flaskapp.db_lookup_via_link(m['link'])
                if full_media:
                    # logging.info(u'db media link found: {}'.format(m['title']))
                    flaskapp.update_media_with_source(full_media,
                                                      source_to_write)
                    continue

            # link url was not in database, therefore do themoviedb search
            sleep(0.2)
            year = m.get('year', '')

            results = flaskapp.themoviedb_search(m['title'], mtype, year=year)

            # exit iteration if search not complete or no results
            if 'total_results' not in results:
                logging.error(u'tmdb search not complete for {}: {} {}'.format(
                    mtype, m['title'], year))
                continue
            if results['total_results'] < 1:
                logging.warning(u'tmdb 0 results for {}: {} {}'.format(
                    mtype, m['title'], year))
                # empty media for db write, prevent re-searching
                full_media = dict()
                full_media['title'] = m['title']
                full_media['mtype'] = mtype
                full_media['year'] = year
                full_media['id'] = m['link']
                full_media['sources'] = []
            else:
                # assume top result is best match and use it
                full_media = results['results'][0]

                # append data so dict can be saved to database
                full_media['mtype'] = mtype
                full_media['sources'] = []
                if mtype == 'movie':
                    full_media['year'] = full_media['release_date'][:4]
                else:
                    full_media['title'] = full_media['name']
                    full_media['year'] = full_media['first_air_date'][:4]

                # check if titles are not exact match, in future may not append these
                if not flaskapp.doTitlesMatch(m['title'], full_media['title']):
                    logging.warning(u'not exact titles: {} | {}'.format(
                        m['title'], full_media['title']))

            # write db media if new
            flaskapp.insert_media_if_new(full_media)

            # update db media with source
            flaskapp.update_media_with_source(full_media, source_to_write)

    def update_watchlist_amz(self):
        """For watchlist items check if amazon prime and amazon pay
           are sources and add to db"""
        wl_unique = flaskapp.get_all_watchlist_in_db()
        for m in wl_unique:
            media = flaskapp.themoviedb_lookup(m['mtype'], m['id'])
            flaskapp.amz_prime_check(media)
            sleep(2.5)  # be polite to the API between checks
            flaskapp.amz_pay_check(media)
            sleep(2.5)
예제 #33
0
# -*- coding: utf-8 -*-
from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
import login
# Module-level virtual X display so the Firefox-based tests below can run
# on a headless machine; started once at import time.
display = Display(visible=0, size=(1024, 768))
display.start()


class TestVerCursoEditarNombre(unittest.TestCase):
    """Selenium regression test: open a course view and edit its name."""

    def setUp(self):
        # Fresh Firefox per test with a generous implicit wait.
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.base_url = "http://bakhan.accionstem.cl/"
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_ver_curso_editar_nombre(self):
        """Log in, confirm the course title, then rename the course."""
        driver = login.test_login_utp(self)
        self.assertEqual("1ro basico A 2016 Test",
                         driver.find_element_by_css_selector("font").text)
        driver.find_element_by_link_text("Ver Curso").click()
        driver.find_element_by_css_selector("button.editButton").click()
        name_input = driver.find_element_by_id("input_nombre")
        name_input.clear()
        name_input.send_keys(u"prueba máil1")
        driver.find_element_by_id("button_editar").click()
예제 #34
0
def deploy_firefox(
    status_queue: Queue,
    browser_params: BrowserParamsInternal,
    manager_params: ManagerParamsInternal,
    crash_recovery: bool,
) -> Tuple[webdriver.Firefox, Path, Optional[Display]]:
    """
    launches a firefox instance with parameters set by the input dictionary

    Milestones are reported as ("STATUS", ...) tuples on *status_queue*;
    the consumer reads them in a fixed order, so every put() below must
    execute in every configuration.

    Returns the live driver, the profile directory in use, and the virtual
    display (None unless display_mode == "xvfb").
    """
    firefox_binary_path = get_firefox_binary_path()

    root_dir = os.path.dirname(__file__)  # directory of this file

    browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
    status_queue.put(("STATUS", "Profile Created", browser_profile_path))

    # Use Options instead of FirefoxProfile to set preferences since the
    # Options method has no "frozen"/restricted options.
    # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
    fo = Options()
    # Set a custom profile that is used in-place and is not deleted by geckodriver.
    # https://firefox-source-docs.mozilla.org/testing/geckodriver/CrashReports.html
    # Using FirefoxProfile breaks stateful crawling:
    # https://github.com/mozilla/OpenWPM/issues/423#issuecomment-521018093
    fo.add_argument("-profile")
    fo.add_argument(str(browser_profile_path))

    assert browser_params.browser_id is not None
    # seed_tar is a pristine starting profile; recovery_tar is state saved
    # before a crash and only applies when not doing a fresh seeded start.
    if browser_params.seed_tar and not crash_recovery:
        logger.info("BROWSER %i: Loading initial browser profile from: %s" %
                    (browser_params.browser_id, browser_params.seed_tar))
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.seed_tar,
        )
    elif browser_params.recovery_tar:
        logger.debug("BROWSER %i: Loading recovered browser profile from: %s" %
                     (browser_params.browser_id, browser_params.recovery_tar))
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.recovery_tar,
        )
    status_queue.put(("STATUS", "Profile Tar", None))

    # "headless" uses Firefox's built-in headless mode; "xvfb" runs a real
    # windowed Firefox inside a virtual X framebuffer.
    display_mode = browser_params.display_mode
    display_pid = None
    display_port = None
    display = None
    if display_mode == "headless":
        fo.headless = True
        fo.add_argument("--width={}".format(DEFAULT_SCREEN_RES[0]))
        fo.add_argument("--height={}".format(DEFAULT_SCREEN_RES[1]))
    if display_mode == "xvfb":
        try:
            display = Display(visible=0, size=DEFAULT_SCREEN_RES)
            display.start()
            display_pid, display_port = display.pid, display.display
        except EasyProcessError:
            raise RuntimeError("Xvfb could not be started. \
                Please ensure it's on your path. \
                See www.X.org for full details. \
                Commonly solved on ubuntu with `sudo apt install xvfb`")
    # Must do this for all display modes,
    # because status_queue is read off no matter what.
    status_queue.put(("STATUS", "Display", (display_pid, display_port)))

    if browser_params.extension_enabled:
        # Write config file
        extension_config: Dict[str, Any] = dict()
        extension_config.update(browser_params.to_dict())
        extension_config["logger_address"] = manager_params.logger_address
        extension_config[
            "storage_controller_address"] = manager_params.storage_controller_address
        extension_config["testing"] = manager_params.testing
        ext_config_file = browser_profile_path / "browser_params.json"
        with open(ext_config_file, "w") as f:
            json.dump(extension_config, f, cls=ConfigEncoder)
        logger.debug("BROWSER %i: Saved extension config file to: %s" %
                     (browser_params.browser_id, ext_config_file))

        # TODO restore detailed logging
        # fo.set_preference("*****@*****.**", "all")

    # Geckodriver currently places the user.js file in the wrong profile
    # directory, so we have to create it manually here.
    # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when
    # to remove this workaround.
    # Load existing preferences from the profile's user.js file
    prefs = configure_firefox.load_existing_prefs(browser_profile_path)
    # Load default geckodriver preferences
    prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS)
    # Pick an available port for Marionette (https://stackoverflow.com/a/2838309)
    # This has a race condition, as another process may get the port
    # before Marionette, but we don't expect it to happen often
    s = socket.socket()
    s.bind(("", 0))
    marionette_port = s.getsockname()[1]
    s.close()
    prefs["marionette.port"] = marionette_port

    # Configure privacy settings
    configure_firefox.privacy(browser_params, prefs)

    # Set various prefs to improve speed and eliminate traffic to Mozilla
    configure_firefox.optimize_prefs(prefs)

    # Intercept logging at the Selenium level and redirect it to the
    # main logger.
    interceptor = FirefoxLogInterceptor(browser_params.browser_id)
    interceptor.start()

    # Set custom prefs. These are set after all of the default prefs to allow
    # our defaults to be overwritten.
    for name, value in browser_params.prefs.items():
        logger.info("BROWSER %i: Setting custom preference: %s = %s" %
                    (browser_params.browser_id, name, value))
        prefs[name] = value

    # Write all preferences to the profile's user.js file
    configure_firefox.save_prefs_to_profile(prefs, browser_profile_path)

    # Launch the webdriver
    status_queue.put(("STATUS", "Launch Attempted", None))
    fb = FirefoxBinary(firefox_path=firefox_binary_path)
    driver = webdriver.Firefox(
        firefox_binary=fb,
        options=fo,
        log_path=interceptor.fifo,
        # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for
        # when to remove this
        service_args=["--marionette-port",
                      str(marionette_port)],
    )

    # Add extension
    if browser_params.extension_enabled:

        # Install extension
        ext_loc = os.path.join(root_dir, "../Extension/firefox/openwpm.xpi")
        ext_loc = os.path.normpath(ext_loc)
        driver.install_addon(ext_loc, temporary=True)
        logger.debug("BROWSER %i: OpenWPM Firefox extension loaded" %
                     browser_params.browser_id)

    # set window size
    driver.set_window_size(*DEFAULT_SCREEN_RES)

    # Get browser process pid
    if hasattr(driver, "service") and hasattr(driver.service, "process"):
        pid = driver.service.process.pid
    elif hasattr(driver, "binary") and hasattr(driver.binary, "process"):
        pid = driver.binary.process.pid
    else:
        raise RuntimeError("Unable to identify Firefox process ID.")

    status_queue.put(("STATUS", "Browser Launched", int(pid)))

    return driver, browser_profile_path, display
예제 #35
0
def selecting_data(star_name):
    '''
    Search the INES website for a specified star.

    Downloads (and unpacks) the star's rebinned IUE spectra into
    ``iue/<star_name>``; does nothing if that directory already exists.

    :param star_name: name of the star (string)
    :return: request of the star name in INES page
    '''

    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(800, 600))
    display.start()

    # now Chrome will run in a virtual display.
    # you will not see the browser.

    try:
        # Starting the searching
        if os.path.isdir('iue/' + star_name) is False:
            os.mkdir('iue/' + star_name)

            folder_data = 'iue/' + star_name

            # Define global Chrome properties
            options = webdriver.ChromeOptions()
            prefs = {"download.default_directory": folder_data}
            options.add_experimental_option("prefs", prefs)

            browser = webdriver.Chrome(chrome_options=options)

            # Define web source
            ines_site = "http://sdc.cab.inta-csic.es/cgi-ines/IUEdbsMY"

            # Openning it
            browser.get(ines_site)

            # Selecting all data
            mySelect = Select(browser.find_element_by_name("limit"))
            mySelect.select_by_value("all")
            time.sleep(3)

            # Selecting some stars
            browser.find_element_by_name("object").send_keys(star_name)
            browser.find_element_by_name(".submit").click()

            # Taking the data
            try:
                browser.find_element_by_name("markRebin").click()
                browser.find_element_by_name(".submitNH").click()
                time.sleep(10)
            except Exception:
                # FIX: was a bare ``except:`` which also swallowed
                # KeyboardInterrupt/SystemExit.
                print('There is no data for this star!')

            # Unzip files
            outdir = os.getcwd()
            os.chdir(folder_data)
            file_list = glob('*')
            if len(file_list) != 0:
                fname = str(file_list[0])
                tar = tarfile.open(fname, "r:gz")
                tar.extractall()
                tar.close()
                os.system('rm *.gz')
            os.chdir(outdir)
            browser.close()
    finally:
        # FIX: the original never stopped the virtual display (leaking an
        # Xvfb process), and not at all when the star directory existed.
        display.stop()

    return
예제 #36
0
class BrowserWebdriver(BrowserBase):
    skip_urls = []

    def __init__(self, *args, **kwargs):
        """Initialise base state plus first-navigation bookkeeping."""
        BrowserBase.__init__(self, *args, **kwargs)
        # Set on the first _http_get(); see _http_get below.
        self._first_navigation_ts = None
        self._first_navigation_netloc = None
        # NOTE(review): _ts_offset is never written in this class --
        # presumably maintained by subclasses.
        self._ts_offset = None

    def _skip_url(self, page, url):
        """Return True when *url*'s netloc matches a skip pattern but the
        owning *page*'s netloc does not."""
        if not url:
            return False

        _, req_netloc, _ = parse_url(url)

        for pattern in self.skip_urls:
            if pattern not in req_netloc:
                continue
            _, page_netloc, _ = parse_url(page.url)
            if not any(p in page_netloc for p in self.skip_urls):
                self.log_debug("skipping URL %s" % req_netloc)
                return True
        return False

    def _browser_clear_caches(self):
        """Clear caches by restarting the browser process entirely."""
        BrowserBase._browser_clear_caches(self)
        # A fresh browser process is the reliable way to drop all caches.
        self.driver.quit()
        self.pid = self.browser_start()

    def _browser_navigate(self, location, cached=True, name=None):
        """Drive the browser to *location* (Page or raw URL); wrap in a Page."""
        if isinstance(location, Page):
            target = location.url
        else:
            target = location
        # _http_get returns False when the "URL" embedded an xpath click.
        return Page(self,
                    target,
                    cached,
                    name=name,
                    real_navigation=self._http_get(target))

    def _browser_wait(self, page, timeout=None):
        """Block until *page* finishes loading.

        Phase 1 (up to timeout/2): poll for a non-zero loadEventEnd, i.e.
        the onload event has been processed.  Phase 2 (up to the full
        timeout): wait in ajax_threshold steps until no tracked requests
        remain incomplete.  Logs errors on timeout; always marks the page
        complete at the end.
        """

        self.log_info("_browser_wait()...")

        if timeout is None:
            timeout = self.nav_timeout

        start = time.time()
        while time.time() - start < timeout / 2:
            time.sleep(0.2)
            if self.driver.execute_script(
                    "return window.performance.timing.loadEventEnd"):
                break
            # onload event has not been processed yet, so need to wait and retry
            self.log_info("Waiting for loadEventEnd ... ")

        while time.time() - start < timeout:
            time.sleep(self.ajax_threshold)

            # hack. Execute something in browser context to flush logs...
            self.driver.execute_script(
                "return window.performance.timing.loadEventEnd")

            self._browser_get_events(page)

            ir = page.get_incomplete_reqs()
            if not ir:
                break
            self.log_info(
                "Waiting for incomplete requests:\n    %s" %
                ("\n    ".join(["%s - %s" % (r.id, r.url) for r in ir])))

        if time.time() - start >= timeout:
            if not self.driver.execute_script(
                    "return window.performance.timing.loadEventEnd"):
                self.log_error(
                    "Page '%s' load timeout, window.performance.timing.loadEventEnd = 0"
                    % page.url)

            ir = page.get_incomplete_reqs()
            if ir:
                self.log_error(
                    "Can't wait for page '%s' load completion, "
                    "see '%s' for details\nincomplete requests:\n    %s" %
                    (page.url, self.log_path, "\n    ".join(
                        ["%s - %s" % (r.id, r.url) for r in ir])))

        page.complete(self)

    def _browser_warmup_page(self, location, name=None):
        """Prime caches by visiting *location* without recording stats."""
        self.navigate_to(location, cached=False, stats=False, name=name)

    def _browser_display_init(self, headless, resolution):
        """Start a virtual X display when *headless*, else record no display."""
        if not headless:
            self.display = None
            return
        try:
            from pyvirtualdisplay import Display
        except ImportError as e:
            # Headless mode is impossible without pyvirtualdisplay.
            abort(e)
        self.display = Display(visible=0, size=resolution)
        self.display.start()

    def _browser_execute_script(self, js):
        """Evaluate JS expression *js* in the page, log and return its value."""
        result = self.driver.execute_script("return %s" % js)
        self.log_debug("%s = %s" % (js, result))
        return result

    def browser_get_name(self):
        """Name of the underlying browser as reported by its capabilities."""
        return self.driver.capabilities['browserName']

    def browser_get_version(self):
        """Browser version, whichever capability key the driver provides."""
        return self._get_val(self.driver.capabilities,
                             ['version', 'browserVersion'])

    def browser_get_platform(self):
        """Platform name, whichever capability key the driver provides."""
        return self._get_val(self.driver.capabilities,
                             ['platform', 'platformName'])

    def browser_get_screenshot_as_file(self, filename):
        """Save a screenshot of the current page to *filename*."""
        self.driver.get_screenshot_as_file(filename)

    def browser_get_page_timeline(self, page):
        """Collect window.performance.timing values for every event type
        that has a JS counterpart, wrapped in a PageTimeline."""
        values = {
            t: self._browser_execute_script(
                "window.performance.timing.%s" % PageTimeline.jstypes[t])
            for t in PageTimeline.types if t in PageTimeline.jstypes
        }
        return PageTimeline(page, values)

#    def browser_set_session(self, domain, session_id):
#        self._http_get(domain)
#        self.driver.add_cookie({'name': 'sessionid', 'value': session_id})

    def browser_get_current_url(self):
        """Return the URL currently shown in the browser."""
        return self.driver.current_url

    def browser_get_screenshot(self, filename):
        """Save a screenshot to *filename*.

        NOTE(review): duplicates browser_get_screenshot_as_file above --
        likely kept for interface compatibility.
        """
        self.driver.get_screenshot_as_file(filename)

    def browser_stop(self):
        """Quit the driver and stop the display, once each (idempotent).

        URLError (a webdriver whose remote end is already gone) is
        deliberately swallowed; in that case the attributes stay set.
        """
        try:
            if self.driver:
                self.driver.quit()
                self.driver = None
            if self.display:
                self.display.stop()
                self.display = None
        except URLError:
            pass

    def _xpath_click(self, xpath):
        """Find *xpath* (trying %23/# spelling variants) and click it.

        Raises BrowserExc wrapping the last lookup failure when no variant
        can be located and clicked.
        """
        exc = None

        # take into account possible replacements of %23/#
        xpaths = [xpath]
        if "%23" in xpath:
            xpaths.append(xpath.replace("%23", "#"))
        if "#" in xpath:
            xpaths.append(xpath.replace("#", "%23"))

        for x in xpaths:
            self.log_debug("Looking for xpath: %s ..." % x)
            try:
                el = self.driver.find_element_by_xpath(x)
                el.click()
                self.log_debug("Looking for xpath: %s ... OK" % x)
                return
            except NoSuchElementException as e:
                self.log_debug(
                    "Looking for xpath: %s ... Failed, no such element" % x)
                exc = e
            except ElementNotVisibleException as e:
                self.log_warning(
                    "Looking for xpath: %s ... Failed, element not visible" %
                    x)
                exc = e

        self.log_error("NoSuchElementException, xpath: %s, see debug log" %
                       xpath)
        self.log_debug("page source:\n%s" %
                       self.driver.page_source.encode('ascii', 'ignore'))
        # FIX: the original raised BrowserExc(e), but `e` is scoped to its
        # except clause in Python 3 and was unbound here (NameError); the
        # saved `exc` was assigned but never used.
        raise BrowserExc(exc)

    def _http_get(self, url, validator=None):
        """Navigate the browser to *url*.

        Returns True for a real navigation, False when the "URL" embedded
        an xpath ("page^//xpath") and an element click was performed
        instead.  *validator* is accepted for interface compatibility but
        unused here.  Raises BrowserExc on WebDriverException.
        """
        self.log_debug("Execute GET request: %s" % url)

        # Remember when/where the very first navigation happened.
        if not self._first_navigation_ts:
            self._first_navigation_ts = time.time()
            _, self._first_navigation_netloc, _ = parse_url(url)

        # "url^xpath" means: click the element instead of navigating.
        ar = url.split("^")
        if len(ar) > 1:
            self._xpath_click(ar[1])
            return False

        try:
            self.driver.get(url)
        except WebDriverException as e:
            raise BrowserExc(e)
        return True

    @staticmethod
    def _get_val(d, keys):
        """Return the first of *keys* present in mapping *d*, else "unknown"."""
        for candidate in keys:
            if candidate in d:
                return d[candidate]
        return "unknown"

    def print_browser_info(self):
        """Print a human-readable summary of the running browser session."""
        # NOTE(review): `c` is unused here; the getters read capabilities
        # themselves.
        c = self.driver.capabilities
        self.print_stats_title("Browser summary")
        print("  - platform: %s" % self.browser_get_platform())
        print("  - browser:  %s %s" %
              (self.browser_get_name(), self.browser_get_version()))
        print("  - PID:      %d" % self.pid)
        print("  - log file: %s" % self.log_path)

    def print_log_file_path(self):
        """Print where this browser's log file lives."""
        self.print_stats_title("Browser log file")
        print("  %s" % self.log_path)

    # === virtual methods that must be implemented in every webdriver-based browser === #

    def _browser_parse_logs(self, page, logs):
        """Abstract: parse raw driver logs into *page* events (per-browser)."""
        raise BrowserExcNotImplemented()

    def _browser_get_events(self, page):
        """Abstract: pull pending browser events into *page* (per-browser)."""
        raise BrowserExcNotImplemented()

    # === webdriver specific === #

    def dom_wait_element_stale(self, el, timeout_s=None, name=None):
        """Poll until *el* detaches from the DOM (page was replaced).

        Raises BrowserExcTimeout when the element never goes stale within
        *timeout_s* (defaults to the navigation timeout).
        """
        start_time = time.time()

        if timeout_s is None:
            timeout_s = self.nav_timeout

        # http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html
        while time.time() < start_time + timeout_s:
            try:
                # Any call on a detached element raises
                # StaleElementReferenceException -- that's the signal.
                el.find_elements_by_id('doesnt-matter')
                pass
            except StaleElementReferenceException:
                break
            time.sleep(0.1)

        if time.time() > start_time + timeout_s:
            msg = "DOM element '%s' click() timeout: %.1fs" % (
                name, time.time() - start_time)
            self.log_error(msg)
            raise BrowserExcTimeout(msg)

    def dom_click(self,
                  el,
                  timeout_s=None,
                  name=None,
                  wait_callback=None,
                  wait_callback_obj=None):
        """Click *el* and wait for the resulting (possibly AJAX) navigation.

        Records the interaction as a Page named *name*: click, wait for the
        old DOM to go stale (or delegate to *wait_callback*), then wait for
        AJAX traffic to settle before re-reading the current URL.
        """
        self.log_debug("dom_click(%s, %s)" % (str(el), str(name)))

        if timeout_s is None:
            timeout_s = self.nav_timeout

        p = Page(self,
                 self.browser_get_current_url(),
                 True,
                 name=name,
                 real_navigation=False)
        p.start()

        # 1. click on the element

        old_page = self.driver.find_element_by_tag_name('html')
        el.click()

        # 2. wait for selenium onclick completion

        if wait_callback:
            self.log_debug(
                "wait callback: %s, %s" %
                (str(wait_callback.__name__), str(wait_callback_obj)))
            wait_callback(wait_callback_obj, el, timeout_s, name)
        else:
            self.log_debug("wait stale: %s, %s, %s" % (el, timeout_s, name))
            self.dom_wait_element_stale(el, timeout_s, name)

        # 3. wait for ajax completion, because browser URL can be update only after that

        self._browser_wait(p, timeout=timeout_s)
        p.url = self.browser_get_current_url()

        time.sleep(0.2)

    def dom_find_element_by_id(self, id):
        """Find an element by DOM id, translating lookup failure to BrowserExc."""
        try:
            return self.driver.find_element_by_id(id)
        except NoSuchElementException as e:
            raise BrowserExc(e)

    def dom_find_element_by_name(self, name):
        """Find an element by name attribute, translating failure to BrowserExc."""
        try:
            return self.driver.find_element_by_name(name)
        except NoSuchElementException as e:
            raise BrowserExc(e)

    def dom_find_element_by_xpath(self, xpath):
        """Find an element by xpath, translating lookup failure to BrowserExc."""
        try:
            return self.driver.find_element_by_xpath(xpath)
        except NoSuchElementException as e:
            raise BrowserExc(e)

    def dom_find_frames(self):
        """Return every frame and iframe element on the current page."""
        found = []
        for tag in ("frame", "iframe"):
            try:
                found.extend(self.driver.find_elements_by_tag_name(tag))
            except NoSuchElementException:
                pass
        return found

    def dom_switch_to_frame(self, frame):
        """Switch the driver's context into *frame*."""
        self.log_info("Switching to frame %s" % frame)
        return self.driver.switch_to.frame(frame)

    def dom_switch_to_default_content(self):
        """Switch the driver's context back to the top-level document."""
        self.log_info("Switching to default content")
        return self.driver.switch_to.default_content()

    def dom_send_keys(self, el, keys):
        """Type *keys* into element *el* character by character, then verify
        the element's value is exactly *keys*.

        When send_keys() leaves a different value behind (flaky drivers,
        widgets that drop fast input), fall back to setting the value
        attribute via JavaScript.  Returns True on success, False when
        neither approach produced the expected value.
        """
        val = el.get_attribute('value')
        if val != '':  # clear initial value
            self.log_info("Element value is not empty, clear content...")
            self.driver.execute_script("arguments[0].value = ''", el)
            time.sleep(2.0)

        for ch in keys:
            el.send_keys(ch)
            time.sleep(0.2)  # slow typing; some widgets drop rapid keystrokes
        val = el.get_attribute('value')
        if val == keys:
            return True

        self.log_warning("Bogus selenium send_keys(). Entered: '%s', "
                         "but see: '%s', using set_attribute()..." %
                         (keys, val))
        time.sleep(2.0)
        # BUG FIX: the value used to be spliced into the script text with %s,
        # which broke (or allowed script injection) whenever *keys* contained
        # quotes or backslashes; pass it as a proper script argument instead.
        self.driver.execute_script("arguments[0].value = arguments[1]", el,
                                   keys)
        time.sleep(2.0)
        val = el.get_attribute('value')
        if val == keys:
            self.log_info("Ok, set_attribute() works fine")
            return True

        self.log_error(
            "Bogus selenium send_keys() and set_attribute(), can't enter value into the element"
        )
        return False

    # === some predefined scenarios === #

    def _do_send_keys(self, title, keys, tag_names, tag_ids):
        """Locate an input field and type *keys* into it.

        Candidates are tried by element name first, then by @label
        attribute, then by element id — the same order as before.
        Returns True once a candidate accepts the keys, False when typing
        failed or no candidate matched.
        """
        def find_by_label(label):
            return self.dom_find_element_by_xpath(
                '//*[@label="{}"]'.format(label))

        candidates = []
        for tag, name in tag_names:
            candidates.append((self.dom_find_element_by_name, tag, name))
        for tag, name in tag_names:
            candidates.append((find_by_label, tag, name))
        for tag, ident in tag_ids:
            candidates.append((self.dom_find_element_by_id, tag, ident))

        for finder, tag, locator in candidates:
            try:
                el = finder(locator)
            except BrowserExc:
                continue
            if el.tag_name != tag:
                continue
            if not self.dom_send_keys(el, keys):
                self.log_error("Couldn't enter %s" % title)
                return False
            return True

        self.log_info("Couldn't find %s input field" % title)
        return False

    def _do_login(self, url, user, password, login_form, timeout_s=None):
        """Fill in and submit the login form on the current page/frame.

        *login_form* is a locator descriptor providing user_tags/user_ids,
        pass_tags/pass_ids and sbmt_tags/sbmt_ids/sbmt_xpath.  *timeout_s*
        is forwarded to dom_click().  Success heuristic: the submit control
        disappears from the DOM after being clicked.  Returns True on
        success, False otherwise.
        """
        if not self._do_send_keys('user name', user, login_form.user_tags,
                                  login_form.user_ids):
            return False

        time.sleep(1)

        if not self._do_send_keys('password', password, login_form.pass_tags,
                                  login_form.pass_ids):
            return False

        time.sleep(1)

        submit_form_found = False
        for tag, name in login_form.sbmt_tags:
            try:
                el = self.dom_find_element_by_name(name)
                if el.tag_name != tag:
                    continue
                submit_form_found = True
                self.dom_click(el, name=name, timeout_s=timeout_s)

                # Submit control gone after the click -> page changed -> success.
                try:
                    el = self.dom_find_element_by_name(name)
                except BrowserExc:
                    self.log_info("Login succeed")
                    return True

            except BrowserExc:
                pass

        for tag, id in login_form.sbmt_ids:
            try:
                el = self.dom_find_element_by_id(id)
                if el.tag_name != tag:
                    continue
                submit_form_found = True
                self.dom_click(el, name=id, timeout_s=timeout_s)

                try:
                    el = self.dom_find_element_by_id(id)
                except BrowserExc:
                    self.log_info("Login succeed")
                    return True

            except BrowserExc:
                pass

        for x in login_form.sbmt_xpath:
            try:
                el = self.dom_find_element_by_xpath(x)
                submit_form_found = True
                # BUG FIX: this click previously passed name=id, reusing the
                # stale loop variable from the loop above (or the id() builtin
                # when sbmt_ids was empty); report the xpath itself instead.
                self.dom_click(el, name=x, timeout_s=timeout_s)

                try:
                    el = self.dom_find_element_by_xpath(x)
                except BrowserExc:
                    self.log_info("Login succeed")
                    return True

            except BrowserExc:
                pass

        if not submit_form_found:
            self.log_info("Couldn't find login submit form")

        self.log_info("Login failed")
        return False

    def do_login(self, url, user, password, login_form, timeout_s=None):
        """Navigate to *url* and try to log in as *user*.

        The login form is first searched on the top-level document; when
        that fails, every <frame>/<iframe> is tried in turn.

        NOTE(review): on failure the driver may be left switched into the
        last frame tried (there is no switch back to default content) —
        confirm callers re-navigate before doing further DOM work.
        """
        self.log_info("Trying to login to '%s' under user %s" % (url, user))
        self.navigate_to(url, cached=None)

        if self._do_login(url, user, password, login_form,
                          timeout_s=timeout_s):
            return True

        for frame in self.dom_find_frames():
            self.dom_switch_to_frame(frame)
            if self._do_login(url,
                              user,
                              password,
                              login_form,
                              timeout_s=timeout_s):
                return True

        self.log_info("Login to '%s' under user '%s' has been failed" %
                      (url, user))
        return False
예제 #37
0
class SlackSpider():
    """Crawler for slackarchive.io mirrors (Python 2 + selenium).

    Enumerates a team's channels, walks every archive page of each channel
    and stores each message as a SlackArchive row via runSpider().
    """
    def __init__(self):
        #self.all_items = []
        self.channelList = []   # channel names scraped from the sidebar
        self.dataList = []
        self.pageSize = 0       # page count of the channel currently visited
        self.urlsToHit = []     # [teamName, channelName, pageUrl] triples
        self.TeamName = ''
        self.ChannelName = ''

    # Open headless chromedriver
    def start_driver(self):
        print('starting driver...')
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")
        sleep(randint(9, 10))  # random pause, presumably rate-limiting; TODO confirm

    # Close chromedriver
    def close_driver(self):
        print('closing driver...')
        self.display.stop()
        self.driver.quit()
        print('closed!')

    # Tell the browser to get a page
    def get_page(self, url):
        print('getting page...{0}'.format(url))
        self.driver.get(url)
        sleep(randint(9, 10))  # let the JS-rendered archive page settle

    # Grab items from divisions
    def grab_list_items(self):
        """Extract every message <li> on the current page into SlackArchive
        objects; the last seen avatar is carried forward for messages that
        omit their own (consecutive messages from one sender)."""
        print('grabbing list of items...')
        senderAvatar = ''
        all_items = []
        for div in self.driver.find_elements_by_xpath(
                '//ul[@class="messages"]//li'):
            data = self.process_elements(div, senderAvatar)

            if data:
                all_items.append(data)
                if data.senderAvatar != '':
                    senderAvatar = data.senderAvatar
        return all_items

    # Process division elements
    def process_elements(self, div, senderAvatar):
        """Build a SlackArchive object from one message <li>, or return
        None when the expected sub-elements are missing."""
        msg_sender_avatar = ''
        try:
            msg_sender = div.find_element_by_class_name(
                "msg-user").get_attribute('innerText')
            msg_time = div.find_element_by_class_name(
                "msg-time").get_attribute('innerText')
            msg_body = div.find_element_by_class_name(
                "msg-body").get_attribute('innerText')
        except Exception as error:
            print 'element not found exception'
            return None

        try:
            avatar = div.find_element_by_xpath('.//*[@class="msg-avatar"]')
            msg_sender_avatar = avatar.find_element_by_class_name(
                'msg-thumb').get_attribute('src')
        except Exception as error:
            # no avatar on this message: inherit the previous sender's
            msg_sender_avatar = senderAvatar

        if msg_sender and msg_time and msg_body:
            archiveObj = SlackArchive()
            archiveObj.teamName = self.TeamName
            archiveObj.channelName = self.ChannelName
            archiveObj.messageBody = msg_body
            archiveObj.senderAvatar = msg_sender_avatar
            archiveObj.messageTime = msg_time
            archiveObj.messageSender = msg_sender
            return archiveObj

        else:
            return None

    # Parse the URL
    def parse(self, url):
        """Load *url* and return the list of messages found on it."""
        self.get_page(url)
        return self.grab_list_items()
        pass

    # Get list of channels in a team
    def getChannelList(self):
        for channelName in self.driver.find_elements_by_xpath(
                '//ul[@class="channels-list"]//li//a'):
            self.channelList.append(channelName.text)
        pass

    # Get the total number of pages in each channel in each page
    def getPageSize(self, url_Template):
        # the "active" pagination item holds the total page count
        for page in self.driver.find_elements_by_xpath(
                '//ul[@class="pagination pagination-vertical"]//li[@class="page-item active"]'
        ):
            self.pageSize = int(page.text)
        pass

    # Build the list of URL's to hit
    def buildTarget(self, teamName):
        """Populate self.urlsToHit with every archive page URL of every
        channel of *teamName*."""
        url_Template = "https://{0}.slackarchive.io/".format(teamName)
        self.get_page(url_Template)
        self.getChannelList()
        if teamName == 'buffercommunity':
            # NOTE(review): first 7 sidebar entries skipped for this team
            # only — presumably non-channel links; confirm against the site.
            self.channelList = self.channelList[7:]
        for channel in self.channelList:
            channelName = channel[1:].strip()  # drop the leading '#'
            urlA = url_Template + channelName + "/"
            self.get_page(urlA)
            self.getPageSize(urlA)
            print 'Page size: {0}'.format(self.pageSize)
            for i in range(1, self.pageSize + 1):
                urlObject = []
                urlObject.append(teamName)
                urlObject.append(channelName)
                urlObject.append(urlA + "page-" + str(i))
                self.urlsToHit.append(urlObject)
        pass

    # Run the crawler
    def runSpider(self, teamName):
        """Crawl the whole team and persist every message to the database."""
        self.buildTarget(teamName)
        Utils.get_Connection_SNA4Slack()
        sync_table(SlackArchive)

        for url in self.urlsToHit:
            self.TeamName = url[0]
            self.ChannelName = url[1]
            count = 0
            for data in self.parse(url[2]):
                if data:
                    count += 1
                    node_object = SlackArchive(
                        id=uuid.uuid1(),
                        teamName=data.teamName,
                        channelName=data.channelName,
                        messageSender=data.messageSender.rstrip().lstrip(),
                        messageBody=data.messageBody.rstrip().lstrip(),
                        senderAvatar=data.senderAvatar,
                        messageTime=dateutil.parser.parse(data.messageTime))
                    node_object.save()
            if count > 0:
                print '{0} rows saved'.format(count)

            else:
                print url[2]
                print 'No data found'

    pass
예제 #38
0
class LexisNexisSpider(scrapy.Spider):
    """Scrapy spider (Python 2) that drives a real Firefox through the
    LexisNexis web UI: pages through search results, opens each article
    and yields LexisnexisArticleItem objects enriched with RAKE keywords.
    """
    name = 'lexisnexis'
    start_urls = []

    s_date = ''
    e_date = ''
    c_date = ''
    page_cnt = 1
    dont_filter = True
    agency_list = []
    '''
    today = datetime.now() + timedelta(days = -3)
    date = str(today)[0:10]
    year = date[0:4]
    month = date[5:7]
    day = date[8:10]
    '''
    '''
    Constructor
    '''
    def __init__(self, keyword='nation', *args, **kwargs):
        # start_urls is a dummy; the real navigation happens via selenium.
        self.keyword = keyword
        self.start_urls = ['http://www.google.com']
        super(LexisNexisSpider, self).__init__(*args, **kwargs)

        self.display = Display(visible=0, size=(1280, 1024))
        self.display.start()
        profile = webdriver.FirefoxProfile()
        profile.native_events_enabled = True
        self.driver = webdriver.Firefox(profile)
        # self.driver2 = webdriver.Firefox(profile)
        self.driver.get(self.get_query_url(self.keyword))
        time.sleep(3)

    def __del__(self):
        # Tear down the browser and the virtual display when the spider dies.
        self.driver.close()
        self.driver.quit()
        self.display.stop()
        print '************************************************************************'
        print 'CLOSED!!!'

    '''
    Get the query url
    '''

    def get_query_url(self, keyword):
        # Searches the last 25 days across a fixed list of news sources
        # (the csi= codes; see the commented block below for their names).
        today = datetime.now() + timedelta(days=-25)
        date = str(today)[0:10]
        year = date[0:4]
        month = date[5:7]
        day = date[8:10]
        return 'http://www.lexisnexis.com/lnacui2api/api/version1/sr?sr=%28' + keyword + '%29%20and%20Date%28geq%28' + month + '/' + day + '/' + year + '%29%29&csi=8006%2C6742%2C8213%2C8142%2C8075%2C11810%2C306884%2C247189%2C163823%2C301477&oc=00006&hgn=t&hl=t&hes=t&hnsl=t&hsl=t&hdym=t&hfb=t&ssl=f&stp=bool&icvrpg=true'
        '''
					#The New York Times
					+'%2C6742' \
					# USA TODAY
					+'%2C8213' \
					#Wall Street Journal Abstracts
					+'%2C8142' \
					#The Washington Post
					+'%2C8075' \
					#Post-Dispatch
					+'%2C11810' \
					#The Baltimore Sun
					+'%2C306884' \
					#The Philadelphia Inquirer
					+'%2C247189' \
					#Chicago Daily Herald
                    +'%2c163823'
					#Arizona Capitol Times
                    +'%2c301477'
		'''
        #return 'http://www.lexisnexis.com/lnacui2api/api/version1/sr?sr=%28' + keyword + '%29%20and%20Date%28geq%28'+ month + '/' + day + '/' + year + '%29%29&csi=8006%2C6742%2C8213%2C8142%2C8075&oc=00006&hgn=t&hl=t&hes=t&hnsl=t&hsl=t&hdym=t&hfb=t&ssl=f&stp=bool&icvrpg=true'
        #return 'http://www.lexisnexis.com/lnacui2api/api/version1/sr?sr=%28' + keyword + '%29%20and%20Date%28geq%284/5/2011%29%29&csi=8006%2C6742%2C8213%2C8142%2C8075&oc=00006&hgn=t&hl=t&hes=t&hnsl=t&hsl=t&hdym=t&hfb=t&ssl=f&stp=bool&icvrpg=true'

    def next_page(self, start_index):
        """Advance the driver to the results page starting at *start_index*.

        Returns False when there is no "next" pagination arrow (end of
        results), True after the page and its content frame are loaded.
        """
        try:
            # only used as an existence probe; the element itself is unused
            next_button = self.driver.find_element_by_xpath(
                '//table//table//table//table//table//table//td[@align="right"]/a/img[@src="images/IconPaginationNext.gif"]'
            )
        except:
            return False
            pass
        risb = self.driver.find_element_by_xpath(
            '//input[@name="risb"]').get_attribute("value")
        nexpage = "http://www.lexisnexis.com/lnacui2api/results/listview/listview.do?start=" + str(
            start_index) + "&sort=RELEVANCE&format=GNBLIST&risb=" + risb
        self.driver.get(nexpage)
        time.sleep(2)
        # drill into the frame that actually holds the result list
        source = self.driver.find_element_by_xpath(
            '//frame[@title="Results Content Frame"]')
        self.driver.get(source.get_attribute("src"))
        time.sleep(2)
        return True

    '''
    Starting point
    Retrieve the news link from the list of search results.
    Args:
     response - the response object pertaining to the search results page
    '''

    def parse(self, response):
        button_continue = self.driver.find_element_by_xpath(
            '//a[@id="firstbtn"]')
        try:
            button_continue.click()
        except:
            # NOTE(review): adjacent literals concatenate to
            # "cant find continue button " — apostrophe was likely intended.
            print 'can' 't find continue button '
        source = self.driver.find_element_by_xpath(
            '//frame[@title="Results Content Frame"]')
        self.driver.get(source.get_attribute("src"))
        time.sleep(5)
        # Pass 1: collect title/url/agency for every result, 25 per page.
        item_list = list()
        start_id = 1
        while self.next_page(start_id):
            noshade_list = self.driver.find_elements_by_xpath(
                '//tr[@class="noshaderow1st"]')
            shade_list = self.driver.find_elements_by_xpath(
                '//tr[@class="shaderow1st"]')
            for news in noshade_list + shade_list:
                button = news.find_element_by_xpath('.//a')
                news_title = button.text
                news_url = button.get_attribute("href")
                news_agency = news.find_element_by_xpath(
                    './/span[@class="notranslate"]').text

                article = LexisnexisArticleItem()
                article['title'] = news_title
                article['url'] = news_url
                article['agency'] = news_agency
                item_list.append(article)
            start_id += 25
            print "++++++++++++++++++", len(item_list)
        # Pass 2: open each article and scrape date, id, body and keywords.
        for article in item_list:
            self.driver.get(article['url'])
            time.sleep(2)
            try:
                source = self.driver.find_element_by_xpath(
                    '//frame[@title="Results Document Content Frame"]')
                self.driver.get(source.get_attribute('src'))
                time.sleep(2)
                date_str = self.driver.find_element_by_xpath(
                    '//span[@class="verdana"]/center').text
                news_date = self.parse_date(date_str)

                news_id = self.driver.find_element_by_xpath(
                    '//input[@name="docIdentifier"]')
                news_id = news_id.get_attribute('value')

                news_content_list = self.driver.find_elements_by_xpath(
                    '//span[@class="verdana"]/p[@class="loose"]')
                news_content_list = [n.text for n in news_content_list]
                news_content = '.'.join(news_content_list)

                #Get keywords
                rake = Rake()
                keywords_list = rake.run(news_content)
                keywords = '\n'.join(keywords_list)
                tag = rake.get_tagged_text()

                #article['keywords'] = keywords
                article['aid'] = news_id
                article['date'] = news_date
                article['contents'] = news_content
                article['keywords'] = keywords
                article['tagged_text'] = tag
            except Exception, e:
                # article is still yielded below with partial fields
                print 'ERROR!!!!!!!!!!!!!  URL :'
                print traceback.print_exc(file=sys.stdout)

            yield article
예제 #39
0
 def __init__(self):
     """Start a hidden 1120x600 virtual X display and a Chrome driver.

     BUG FIX: the Display is now kept on the instance; it was previously a
     local, so the Xvfb process could never be stop()ped afterwards.
     """
     self.display = Display(visible=0, size=(1120, 600))
     self.display.start()
     self.driver = webdriver.Chrome()
     self.url = 'https://edit.yahoo.com/forgot?stage=fe100'
예제 #40
0
def display(request):
    """Start a hidden 1920x1080 virtual X display for the
    [email protected] logged-in user session."""
    from pyvirtualdisplay import Display
    virtual_display = Display(visible=0, size=(1920, 1080))
    virtual_display.start()
예제 #41
0
 def selenium_browser(self):
     """Return a Chrome webdriver; when $BROWSER == "HEADLESS" a virtual
     X display (Xvfb) is started first so Chrome can run without a screen.

     BUG FIX: the Display was previously a local that went out of scope,
     so it could never be stop()ped; keep it on the instance instead.
     """
     browser = str(os.getenv('BROWSER', None))  # "None" string when unset
     if browser == "HEADLESS":
         self.display = Display(visible=0, size=(800, 600))
         self.display.start()
     return webdriver.Chrome(chrome_options=self.chrome_options)
예제 #42
0
def get_urls(query, url, verbose=False, warning=True, user_agent=None, proxy=None, **kwargs):
    """
      Bypass Google captchas and Google API by using selenium-webdriver to gather
      the Google URL. This will open a robot controlled browser window and attempt
      to get a URL from Google that will be used for scraping afterwards.

      Only downside to this method is that your IP and user agent will be visible
      until the application pulls the URL.

      *proxy* is an optional single-entry mapping {scheme: "host:port"}
      applied to HTTP/FTP/SSL traffic.  Returns the browser's current URL
      after submitting *query* through the search box on *url*.
    """
    if verbose:
        logger.debug(set_color(
            "setting up the virtual display to hide the browser...", level=10
        ))
    ff_display = Display(visible=0, size=(800, 600))
    ff_display.start()
    logger.info(set_color(
        "firefox browser display will be hidden while it performs the query..."
    ))
    if warning:
        logger.warning(set_color(
            "your web browser will be automated in order for Zeus to successfully "
            "bypass captchas and API calls. this is done in order to grab the URL "
            "from the search and parse the results. please give selenium time to "
            "finish it's task...", level=30
        ))
    if verbose:
        logger.debug(set_color(
            "running selenium-webdriver and launching browser...", level=10
        ))

    if verbose:
        logger.debug(set_color(
            "adjusting selenium-webdriver user-agent to '{}'...".format(user_agent), level=10
        ))
    if proxy is not None:
        # BUG FIX: dict.keys() is a non-indexable view on Python 3; make it
        # a list so proxy_type[0] works on both Python 2 and Python 3.
        proxy_type = list(proxy.keys())
        proxy_to_use = Proxy({
            "proxyType": ProxyType.MANUAL,
            "httpProxy": proxy[proxy_type[0]],
            "ftpProxy": proxy[proxy_type[0]],
            "sslProxy": proxy[proxy_type[0]],
            "noProxy": ""
        })
        if verbose:
            logger.debug(set_color(
                "setting selenium proxy to '{}'...".format(
                    ''.join(proxy_type) + "://" + ''.join(list(proxy.values()))
                ), level=10
            ))
    else:
        proxy_to_use = None

    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", user_agent)
    browser = webdriver.Firefox(profile, proxy=proxy_to_use)
    logger.info(set_color("browser will open shortly..."))
    browser.get(url)
    if verbose:
        logger.debug(set_color(
            "searching search engine for the 'q' element (search button)...", level=10
        ))
    search = browser.find_element_by_name('q')
    logger.info(set_color(
        "searching '{}' using query '{}'...".format(url, query)
    ))
    search.send_keys(query)
    search.send_keys(Keys.RETURN)  # hit return after you enter search text
    time.sleep(3)
    if verbose:
        logger.debug(set_color(
            "obtaining URL from selenium..."
        ))
    retval = browser.current_url
    if verbose:
        logger.debug(set_color(
            "found current URL from selenium browser '{}'...".format(retval), level=10
        ))
    logger.info(set_color(
        "closing the browser and continuing process.."
    ))
    browser.close()
    ff_display.stop()
    return retval
def AdjustResolution():
    """Start a hidden 800x800 virtual X display (Xvfb).

    Returns the started Display so the caller can stop() it later; the
    original dropped the handle, leaking the Xvfb process.  Callers that
    ignored the (previously None) return value are unaffected.
    """
    display = Display(visible=0, size=(800, 800))
    display.start()
    return display
예제 #44
0
파일: Scraper.py 프로젝트: jmozah/scarface
class PinterestImages():
    """Scrape Pinterest search results for image URLs behind a virtual
    X display, logging in first, then writing "<url>\\t<title>" lines to
    a file named after the command-line query.
    """

    def __init__(self):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.srchurl = 'https://in.pinterest.com/search/pins/?q=%s'
        self.base_url = self.srchurl
        self.path_to_chromedriver = './chromedriver'
        # BUG FIX: a second webdriver.Chrome() used to immediately replace
        # the executable_path-based instance, leaking the first browser
        # process; create a single driver instead.
        self.browser = webdriver.Chrome()
        self.browser.get('https://in.pinterest.com/login/')
        # SECURITY NOTE: hard-coded credentials below; move to config/env.
        self.elem = self.browser.find_elements_by_name("username_or_email")
        self.elem[0].send_keys("*****@*****.**")
        self.elem = self.browser.find_elements_by_name("password")
        self.elem[0].send_keys("qawsedrf")
        self.elem = self.browser.find_elements_by_xpath(
            "/html/body/div[1]/div[1]/div[1]/div/div/div/form/div[4]/div/button"
        )
        self.elem[0].click()

        self.buton = '//*[@id="yui_3_5_1_1_1440135195051_1805"]'

    def crawl(self, qry):
        """Search for the '+'-joined *qry* terms, scroll until no new
        images load, then dump upscaled image URLs + titles to a file."""
        def noImages(psource):
            # number of pin images in a page source (0 for None)
            if psource == None:
                return 0
            soup = BeautifulSoup(psource, 'lxml')
            imgs = soup.findAll('div', 'Image Module pinUiImage')
            return len(imgs)

        url = self.base_url % ('+'.join(qry))
        self.browser.get(url)
        time.sleep(1)
        pps = None
        cps = None
        # Keep scrolling while each scroll still yields at least as many
        # images as the previous snapshot (infinite-scroll loading).
        for i in range(1, 20):
            self.browser.execute_script("window.scrollTo(0, %d);" %
                                        (i * 10000))
            time.sleep(10)
            cps = self.browser.page_source
            if noImages(cps) < noImages(pps):
                break
            pps = cps

        pagesource = pps

        soup = BeautifulSoup(pagesource, 'lxml')
        imgs = soup.findAll('div', 'Image Module pinUiImage')
        extractedUrls = []
        for img in imgs:
            imgd = img.findAll('img')
            url = imgd[0]['src']
            title = imgd[0]['alt'].encode('ascii', 'ignore')
            # swap the 236px thumbnail for the 736px rendition
            extractedUrls.append(url.replace('236x', '736x') + '\t' + title)

        with open('_'.join(sys.argv[1:]) + '_Pinterest', 'w') as outfile:
            for x in extractedUrls:
                outfile.write(x + '\n')

    def stop(self):
        """Shut down the browser and the virtual display."""
        self.browser.quit()
        self.display.stop()
예제 #45
0
class Scrape:
    """Scrape the IEX TOPS quotes table into a timestamped CSV.

    Driver setup is platform-dependent: plain Chrome on OSX/Windows,
    Chrome behind a virtual X display (Xvfb) on Linux.
    """
    def __init__(self):

        self.site_url = 'https://www.iextrading.com/apps/tops/'
        self.csv_file = None  # set by scrape_soup() once the report is known
        if platform == 'darwin':  # OSX
            self.driver = webdriver.Chrome()
        elif platform == 'linux' or platform == 'linux2':  # headless
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()
            self.driver = webdriver.Chrome()
        else:  # windows
            self.driver = webdriver.Chrome('chromedriver.exe')

    def scrape_soup(self, r):
        """Load the site, poll up to 30s for real table data (report *r*,
        or the default 'top' view), then write it to a CSV file."""
        self.driver.get(self.site_url)

        if r:
            self._select_report(r)
            self.csv_file = "{}_{}.csv".format(
                time.strftime("%Y%m%d%H%M", time.localtime()), r)
        else:
            self.csv_file = "{}_{}.csv".format(
                time.strftime("%Y%m%d%H%M", time.localtime()), 'top')

        # Retry for up to 30s: the table initially contains placeholders.
        for _ in range(30):
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            table = soup.find("table", {"id": "quotesTable"})
            table_ = self._scrape_table(table)
            if table_:
                logger.info('Table scraped! ')
                break
            else:
                time.sleep(1)
        else:
            # for/else: loop exhausted without break -> force-scrape anyway
            logger.info(
                'Is the site live? Found placeholders on page. Scrape either way'
            )
            table_ = self._scrape_table(table, True)

        self._write_row([
            'No', 'Ticker', 'Mkt %', 'Shares', 'Bid Quantity', 'Bid Price',
            'Ask Price', 'Ask Quantity', 'Last Sale Price',
            'Last Sale Quantity'
        ])

        table_.pop(0)  # off header
        for wr in table_:
            self._write_row(wr)

    @staticmethod
    def _scrape_table(table, force=False):
        """Turn the quotes <table> into a list of row lists.

        Returns False when placeholder dashes are still present (site not
        ready), unless *force* is True.  "a x b" cells are split in two,
        trailing '%' and time suffixes are stripped.
        """
        rows = table.findAll("tr")

        tbl_data = []
        for row in rows:
            tds_ = row.find_all("td")
            row_data = []
            for ctr, td in enumerate(tds_):
                # '-' placeholders (but not '--:--:' clock stubs) mean the
                # live data has not loaded yet
                if td.text.strip().count(
                        '-') and not td.text.strip().count('--:--:'):
                    if not force:
                        logger.info('Table not ready for scraping')
                        return False
                # drop time
                if ctr in [1, 6]:
                    d = td.text.strip()[:-8]
                else:
                    d = td.text.strip()
                # split x
                if d.count(u'\xd7'):
                    dsplit = d.split(u'\xd7')
                    row_data.append(dsplit[0].strip())
                    row_data.append(dsplit[1].strip())
                elif d.count('%'):
                    row_data.append(d.rstrip('%'))
                else:
                    row_data.append(d)
            tbl_data.append(row_data)
            logger.info('Scraped row data: {}'.format(row_data))
        return tbl_data

    def _write_row(self, row):
        # NOTE(review): 'ab' (binary append) with csv.writer is the
        # Python 2 idiom; on Python 3 this raises TypeError — confirm
        # target interpreter.
        with open(self.csv_file, 'ab') as hlr:
            wrt = csv.writer(hlr,
                             delimiter=',',
                             quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
            wrt.writerow(row)
            logger.info('Add row: {}'.format(row))

    def tear_down(self):
        """Quit the webdriver if one was created."""
        if self.driver:
            self.driver.quit()

    def stdout_options(self):
        """Return the list of available report names from the site's
        report <select> element (logged as they are found)."""
        self.driver.get(self.site_url)

        try:
            o = []
            op_el = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "lists")))
            select = Select(op_el)
            opts_ = select.options
            for opt_ in opts_:
                option = opt_.get_attribute('value')
                logger.info('report: {}'.format(option))
                o.append(option)
            return o
        except TimeoutException:
            logger.error(
                'Is the page live? Timed out on the reports select element')

    def _select_report(self, value):
        """Pick report *value* in the site's report <select> element."""
        try:
            op_el = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "lists")))
            select = Select(op_el)
            select.select_by_value(value)
        except TimeoutException:
            logger.error(
                'Is the page live? Timed out on the reports select element')
예제 #46
0
class BaseCase(unittest.TestCase):
    '''
    A base test case that wraps methods for enhanced usage.
    You can also add your own methods here.
    '''

    def __init__(self, *args, **kwargs):
        super(BaseCase, self).__init__(*args, **kwargs)
        try:
            # Best-effort default driver; setUp() may replace it, and some
            # runs (e.g. without selenium) construct the case with no driver.
            self.driver = WebDriver()
        except Exception:
            pass
        self.environment = None

    def open(self, url):
        """ Navigate to *url*, optionally waiting for the page's ready
            state, then pause if demo mode is active. """
        self.driver.get(url)
        if settings.WAIT_FOR_RSC_ON_PAGE_LOADS:
            self.wait_for_ready_state_complete()
        self._demo_mode_pause_if_active()

    def open_url(self, url):
        """ In case people are mixing up self.open() with open(),
            use this alternative. """
        self.open(url)

    def click(self, selector, by=By.CSS_SELECTOR,
              timeout=settings.SMALL_TIMEOUT):
        """ Wait for the element to be visible, then click it. """
        element = page_actions.wait_for_element_visible(
            self.driver, selector, by, timeout=timeout)
        self._demo_mode_scroll_if_active(selector, by)
        element.click()
        if settings.WAIT_FOR_RSC_ON_CLICKS:
            self.wait_for_ready_state_complete()
        self._demo_mode_pause_if_active()

    def click_chain(self, selectors_list, by=By.CSS_SELECTOR,
                    timeout=settings.SMALL_TIMEOUT, spacing=0):
        """ This method clicks on a list of elements in succession.
            'spacing' is the amount of time to wait between clicks. (sec) """
        for selector in selectors_list:
            self.click(selector, by=by, timeout=timeout)
            if spacing > 0:
                time.sleep(spacing)

    def click_link_text(self, link_text, timeout=settings.SMALL_TIMEOUT):
        """ Wait for the link text to be visible, then click it. """
        element = self.wait_for_link_text_visible(link_text, timeout=timeout)
        element.click()
        if settings.WAIT_FOR_RSC_ON_CLICKS:
            self.wait_for_ready_state_complete()
        self._demo_mode_pause_if_active()

    def add_text(self, selector, new_value, timeout=settings.SMALL_TIMEOUT):
        """ The more-reliable version of driver.send_keys()
            Similar to update_text(), but won't clear the text field first. """
        element = self.wait_for_element_visible(selector, timeout=timeout)
        element.send_keys(new_value)
        self._demo_mode_pause_if_active()

    def send_keys(self, selector, new_value, timeout=settings.SMALL_TIMEOUT):
        """ Same as add_text() -> more reliable, but less name confusion. """
        self.add_text(selector, new_value, timeout=timeout)

    def update_text_value(self, selector, new_value,
                          timeout=settings.SMALL_TIMEOUT, retry=False):
        """ This method updates an element's text value with a new value.
            @Params
            selector - the selector with the value to update
            new_value - the new value for setting the text field
            timeout - how long to wait for the selector to be visible
            retry - if True, use jquery if the selenium text update fails
        """
        element = self.wait_for_element_visible(selector, timeout=timeout)
        element.clear()
        self._demo_mode_pause_if_active(tiny=True)
        element.send_keys(new_value)
        if (retry and element.get_attribute('value') != new_value and (
                not new_value.endswith('\n'))):
            logging.debug('update_text_value is falling back to jQuery!')
            selector = self.jq_format(selector)
            self.set_value(selector, new_value)
        self._demo_mode_pause_if_active()

    def update_text(self, selector, new_value,
                    timeout=settings.SMALL_TIMEOUT, retry=False):
        """ The shorter version of update_text_value(), which
            clears existing text and adds new text into the text field.
            We want to keep the old version for backward compatibility. """
        self.update_text_value(selector, new_value,
                               timeout=timeout, retry=retry)

    def is_element_present(self, selector, by=By.CSS_SELECTOR):
        """ True if the element exists in the DOM (visible or not). """
        return page_actions.is_element_present(self.driver, selector, by)

    def is_element_visible(self, selector, by=By.CSS_SELECTOR):
        """ True if the element exists in the DOM and is visible. """
        return page_actions.is_element_visible(self.driver, selector, by)

    def is_link_text_visible(self, link_text):
        """ True if a link with the given text is visible. """
        return page_actions.is_element_visible(self.driver, link_text,
                                               by=By.LINK_TEXT)

    def is_text_visible(self, text, selector, by=By.CSS_SELECTOR):
        """ True if the given text is visible inside the element. """
        return page_actions.is_text_visible(self.driver, text, selector, by)

    def find_visible_elements(self, selector, by=By.CSS_SELECTOR):
        """ Return the list of currently-visible matching elements. """
        return page_actions.find_visible_elements(self.driver, selector, by)

    def execute_script(self, script):
        """ Run a JavaScript snippet in the browser and return its result. """
        return self.driver.execute_script(script)

    def set_window_size(self, width, height):
        """ Resize the browser window.
            BUGFIX: the demo-mode pause used to sit *after* the return
            statement and was unreachable; it now runs before returning. """
        result = self.driver.set_window_size(width, height)
        self._demo_mode_pause_if_active()
        return result

    def maximize_window(self):
        """ Maximize the browser window.
            BUGFIX: same unreachable-pause fix as set_window_size(). """
        result = self.driver.maximize_window()
        self._demo_mode_pause_if_active()
        return result

    def activate_jquery(self):
        """ If "jQuery is not defined", use this method to activate it for use.
            This happens because jQuery is not always defined on web sites. """
        try:
            # Let's first find out if jQuery is already defined.
            self.driver.execute_script("jQuery('html')")
            # Since that command worked, jQuery is defined. Let's return.
            return
        except Exception:
            # jQuery is not currently defined. Let's proceed by defining it.
            pass
        self.driver.execute_script(
            '''var script = document.createElement("script"); '''
            '''script.src = "https://ajax.googleapis.com/ajax/libs/jquery/1/'''
            '''jquery.min.js"; document.getElementsByTagName("head")[0]'''
            '''.appendChild(script);''')
        for x in range(30):  # range (not py2-only xrange) for py3 compat
            # jQuery needs a small amount of time to activate. (At most 3s)
            try:
                self.driver.execute_script("jQuery('html')")
                return
            except Exception:
                time.sleep(0.1)
        # Since jQuery still isn't activating, give up and raise an exception
        raise Exception("Exception: WebDriver could not activate jQuery!")

    def scroll_to(self, selector):
        """ Scroll the page until the element is in view (uses jQuery). """
        self.wait_for_element_visible(selector, timeout=settings.SMALL_TIMEOUT)
        scroll_script = "jQuery('%s')[0].scrollIntoView()" % selector
        try:
            self.driver.execute_script(scroll_script)
        except Exception:
            # The likely reason this fails is because: "jQuery is not defined"
            self.activate_jquery()  # It's a good thing we can define it here
            self.driver.execute_script(scroll_script)
        self._demo_mode_pause_if_active(tiny=True)

    def scroll_click(self, selector):
        """ Scroll to the element, then click it. """
        self.scroll_to(selector)
        self.click(selector)

    def jquery_click(self, selector):
        """ Click the element via jQuery instead of a native click. """
        self.scroll_to(selector)
        self.driver.execute_script("jQuery('%s').click()" % selector)
        self._demo_mode_pause_if_active()

    def jq_format(self, code):
        """ Escape a selector so it is safe to embed in a jQuery call. """
        return page_utils.jq_format(code)

    def set_value(self, selector, value):
        """ Set the element's value attribute via jQuery. """
        self.scroll_to(selector)
        val = json.dumps(value)
        self.driver.execute_script("jQuery('%s').val(%s)" % (selector, val))
        self._demo_mode_pause_if_active()

    def jquery_update_text_value(self, selector, new_value,
                                 timeout=settings.SMALL_TIMEOUT):
        """ Update a text field via jQuery; a trailing newline is sent
            through selenium so form submission still triggers. """
        element = self.wait_for_element_visible(selector, timeout=timeout)
        self.scroll_to(selector)
        self.driver.execute_script("""jQuery('%s').val('%s')"""
                                   % (selector, self.jq_format(new_value)))
        if new_value.endswith('\n'):
            element.send_keys('\n')
        self._demo_mode_pause_if_active()

    def jquery_update_text(self, selector, new_value,
                           timeout=settings.SMALL_TIMEOUT):
        """ Shorter alias of jquery_update_text_value(). """
        self.jquery_update_text_value(selector, new_value, timeout=timeout)

    def hover_on_element(self, selector):
        """ Scroll to the element and hover the mouse over it. """
        self.wait_for_element_visible(selector, timeout=settings.SMALL_TIMEOUT)
        self.scroll_to(selector)
        time.sleep(0.05)  # Settle down from scrolling before hovering
        return page_actions.hover_on_element(self.driver, selector)

    def hover_and_click(self, hover_selector, click_selector,
                        click_by=By.CSS_SELECTOR,
                        timeout=settings.SMALL_TIMEOUT):
        """ Hover over one element, then click another (e.g. menus). """
        self.wait_for_element_visible(hover_selector, timeout=timeout)
        self.scroll_to(hover_selector)
        # Settle down from the scrolling before hovering
        element = page_actions.hover_and_click(
            self.driver, hover_selector, click_selector, click_by, timeout)
        self._demo_mode_pause_if_active()
        return element

    def wait_for_element_present(self, selector, by=By.CSS_SELECTOR,
                                 timeout=settings.LARGE_TIMEOUT):
        """ Wait until the element exists in the DOM, then return it. """
        return page_actions.wait_for_element_present(
            self.driver, selector, by, timeout)

    def wait_for_element_visible(self, selector, by=By.CSS_SELECTOR,
                                 timeout=settings.LARGE_TIMEOUT):
        """ Wait until the element is visible, then return it. """
        return page_actions.wait_for_element_visible(
            self.driver, selector, by, timeout)

    def wait_for_text_visible(self, text, selector, by=By.CSS_SELECTOR,
                              timeout=settings.LARGE_TIMEOUT):
        """ Wait until the text is visible inside the element. """
        return page_actions.wait_for_text_visible(
            self.driver, text, selector, by, timeout)

    def wait_for_link_text_visible(self, link_text,
                                   timeout=settings.LARGE_TIMEOUT):
        """ Wait until a link with the given text is visible. """
        return self.wait_for_element_visible(
            link_text, by=By.LINK_TEXT, timeout=timeout)

    def wait_for_element_absent(self, selector, by=By.CSS_SELECTOR,
                                timeout=settings.LARGE_TIMEOUT):
        """ Wait until the element is removed from the DOM. """
        return page_actions.wait_for_element_absent(
            self.driver, selector, by, timeout)

    def wait_for_element_not_visible(self, selector, by=By.CSS_SELECTOR,
                                     timeout=settings.LARGE_TIMEOUT):
        """ Wait until the element is hidden or removed. """
        return page_actions.wait_for_element_not_visible(
            self.driver, selector, by, timeout)

    def wait_for_ready_state_complete(self, timeout=settings.EXTREME_TIMEOUT):
        """ Wait until document.readyState == "complete". """
        return page_actions.wait_for_ready_state_complete(self.driver, timeout)

    def wait_for_and_accept_alert(self, timeout=settings.LARGE_TIMEOUT):
        """ Wait for a JS alert, then accept it. """
        return page_actions.wait_for_and_accept_alert(self.driver, timeout)

    def wait_for_and_dismiss_alert(self, timeout=settings.LARGE_TIMEOUT):
        """ Wait for a JS alert, then dismiss it. """
        return page_actions.wait_for_and_dismiss_alert(self.driver, timeout)

    def wait_for_and_switch_to_alert(self, timeout=settings.LARGE_TIMEOUT):
        """ Wait for a JS alert, then switch driver focus to it. """
        return page_actions.wait_for_and_switch_to_alert(self.driver, timeout)

    def save_screenshot(self, name, folder=None):
        """ Save a screenshot of the current page to disk. """
        return page_actions.save_screenshot(self.driver, name, folder)

    def _demo_mode_pause_if_active(self, tiny=False):
        # Sleep between actions in demo mode so a human can follow along;
        # tiny=True uses a third of the normal wait.
        if self.demo_mode:
            if self.demo_sleep:
                wait_time = float(self.demo_sleep)
            else:
                wait_time = settings.DEFAULT_DEMO_MODE_TIMEOUT
            if not tiny:
                time.sleep(wait_time)
            else:
                time.sleep(wait_time/3.0)

    def _demo_mode_scroll_if_active(self, selector, by):
        # Only CSS selectors can be passed to the jQuery-based scroll_to().
        if self.demo_mode:
            if by == By.CSS_SELECTOR:
                self.scroll_to(selector)


# PyTest-Specific Code #

    def setUp(self):
        """
        pytest-specific code
        Be careful if a subclass of BaseCase overrides setUp()
        You'll need to add the following line to the subclass setUp() method:
        super(SubClassOfBaseCase, self).setUp()
        """
        self.is_pytest = None
        try:
            # This raises an exception if the test is not coming from pytest
            self.is_pytest = pytest.config.option.is_pytest
        except Exception:
            # Not using pytest (probably nosetests)
            self.is_pytest = False
        if self.is_pytest:
            self.with_selenium = pytest.config.option.with_selenium
            self.headless = pytest.config.option.headless
            self.headless_active = False
            self.with_testing_base = pytest.config.option.with_testing_base
            self.log_path = pytest.config.option.log_path
            self.browser = pytest.config.option.browser
            self.data = pytest.config.option.data
            self.demo_mode = pytest.config.option.demo_mode
            self.demo_sleep = pytest.config.option.demo_sleep
            if self.headless:
                self.display = Display(visible=0, size=(1200, 800))
                self.display.start()
                self.headless_active = True
            if self.with_selenium:
                self.driver = browser_launcher.get_driver(self.browser)

    def tearDown(self):
        """
        pytest-specific code
        Be careful if a subclass of BaseCase overrides setUp()
        You'll need to add the following line to the subclass's tearDown():
        super(SubClassOfBaseCase, self).tearDown()
        """
        if self.is_pytest:
            if self.with_selenium:
                # Save a screenshot if logging is on when an exception occurs
                if self.with_testing_base and (sys.exc_info()[1] is not None):
                    test_id = "%s.%s.%s" % (self.__class__.__module__,
                                            self.__class__.__name__,
                                            self._testMethodName)
                    test_logpath = self.log_path + "/" + test_id
                    if not os.path.exists(test_logpath):
                        os.makedirs(test_logpath)
                    # Handle screenshot logging
                    log_helper.log_screenshot(test_logpath, self.driver)
                    # Handle basic test info logging
                    log_helper.log_test_failure_data(
                        test_logpath, self.driver, self.browser)
                    # Handle page source logging
                    log_helper.log_page_source(test_logpath, self.driver)
                # Finally close the browser
                self.driver.quit()
            if self.headless:
                if self.headless_active:
                    self.display.stop()
예제 #47
0
"""pecli command line interface"""

import sys
# import datetime
from getpass import getpass
from tabulate import tabulate
import click
import inquirer

import botcore

if sys.platform == "linux" or sys.platform == "linux2":
    # headless executable on Ubuntu
    from pyvirtualdisplay import Display
    DISPLAY = Display(visible=0, size=(800, 600))
    DISPLAY.start()
elif sys.platform == "darwin":
    pass
elif sys.platform == "win32":
    raise "I don't give a shit to Windows system."


@click.group()
def cli():
    """Root command group for the pecli command line interface."""


@cli.command()
def register():
    """Register class"""
    # NOTE(review): the command body appears truncated in this snippet --
    # as written, invoking `register` does nothing.
예제 #48
0
def start_pyvirtualdisplay(rows=1400, columns=900):
    """Start a virtual X display and return it.

    BUGFIX: the original discarded the Display, so the caller could never
    call ``.stop()`` and the Xvfb process leaked; the started Display is
    now returned (backward compatible -- previous callers ignored the
    return value anyway).

    :param rows: first element of ``size`` -- presumably the width in
        pixels, despite the name; kept for keyword compatibility.
    :param columns: second element of ``size`` -- presumably the height.
    """
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(rows, columns))
    display.start()
    return display
예제 #49
0
    def parse(self, response):
        """Scrapy callback (Python 2): render self.product_url with
        selenium, take a screenshot, optionally crop it, and yield it
        base64-encoded in a ScreenshotItem.
        """
        socket.setdefaulttimeout(int(self.timeout))

        # temporary file for the output image
        t_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
        t_file.close()
        print('Created temporary image file: %s' % t_file.name)
        self.log('Created temporary image file: %s' % t_file.name)

        # Outside debug mode, render inside a virtual X display so the
        # browser can run headless.
        if not DEBUG_MODE:
            display = Display(visible=int(bool(DEBUG_MODE)),
                              size=(self.width, self.height))
            display.start()

        # we will use requesocks for checking response code
        r_session = requests.session()
        if self.timeout:
            self.timeout = int(self.timeout)
        r_session.timeout = self.timeout
        # Proxies activated again because of walmart bans
        if self.proxy:
            r_session.proxies = {"http": "{}://{}".format(self.proxy_type, self.proxy), \
                            "https": "{}://{}".format(self.proxy_type, self.proxy)}

        if self.user_agent:
            r_session.headers = {'User-Agent': self.user_agent}

        # check if the page returns code != 200
        # (self.code_200_required may arrive as a string flag like 'false')
        if self.code_200_required and str(
                self.code_200_required).lower() not in ('0', 'false', 'off'):
            page_code = r_session.get(self.product_url,
                                      verify=False).status_code
            if page_code != 200:
                self.log(
                    'Page returned code %s at %s' %
                    (page_code, self.product_url), ERROR)
                yield ScreenshotItem()  # return empty item
                if not DEBUG_MODE:
                    display.stop()
                return

        driver = self.init_driver()
        item = ScreenshotItem()

        if self.proxy:
            ip_via_proxy = URL2ScreenshotSpider._get_proxy_ip(driver)
            item['via_proxy'] = ip_via_proxy
            print 'IP via proxy:', ip_via_proxy
            self.log('IP via proxy: %s' % ip_via_proxy)

        # First attempt with the default driver; on any failure, retry once
        # with an alternative driver before giving up.
        try:
            self.prepare_driver(driver)
            self.make_screenshot(driver, t_file.name)
            self.log('Screenshot was made for file %s' % t_file.name)
        except Exception as e:
            self.log('Exception while getting response using selenium! %s' %
                     str(e))
            # lets try with another driver
            another_driver_name = self._choose_another_driver()
            try:
                if not DEBUG_MODE:
                    driver.quit()  # clean RAM
            except Exception as e:
                pass
            driver = self.init_driver(name=another_driver_name)
            self.prepare_driver(driver)
            self.make_screenshot(driver, t_file.name)
            self.log('Screenshot was made for file %s (2nd attempt)' %
                     t_file.name)
            try:
                if not DEBUG_MODE:
                    driver.quit()
            except:
                pass

        # crop the image if needed
        if self.crop_width and self.crop_height:
            self.crop_width = int(self.crop_width)
            self.crop_height = int(self.crop_height)
            from PIL import Image
            # size is width/height
            img = Image.open(t_file.name)
            box = (self.crop_left, self.crop_top,
                   self.crop_left + self.crop_width,
                   self.crop_top + self.crop_height)
            area = img.crop(box)
            area.save(t_file.name, 'png')
            self.log('Screenshot was cropped and saved to %s' % t_file.name)
            if self.image_copy:  # save a copy of the file if needed
                area.save(self.image_copy, 'png')

        with open(t_file.name, 'rb') as fh:
            img_content = fh.read()
            self.log('Screenshot content was read, size: %s bytes' %
                     len(img_content))

        if self.remove_img is True:
            os.unlink(t_file.name)  # remove old output file
            self.log('Screenshot file was removed: %s' % t_file.name)

        # yield the item
        item['url'] = response.url
        item['image'] = base64.b64encode(img_content)
        item['site_settings'] = getattr(self, '_site_settings_activated_for',
                                        None)
        item['creation_datetime'] = datetime.datetime.utcnow().isoformat()

        if not DEBUG_MODE:
            display.stop()

        self.log('Item image key length: %s' % len(item.get('image', '')))

        if img_content:
            yield item
 def setUp(self):
     """Start a virtual display and a Firefox driver for the test.

     BUGFIX: the Display was previously bound to a local variable and
     leaked (tearDown could never stop the Xvfb process); it is now
     stored on ``self`` alongside the driver.
     """
     self.display = Display(visible=0, size=(1366, 768))
     self.display.start()
     self.driver = webdriver.Firefox()
예제 #51
0
def init_display(visible, size):
    """Create and start a virtual display, returning it so the caller
    can stop it later."""
    virtual_display = Display(visible=visible, size=size)
    virtual_display.start()
    return virtual_display
예제 #52
0
class TflCrawler():
    def __init__(self):
        '''
            Set up the crawler state: the target site, an empty store for
            the discovered files, the file type to collect, and the folder
            this module lives in.
        '''
        self.__folder_dir = os.path.abspath(os.path.dirname(__file__))
        self._file_type = 'CSV file'
        self.__elements = {}  # csv files found on the site, keyed by row index
        self.__site = 'http://cycling.data.tfl.gov.uk/'

    def _start_crawling(self, driver_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'chromedriver'))):
        '''
            Start the crawling session: create an invisible browser display,
            with 800 by 600 dimension, and launch the Chrome driver.

        :param driver_dir: defines the location of the Chrome driver binary,
                specified as a path relative to this module's directory.
        '''
        try:
            print('start driver...')
            self._display = Display( visible= 0, size = (800,600)) # create a chrome display with 800*600 dimension
            self._display.start() # starts the browser
            self._driver = webdriver.Chrome(driver_dir) # set the location of web driver
        except Exception as e:
            # BUGFIX: the original called os.listdir(driver_dir), but
            # driver_dir is a *file* path, so listdir raised a second error
            # inside the handler. Report the real failure and list the
            # directory that contains the driver instead.
            parent_dir = os.path.dirname(driver_dir)
            print(f'[No driver was identified at {driver_dir}. Error: {e}. '
                  f'Files in {parent_dir}: {os.listdir(parent_dir)}]')

    def _stop_crawling(self):
        '''
            Shut down the session opened by _start_crawling: the virtual
            browser display is stopped first, then the Chrome driver quits.
        '''
        print('closing driver...')
        self._display.stop()
        self._driver.quit()

    def _get_site(self, url):
        '''
            Navigate the driver to the given address, give the client-side
            rendering a few seconds to finish, and return the page's <body>
            innerHTML (None on failure).
        :param url: the url of http://cycling.data.tfl.gov.uk/
        '''
        try:
            self._driver.get(url)
            sleep(5)  # the listing is rendered client-side; let it load
            body_html = self._driver.execute_script('return document.body.innerHTML')
            return body_html
        except Exception as e:
            print(f'[Unable to reach {self.__site}. Error : {str(e)}]')

    def _populate_dictionary(self, html):
        '''
            Walk the HTML produced by _get_site and register every CSV file
            listed on the site. Each matching row (a <tr data-level="3">)
            becomes an entry in the internal dictionary, keyed by its row
            index, holding the keys [name, url, date, size].
        :param html: the html structure that is created by the __get_site method
        '''
        try:
            print('get the content...')
            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find_all('tr', attrs={'data-level': '3'})

            for index, row in enumerate(rows):
                cells = row.find_all('td')

                # skip rows whose 4th cell is not the CSV file type
                if cells[3].string != self._file_type:
                    continue
                self.__elements[index] = {
                        'name' : cells[0].a.string,
                        'url' : cells[0].a['href'],
                        'date' : cells[1].string,
                        'size' : cells[2].string
                }
        except Exception as e:
            print(f'[Unable to parse the content of {self.__site}. Error: {str(e)}]')

    def parse(self):
        '''
            Run the full crawl: start the Chrome driver, fetch the TFL site
            (waiting for its HTML content to load), build the dictionary of
            the csv files found there, and finally shut everything down.
        '''
        driver_path = os.path.join(self.__folder_dir, 'chromedriver')
        self._start_crawling(driver_path)
        self._populate_dictionary(self._get_site(self.__site))
        self._stop_crawling()

    def retrieve_csv_files(self, DNS,rel_path):
        '''
            Iterates over the constructed dictionary and retrieves each csv file that is identified. The csv files are saved
            locally. Additionally, the corresponded relations of the DB are created
        :param path:  the relative path, which determines the location that the created csv file would be stored.
        '''

        def populate_stations_pairs_relation(df):
            def insert(l):
                if len(l) > 1:
                    # adds a colon at the end of the statement
                    l[-1] = no_space_join([l[-1][:-1], ';'])
                    # joins the insert statements
                    statement = no_space_join(l)

                    # insert the query
                    execute(statement)
                    conn.commit()

            # Drops duplicate routes, that have a start-end station which already exists
            dfrout= df[['StartStation Id','EndStation Id']].drop_duplicates()
            # drop OD routes that started and ended at the same station

            dfrout = dfrout.drop(dfrout[(dfrout['StartStation Id'] == dfrout['EndStation Id'])].index)

            # Variables to avoid overheading
            execute = cur.execute
            fetchall = cur.fetchall

           # corresponds to the stations that already exists in the DB AND have a location
            execute('SELECT station_id,st_asText(location) FROM webapp_stations WHERE location IS NOT NULL')
            # gets the stations that have a location
            stations = dict([(station[0], station[1].replace('MULTIPOINT', '')) for station in fetchall()])

            # stations that in do not have a location in the database, are removed from the data frame
            sids = [s for s in stations.keys()]
            dfrout = dfrout[dfrout['StartStation Id'].isin(sids) == dfrout['EndStation Id'].isin(sids)]

            # requests the pairs of stations that exist in the database
            execute('SELECT start_station_id,end_station_id FROM webapp_stations_pairs_routes')
            pairs_dict = dict([(pair,pair) for pair in fetchall()])

            # Variables that will used to construct the request url
            #plan = '&plan='
            #plan_options = ['fastest','balanced','quietest']
            plan = '&plan=balanced'
            default_url = 'https://www.cyclestreets.net/api/journey.json?key=112d0fc4c69f3951&itinerarypoints='
            nPairs = dfrout.shape[0]
            try:
                # Variables out of the for loop
                #l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,fastest_ref_dist,fastest_ref_time,fastest_ref_geom,balanced_ref_dist,balanced_ref_time,balanced_ref_geom,quietest_ref_dist,quietest_ref_time,quietest_ref_geom) VALUES ']
                l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,balanced_ref_dist,balanced_ref_time,balanced_ref_geom) VALUES ']
                comma_join = ','.join
                no_space_join = ''.join
                pipe_join = '|'.join

                for i_pair,pair in enumerate(dfrout.itertuples()):
                    # every 100 requests, stop the execution for 10 seconds (request policy)
                    if i_pair % 1000 == 0 and i_pair > 0:
                        sleep(5)
                        print(f'Pair : {i_pair+1} of {nPairs}')

                    start_station_id = int(pair[1])
                    end_station_id = int(pair[2])

                    # checks for OD pairs that do not exist in the DP (if the )
                    if (start_station_id,end_station_id) not in pairs_dict:
                        try:
                            start_coords = stations[start_station_id][1:-1].replace(' ',',')
                            end_coords = stations[end_station_id][1:-1].replace(' ',',')

                            #time,distance,coords = [],[],[]
                            #atime = time.append
                            #adistance = distance.append
                            #acoords = coords.append
                            #for option in plan_options:
                            # request the link from www.cyclestreet.com
                            response = requests.get(no_space_join([default_url, pipe_join([start_coords,end_coords]), plan])).json()['marker'][0]['@attributes']
                            # loads the json file into a python object(dictionary)
                            time = response['time']
                            distance = response['length']
                            coords = f"st_GeomFromText('LINESTRING({response['coordinates'].replace(' ','?').replace(',',' ').replace('?',',')})',4326)"
                            #response_json = loads(response)['marker'][0]['@attributes']
                            #atime(response['time'])
                            #adistance(response['length'])
                            #acoords(f"st_GeomFromText('LINESTRING({response['coordinates'].replace(' ','?').replace(',',' ').replace('?',',')})',4326)")

                        except (KeyError,AttributeError):
                            continue

                        # creates a statement of the current pair
                        #statement = no_space_join(['(',comma_join([str(start_station_id),str(end_station_id),distance[0],time[0],coords[0],distance[1],time[1],coords[1],distance[2],time[2],coords[2]]),'),'])
                        statement = no_space_join(['(',comma_join([str(start_station_id),str(end_station_id),distance,time,coords]),'),'])
                        l.append(statement)

                    if i_pair % 100 == 0:
                        insert(l)
                        l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,balanced_ref_dist,balanced_ref_time,balanced_ref_geom) VALUES ']
                        #l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,fastest_ref_dist,fastest_ref_time,fastest_ref_geom,balanced_ref_dist,balanced_ref_time,balanced_ref_geom,quietest_ref_dist,quietest_ref_time,quietest_ref_geom) VALUES ']

            except Exception as e:
                print('Error while data of webapp_stations_ref_routes were requested...')
            try:
                insert(l)
                return stations
            except:
                print('Error while the INSERT statement was executed for the webapp_stations_ref_routes relation')

        def insert_values_db(values, table_attributes, relation, null_stations):
            """Build one multi-row INSERT statement for `relation` and execute it.

            values           -- DataFrame holding the rows to insert
            table_attributes -- "table(col, ...)" fragment used in the INSERT
            relation         -- target relation: 'webapp_bikes',
                                'webapp_stations' or 'webapp_routes'
            null_stations    -- running list of station names with no location

            Returns the (possibly extended) null_stations list when the target
            relation is webapp_stations, otherwise None.
            """
            # Statement fragments plus bound methods, hoisted so the loop
            # below avoids repeated attribute lookups (per-row hot path).
            statement = [f"INSERT INTO {table_attributes} VALUES "]
            append = statement.append
            replace = str.replace

            if relation == 'webapp_stations':
                # Resolve the spatial location of every station through the
                # TfL BikePoint API (selenium-driven scraper).
                try:
                    stations_location, null_stations = get_station_location(driver_dir= os.path.join(self.__folder_dir,'chromedriver'), url ='https://api.tfl.gov.uk/swagger/ui/index.html?url=/swagger/docs/v1#!/BikePoint/BikePoint_Search' , stations =  values['StartStation Name'].values.tolist(), null_stations = null_stations)
                except Exception as e:
                    # NOTE: stations_location stays unbound on failure; the
                    # per-row try/except below then skips every station row.
                    print(f'Error while station locations were requested: {e}')
            elif relation == 'webapp_routes':
                # Map (start_station_id, end_station_id) -> pair primary key.
                stations = populate_stations_pairs_relation(values)
                cur.execute('SELECT id,start_station_id,end_station_id FROM webapp_stations_pairs_routes')
                pairs = dict([((pair[1], pair[2]), pair[0]) for pair in cur.fetchall()])

            # Build one "(v1,v2,...)," fragment per observation.
            for irow, row in enumerate(values.itertuples()):
                pk = row[1]
                try:
                    if relation == 'webapp_bikes':
                        append(replace(f"({pk}),", "\\'", "''"))
                    elif relation == 'webapp_stations':
                        try:
                            append(replace(f"({pk},'{row[2]}', ST_GeomFromText('MULTIPOINT({stations_location[irow][0]} {stations_location[irow][1]})',4326)),", "\\'", "''"))
                        except Exception:
                            # No location available for this station row.
                            continue
                    elif relation == 'webapp_routes':
                        # Keep only routes whose start and end stations differ
                        # and whose stations both have a stored location.
                        if (row[6] != row[7]) and (row[6] in stations) and (row[7] in stations):
                            pair_id = pairs[(row[6], row[7])]
                            append(replace(f"({pk},'{row[2]}','{row[3]}',{abs(row[4])},{row[5]},{pair_id}),", "\\'", "''"))
                except (ValueError, KeyError):
                    continue

            # Execute only when at least one row fragment was produced.
            if len(statement) > 1:
                # Swap the trailing comma of the last fragment for ';'.
                statement[-1] = ''.join([statement[-1][:-1] + ';'])
                statement = ''.join(statement)
                sql_execute(statement)
                conn.commit()  # commit the transaction
            if relation == 'webapp_stations':
                return null_stations

        def populate_relation(df, df_main_all_names, relation, pk , table_attributes, null_stations):
            """Insert into `relation` the rows of `df` that are not stored yet.

            df                -- csv DataFrame for one rental-data file
            df_main_all_names -- (pk column name, column name(s) of the relation)
            relation          -- target relation name
            pk                -- (tuple index, db column name) of the primary key
            table_attributes  -- "table(col, ...)" fragment for the INSERT
            null_stations     -- running list of stations without a location

            Returns the updated null_stations list for webapp_stations,
            otherwise None.
            """
            def process_df(df, df_main_all_names,relation):
                # Extract the sub-dataframe that defines the relation.  For
                # stations, stack start- and end-station columns so both ends
                # of every trip are captured, then de-duplicate on the pk.
                if relation == 'webapp_stations':
                    start_stations_df = df[df_main_all_names[1]].dropna()
                    scol = start_stations_df.columns
                    end_stations_df = df[['EndStation Id','EndStation Name']].dropna()
                    end_stations_df.columns = [scol[0],scol[1]]
                    ndf = pd.concat([start_stations_df,end_stations_df], axis= 0).drop_duplicates([df_main_all_names[0]])
                else:
                    # webapp_routes / webapp_bikes: drop duplicate primary keys.
                    # NOTE: relies on `dataframe` (= pd.DataFrame) being bound
                    # below in the enclosing scope before the first call.
                    ndf = dataframe(df[df_main_all_names[1]]).drop_duplicates([df_main_all_names[0]]).dropna()
                return ndf

            new_values = []
            append = new_values.append  # bound method hoisted out of the loop
            dataframe = pd.DataFrame

            # Retrieve the csv sub-dataframe that defines the relation; if the
            # file has unexpected headers, normalise them and retry once.
            try:
                ndf = process_df(df,df_main_all_names,relation)
            except (TypeError,IndexError,KeyError):
                df.columns = ['Rental Id','Duration','Bike Id','End Date','EndStation Id','EndStation Name','Start Date','StartStation Id', 'StartStation Name']
                ndf = process_df(df,df_main_all_names,relation)


            # Number of columns in the relation's sub-dataframe
            n = ndf.shape[1]
            # Fetch the primary keys already stored in the db ...
            sql_execute(f"SELECT {pk[1]} FROM {relation};")
            # ... and index them in a dict for O(1) membership tests
            stored_pks= dict([(e[pk[0]],e[pk[0]]) for e in cur.fetchall()])

            try:
                # Collect the pks present in the file but absent from the db
                for row in ndf.itertuples():
                    if (row[1] not in stored_pks):
                        append(row[1])

                if len(stored_pks) != 0:
                    if n == 1: # 1-dimensional relation (webapp_bikes)
                        if len(new_values) > 0:
                            insert_values_db(dataframe({f'{df_main_all_names[0]}' : new_values}), table_attributes, relation,null_stations)

                    else: # n-dimensional relations
                         if len(new_values) > 0:
                            # Re-attach the remaining columns of the new pks
                            new_values_joined = dataframe({ df_main_all_names[0]: new_values}).merge(ndf,how='left',left_on= df_main_all_names[0], right_on = df_main_all_names[0])
                            if relation == 'webapp_stations':
                                null_stations = insert_values_db(new_values_joined[df_main_all_names[1]], table_attributes, relation,null_stations)
                                return null_stations
                            else:
                                insert_values_db(new_values_joined[df_main_all_names[1]], table_attributes, relation,null_stations)
                else:
                    # Relation is empty: insert the whole sub-dataframe
                    if relation == 'webapp_stations':
                        null_stations = insert_values_db(dataframe(ndf[df_main_all_names[1]]), table_attributes, relation,null_stations)
                        return null_stations
                    else:
                        insert_values_db(dataframe(ndf[df_main_all_names[1]]), table_attributes, relation, null_stations)

            except psycopg2.InternalError:
                conn.rollback()
                # NOTE(review): this retry discards process_df's return value,
                # so nothing is re-inserted after the rollback — confirm intent.
                process_df(df, df_main_all_names, relation)
            except Exception as e:
                print(f'Line 327 - {e}')
        #-------------------------------------------------------------------------------------------------------------------------

        try:
            # Local aliases: hoist attribute lookups used repeatedly below
            join = os.path.join
            exists = os.path.exists
            size = os.path.getsize
            cd = self.__folder_dir # gives the directory of tflcrawler
            read_csv = pd.read_csv

            # Open the PostgreSQL connection (DNS is presumably a libpq
            # connection string supplied by the enclosing method — confirm)
            conn = psycopg2.connect(DNS)
            cur = conn.cursor() # initialise a cursor
            sql_execute = cur.execute # hoisted bound method, avoids dot lookups in loops
            null_stations = ['Bourne Street, Belgravia'] # running list of stations known to lack a location

            path = join(cd,rel_path) # directory where the csv files are cached
            print('starts to retrieve the csv files...')
            elements = self.__elements # file-name/url dictionary built by the crawler

            # Process each remote csv file described in the dictionary
            for value in tqdm(elements.values()):
                name = value['name'] # file name
                try:
                    csv_path = join(path, name) # full path of the csv file
                    print(csv_path)

                    # Download and ingest only when the file is missing or empty
                    if (not exists(csv_path)) or (exists(csv_path) and size(csv_path) == 0):

                        # request the csv file from the server
                        try:
                            response = requests.get(value['url'])
                        except (requests.ConnectionError, requests.ConnectTimeout, requests.HTTPError, requests.TooManyRedirects) as error:
                            # NOTE(review): `response` stays unbound when the
                            # request fails; the next line then raises a
                            # NameError, caught by the per-file handler below.
                            print(str(error))
                        # stream the body line by line
                        splitted_text = response.iter_lines(chunk_size= 512)

                        # write the file, stripping the b'...' wrapper that
                        # str() puts around each bytes line
                        with open(csv_path, 'w') as file:
                            for line in splitted_text:
                                file.write(str(line)[2:-1] + '\n')
                            file.close()

                        # reads the created csv file
                        df = read_csv(filepath_or_buffer= csv_path, delimiter=',' ,encoding= 'utf-8')
                        # Populates the Bikes entity
                        populate_relation(df = df, df_main_all_names= ('Bike Id', 'Bike Id'), relation= 'webapp_bikes' , pk = (0,'bike_id'), table_attributes= 'webapp_bikes(bike_id)', null_stations = null_stations)

                        # Populates the Stations entity; retried until it
                        # finishes or the file is deemed incompatible
                        condition = True # loop guard: True until populate_relation succeeds or the file is skipped
                        while(condition):
                            try:
                                # populate the db with the corresponding station values
                                null_stations = populate_relation(df = df, df_main_all_names= ('StartStation Id', ['StartStation Id', 'StartStation Name']) , relation ='webapp_stations', pk = (0,'station_id'), table_attributes= 'webapp_stations(station_id,station_name,location)', null_stations = null_stations)

                                # success: exit the retry loop
                                condition = False
                            except ValueError:  # file layout is incompatible: skip this file
                                condition = False
                            except Exception as e: # any other error (e.g. API/connection): retry
                                print('POPULATE_RELATION IS EXECUTED AGAIN...')
                                continue
                        # Populates the Routes entity
                        populate_relation(df = df, df_main_all_names=('Rental Id', ['Rental Id','Start Date','End Date', 'Duration','Bike Id','StartStation Id', 'EndStation Id']), relation= 'webapp_routes', pk =(0,'rental_id'), table_attributes='webapp_routes(rental_id,start_date,end_date,duration,bike_id,station_pairs_id)', null_stations = null_stations)

                except Exception as e:
                    # One bad file must not abort the whole run
                    print(f'[Error of file {name} - Inside the FOR loop]')
                    continue

        except Exception as e:
            # Close the cursor and database connection as well
            cur.close()
            conn.close()
            print(f'[ Error while the files are retrieved. Error: {str(e)}]')

    @property
    def elements(self):
        """Read-only view of the crawler's file-name/url dictionary."""
        current = self.__elements
        return current

    @property
    def site(self):
        """Read-only access to the site this crawler targets."""
        current = self.__site
        return current
예제 #53
0
class LinkedinPy:
    """Class to be instantiated to use the script"""
    def __init__(self,
                 username=None,
                 userid=None,
                 password=None,
                 nogui=False,
                 selenium_local_session=True,
                 use_firefox=False,
                 browser_profile_path=None,
                 page_delay=25,
                 show_logs=True,
                 headless_browser=False,
                 proxy_address=None,
                 proxy_chrome_extension=None,
                 proxy_port=None,
                 disable_image_load=False,
                 bypass_suspicious_attempt=False,
                 bypass_with_mobile=False,
                 multi_logs=True):
        """Configure a LinkedinPy session.

        Command-line arguments (parse_cli_args) take precedence over the
        corresponding keyword arguments.  Sets up the virtual display
        (when nogui), session counters, logging, the workspace database
        and — unless selenium_local_session is False — the browser.

        NOTE(review): `userid` is accepted but never read in this method.
        """

        # CLI flags override the constructor arguments
        cli_args = parse_cli_args()
        username = cli_args.username or username
        password = cli_args.password or password
        use_firefox = cli_args.use_firefox or use_firefox
        page_delay = cli_args.page_delay or page_delay
        headless_browser = cli_args.headless_browser or headless_browser
        proxy_address = cli_args.proxy_address or proxy_address
        proxy_port = cli_args.proxy_port or proxy_port
        disable_image_load = cli_args.disable_image_load or disable_image_load
        bypass_suspicious_attempt = (cli_args.bypass_suspicious_attempt
                                     or bypass_suspicious_attempt)
        bypass_with_mobile = cli_args.bypass_with_mobile or bypass_with_mobile
        # A workspace directory is mandatory
        if not get_workspace(Settings):
            raise SocialPyError(
                "Oh no! I don't have a workspace to work at :'(")

        self.nogui = nogui
        if nogui:
            # Virtual X display so the browser can run without a GUI
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()

        # Browser / proxy configuration
        self.browser = None
        self.headless_browser = headless_browser
        self.proxy_address = proxy_address
        self.proxy_port = proxy_port
        self.proxy_chrome_extension = proxy_chrome_extension
        self.selenium_local_session = selenium_local_session
        self.bypass_suspicious_attempt = bypass_suspicious_attempt
        self.bypass_with_mobile = bypass_with_mobile
        self.disable_image_load = disable_image_load

        # Credentials fall back to environment variables
        self.username = username or os.environ.get('LINKEDIN_USER')
        self.password = password or os.environ.get('LINKEDIN_PW')
        Settings.profile["name"] = self.username

        self.page_delay = page_delay
        self.switch_language = True
        self.use_firefox = use_firefox
        Settings.use_firefox = self.use_firefox
        self.browser_profile_path = browser_profile_path
        # Session counters / statistics
        self.liked_img = 0
        self.already_liked = 0
        self.liked_comments = 0
        self.commented = 0
        self.replied_to_comments = 0
        self.connected = 0
        self.already_connected = 0
        self.unconnected = 0
        self.connected_by = 0
        self.connecting_num = 0
        self.inap_img = 0
        self.not_valid_users = 0
        self.connect_times = 1
        self.start_time = time.time()

        # assign logger
        self.show_logs = show_logs
        Settings.show_logs = show_logs or None
        self.multi_logs = multi_logs
        self.logfolder = get_logfolder(self.username, self.multi_logs,
                                       Settings)
        self.logger = self.get_linkedinpy_logger(self.show_logs)

        get_database(Settings,
                     make=True)  # IMPORTANT: think twice before relocating

        if self.selenium_local_session is True:
            self.set_selenium_local_session(Settings)

    def get_linkedinpy_logger(self, show_logs):
        """
        Return the logger registered for this username, creating and
        configuring a new one only when none exists yet in Settings.
        """
        cached = Settings.loggers.get(self.username)
        if cached is not None:
            return cached

        # First request for this user: build and configure a fresh logger.
        base_logger = logging.getLogger(self.username)
        base_logger.setLevel(logging.DEBUG)

        fmt = logging.Formatter(
            '%(levelname)s [%(asctime)s] [%(username)s]  %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')

        # Everything goes to a per-session general.log file
        file_handler = logging.FileHandler('{}general.log'.format(
            self.logfolder))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(fmt)
        base_logger.addHandler(file_handler)

        # Optionally mirror the log stream to the console
        if show_logs is True:
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.DEBUG)
            console_handler.setFormatter(fmt)
            base_logger.addHandler(console_handler)

        # Adapter injects the username into every record
        adapter = logging.LoggerAdapter(base_logger, {"username": self.username})
        Settings.loggers[self.username] = adapter
        Settings.logger = adapter
        return adapter

    def set_selenium_local_session(self, Settings):
        """Create the local Selenium session and store it on self.browser.

        Raises SocialPyError when the helper reports an error message.
        """
        # Delegates to the module-level set_selenium_local_session helper.
        browser, err_msg = set_selenium_local_session(
            self.proxy_address,
            self.proxy_port,
            self.proxy_chrome_extension,
            self.headless_browser,
            self.use_firefox,
            self.browser_profile_path,
            # Replaces browser User Agent from "HeadlessChrome".
            self.disable_image_load,
            self.page_delay,
            self.logger,
            Settings)
        self.browser = browser
        if len(err_msg) > 0:
            raise SocialPyError(err_msg)

    def login(self):
        """Log the user in with username/password, report the outcome and
        return self so calls can be chained."""
        logged_in = login_user(self.browser, self.username, None, self.password,
                               self.logger, self.logfolder, self.switch_language,
                               self.bypass_suspicious_attempt,
                               self.bypass_with_mobile)

        if logged_in:
            highlight_print(Settings, self.username, "Logged in successfully!",
                            "login", "info", self.logger)
            # Best effort: persist account progress, never fail the login.
            try:
                save_account_progress(self.browser,
                                      "https://www.linkedin.com/",
                                      self.username, self.logger)
            except Exception:
                self.logger.warning(
                    'Unable to save account progress, skipping data update')
        else:
            highlight_print(Settings, self.username, "Wrong login data!",
                            "login", "critical", self.logger)

            # self.aborting = True

        return self

    def withdraw_old_invitations(self, skip_pages=10, sleep_delay=6):
        """Withdraw sent connection invitations that are months old.

        Walks the "sent invitations" pages starting after ``skip_pages``,
        ticks the checkbox of every invitation whose age text contains
        "month", then presses the page's Withdraw button.

        :param skip_pages: number of leading pages to skip.
        :param sleep_delay: base delay (seconds) used to randomize pauses.
        """
        page_no = skip_pages
        while page_no < 100:
            page_no = page_no + 1
            try:
                url = "https://www.linkedin.com/mynetwork/invitation-manager/sent/?page=" + str(
                    page_no)
                web_address_navigator(Settings, self.browser, url)
                print("Starting page:", page_no)
                # LinkedIn redirects to the unpaged URL (or renders no cards)
                # once we run past the last page.
                if self.browser.current_url == "https://www.linkedin.com/mynetwork/invitation-manager/sent/" or len(
                        self.browser.find_elements_by_css_selector(
                            "li.invitation-card div.pl5")) == 0:
                    print("============Last Page Reached==============")
                    break
                checked_in_page = 0
                for i in range(
                        0,
                        len(
                            self.browser.find_elements_by_css_selector(
                                "li.invitation-card div.pl5"))):
                    try:
                        # Re-query each iteration; interacting with the page
                        # can rerender the list and stale earlier references.
                        res_item = self.browser.find_elements_by_css_selector(
                            "li.invitation-card div.pl5")[i]
                        try:
                            link = res_item.find_element_by_css_selector(
                                "div > a")
                            profile_link = link.get_attribute("href")
                            user_name = profile_link.split('/')[4]
                            self.logger.info(
                                "user_name : {}".format(user_name))
                        except Exception as e:
                            print("Might be a stale profile", e)
                        # Renamed from `time` so the local no longer shadows
                        # the stdlib `time` module.
                        sent_time = res_item.find_element_by_css_selector(
                            "div > time")
                        self.logger.info("time : {}".format(sent_time.text))
                        check_button = res_item.find_element_by_css_selector(
                            "div > div:nth-child(1) > input")
                        check_status = check_button.get_attribute(
                            "data-artdeco-is-focused")
                        self.logger.info(
                            "check_status : {}".format(check_status))

                        # Keep the current card in view (~104px per card).
                        self.browser.execute_script("window.scrollTo(0, " +
                                                    str((i + 1) * 104) + ");")

                        # An age expressed in months counts as "old".
                        if "month" in sent_time.text:
                            (ActionChains(self.browser).move_to_element(
                                check_button).click().perform())
                            self.logger.info("check_button clicked")
                            checked_in_page = checked_in_page + 1
                            delay_random = random.randint(
                                ceil(sleep_delay * 0.42),
                                ceil(sleep_delay * 0.57))
                            sleep(delay_random)
                    except Exception as e:
                        self.logger.error(e)
                if checked_in_page > 0:
                    self.logger.info("Widraw to be pressed")
                    try:
                        self.browser.execute_script("window.scrollTo(0, 0);")
                        withdraw_button = self.browser.find_element_by_css_selector(
                            "ul > li.mn-list-toolbar__right-button > button")
                        self.logger.info("withdraw_button : {}".format(
                            withdraw_button.text))
                        if "Withdraw" in withdraw_button.text:
                            (ActionChains(self.browser).move_to_element(
                                withdraw_button).click().perform())
                            self.logger.info("withdraw_button clicked")
                            # Withdrawing shrinks the list, so revisit the
                            # same page number on the next loop iteration.
                            page_no = page_no - 1
                            delay_random = random.randint(
                                ceil(sleep_delay * 0.85),
                                ceil(sleep_delay * 1.14))
                            sleep(delay_random)
                    except Exception as e:
                        print(
                            "For some reason there is no withdraw_button inspite of checkings",
                            e)
                else:
                    self.logger.info("Nothing checked in this page")
            except Exception as e:
                self.logger.error(e)
            self.logger.info("============Next Page==============")

    def search_1stconnects_and_savetodb(self,
                                        query,
                                        city_code,
                                        school_code=None,
                                        past_company=None,
                                        random_start=True,
                                        max_pages=10,
                                        max_connects=25,
                                        sleep_delay=6):
        """Search LinkedIn for 1st-degree connections and save them to the db.

        Runs a people search restricted to the 1st-degree network
        (facetNetwork=["F"]), walks the result pages and records — via
        connect_restriction("write", ...) — every profile that shows a
        "Message" button.

        NOTE(review): random_start, max_pages, max_connects and sleep_delay
        are accepted but never referenced in this method — the loop always
        scans pages 1..100.
        """

        self.logger.info(
            "Searching for: query={}, city_code={}, school_code={}".format(
                query, city_code, school_code))
        # facetNetwork=%5B%22F%22%5D restricts results to 1st connections
        search_url = "https://www.linkedin.com/search/results/people/?&facetNetwork=%5B%22F%22%5D"
        if city_code:
            search_url = search_url + "&facetGeoRegion=" + city_code
        if school_code:
            search_url = search_url + "&facetSchool=" + school_code
        if past_company:
            search_url = search_url + "&facetPastCompany=" + past_company

        search_url = search_url + "&keywords=" + query
        search_url = search_url + "&origin=" + "FACETED_SEARCH"

        for page_no in range(1, 101):
            try:
                temp_search_url = search_url + "&page=" + str(page_no)
                web_address_navigator(Settings, self.browser, temp_search_url)
                self.logger.info("Starting page: {}".format(page_no))

                # Scroll in decreasing fractions so lazy-loaded results render
                for jc in range(2, 11):
                    sleep(1)
                    self.browser.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight/" +
                        str(jc) + ");")

                # No result cards: either past the last page or paywalled
                if len(
                        self.browser.find_elements_by_css_selector(
                            "div.search-result__wrapper")) == 0:
                    self.logger.info(
                        "============Last Page Reached or asking for Premium membership=============="
                    )
                    break
                for i in range(
                        0,
                        len(
                            self.browser.find_elements_by_css_selector(
                                "div.search-result__wrapper"))):
                    try:
                        res_item = self.browser.find_elements_by_css_selector(
                            "li.search-result div.search-entity div.search-result__wrapper"
                        )[i]
                        link = res_item.find_element_by_css_selector("div > a")
                        profile_link = link.get_attribute("href")
                        user_name = profile_link.split('/')[4]
                        self.logger.info("user_name : {}".format(user_name))
                        # NOTE(review): an XPath starting with // searches the
                        # whole document, not just res_item — confirm intent.
                        msg_button = res_item.find_element_by_xpath(
                            "//div[3]/div/div/button[text()='Message']")
                        print(msg_button.text, "present")
                        if msg_button.text == "Message":
                            # A "Message" button implies an existing connection
                            connect_restriction("write", user_name, None,
                                                self.logger)
                            self.logger.info(
                                "saved {} to db".format(user_name))
                    except Exception as e:
                        self.logger.error(e)
            except Exception as e:
                self.logger.error(e)
            self.logger.info("============Next Page==============")

    def test_page(self, search_url, page_no, css_selector_identifier):
        """Open search_url and report whether the page contains at least one
        element matching css_selector_identifier (i.e. has results)."""
        web_address_navigator(Settings, self.browser, search_url)
        self.logger.info("Testing page: {}".format(page_no))
        matches = self.browser.find_elements_by_css_selector(
            css_selector_identifier)
        return len(matches) > 0

    def search_and_connect(self,
                           query,
                           connection_relationship_code,
                           city_code,
                           school_code=None,
                           past_company=None,
                           random_start=True,
                           max_pages=10,
                           max_connects=25,
                           sleep_delay=6):
        """ search linkedin and connect from a given profile """

        if quota_supervisor(Settings, "connects") == "jump":
            return 0

        self.logger.info(
            "Searching for: query={}, connection_relationship_code={}, city_code={}, school_code={}"
            .format(query, connection_relationship_code, city_code,
                    school_code))
        connects = 0
        prev_connects = -1
        search_url = "https://www.linkedin.com/search/results/people/?"
        if connection_relationship_code:
            search_url = search_url + "&facetNetwork=" + connection_relationship_code
        if city_code:
            search_url = search_url + "&facetGeoRegion=" + city_code
        if school_code:
            search_url = search_url + "&facetSchool=" + school_code
        if past_company:
            search_url = search_url + "&facetPastCompany=" + past_company

        search_url = search_url + "&keywords=" + query
        search_url = search_url + "&origin=" + "FACETED_SEARCH"

        temp_search_url = search_url + "&page=1"
        print(temp_search_url)
        time.sleep(10)
        if self.test_page(
                search_url=temp_search_url,
                page_no=1,
                css_selector_identifier="div.search-result__wrapper") == False:
            self.logger.info(
                "============Definitely no Result, Next Query==============")
            return 0

        if random_start:
            trial = 0
            st = 5
            while True and trial < 5 and st > 1:
                st = random.randint(1, st - 1)
                temp_search_url = search_url + "&page=" + str(st)
                if self.test_page(temp_search_url, st,
                                  "div.search-result__wrapper"):
                    break
                trial = trial + 1
        else:
            st = 1

        for page_no in list(range(st, st + max_pages)):

            if prev_connects == connects:
                self.logger.info(
                    "============Limits might have exceeded or all Invites pending from this page(let's exit either case)=============="
                )
                break
            else:
                prev_connects = connects

            try:
                temp_search_url = search_url + "&page=" + str(page_no)
                if page_no > st and st > 1:
                    web_address_navigator(Settings, self.browser,
                                          temp_search_url)
                self.logger.info("Starting page: {}".format(page_no))

                for jc in range(2, 11):
                    sleep(1)
                    self.browser.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight/" +
                        str(jc) + "-100);")

                if len(
                        self.browser.find_elements_by_css_selector(
                            "div.search-result__wrapper")) == 0:
                    self.logger.info(
                        "============Last Page Reached or asking for Premium membership=============="
                    )
                    break
                for i in range(
                        0,
                        len(
                            self.browser.find_elements_by_css_selector(
                                "div.search-result__wrapper"))):
                    try:
                        res_item = self.browser.find_elements_by_css_selector(
                            "li.search-result div.search-entity div.search-result__wrapper"
                        )[i]  # div.search-result__actions div button")
                        # pp.pprint(res_item.get_attribute('innerHTML'))
                        link = res_item.find_element_by_css_selector("div > a")
                        profile_link = link.get_attribute("href")
                        self.logger.info("Profile : {}".format(profile_link))
                        user_name = profile_link.split('/')[4]
                        # self.logger.info("user_name : {}".format(user_name))
                        name = res_item.find_element_by_css_selector(
                            "h3 > span > span > span")  #//span/span/span[1]")
                        self.logger.info("Name : {}".format(name.text))

                        if connect_restriction("read", user_name,
                                               self.connect_times,
                                               self.logger):
                            self.logger.info("already connected")
                            continue

                        try:
                            connect_button = res_item.find_element_by_xpath(
                                "//div[3]/div/button[text()='Connect']")
                            self.logger.info(
                                "Connect button found, connecting...")
                            self.browser.execute_script(
                                "var evt = document.createEvent('MouseEvents');"
                                +
                                "evt.initMouseEvent('click',true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0,null);"
                                + "arguments[0].dispatchEvent(evt);",
                                res_item.find_element_by_xpath(
                                    '//div[3]/div/button[text()="Connect"]'))
                            self.logger.info("Clicked {}".format(
                                connect_button.text))
                            sleep(2)
                        except Exception:
                            invite_sent_button = res_item.find_element_by_xpath(
                                "//div[3]/div/button[text()='Invite Sent']")
                            self.logger.info("Already {}".format(
                                invite_sent_button.text))
                            continue

                        try:
                            modal = self.browser.find_element_by_css_selector(
                                "div.modal-wormhole-content > div")
                            if modal:
                                try:
                                    sendnow_or_done_button = modal.find_element_by_xpath(
                                        "//div[1]/div/section/div/div[2]/button[2]"
                                    )  #text()='Send now']")
                                    self.logger.info(
                                        sendnow_or_done_button.text)
                                    if not (sendnow_or_done_button.text
                                            == 'Done'
                                            or sendnow_or_done_button.text
                                            == 'Send now'):
                                        raise Exception(
                                            "Send Now or Done button not found"
                                        )
                                    if sendnow_or_done_button.is_enabled():
                                        (ActionChains(
                                            self.browser).move_to_element(
                                                sendnow_or_done_button).click(
                                                ).perform())
                                        self.logger.info("Clicked {}".format(
                                            sendnow_or_done_button.text))
                                        connects = connects + 1
                                        connect_restriction(
                                            "write", user_name, None,
                                            self.logger)
                                        try:
                                            # update server calls
                                            update_activity(
                                                Settings, 'connects')
                                        except Exception as e:
                                            self.logger.error(e)
                                        sleep(2)
                                    else:
                                        try:
                                            #TODO: input("find correct close XPATH")
                                            close_button = modal.find_element_by_xpath(
                                                "//div[1]/div/section/div/header/button"
                                            )
                                            (ActionChains(
                                                self.browser).move_to_element(
                                                    close_button).click().
                                             perform())
                                            print(sendnow_or_done_button.text,
                                                  "disabled, clicked close")
                                            sleep(2)
                                        except Exception as e:
                                            print(
                                                "close_button not found, Failed with:",
                                                e)
                                except Exception as e:
                                    print(
                                        "sendnow_or_done_button not found, Failed with:",
                                        e)
                            else:
                                self.logger.info("Popup not found")
                        except Exception as e:
                            print("Popup not found, Failed with:", e)
                            try:
                                new_popup_buttons = self.browser.find_elements_by_css_selector(
                                    "#artdeco-modal-outlet div.artdeco-modal-overlay div.artdeco-modal div.artdeco-modal__actionbar button.artdeco-button"
                                )
                                gotit_button = new_popup_buttons[1]
                                (ActionChains(self.browser).move_to_element(
                                    gotit_button).click().perform())
                                print(gotit_button.text, " clicked")
                                sleep(2)
                            except Exception as e:
                                print("New Popup also not found, Failed with:",
                                      e)

                        self.logger.info(
                            "Connects sent in this iteration: {}".format(
                                connects))
                        delay_random = random.randint(ceil(sleep_delay * 0.85),
                                                      ceil(sleep_delay * 1.14))
                        sleep(delay_random)
                        if connects >= max_connects:
                            self.logger.info(
                                "max_connects({}) for this iteration reached , Returning..."
                                .format(max_connects))
                            return
                    except Exception as e:
                        self.logger.error(e)
            except Exception as e:
                self.logger.error(e)
            self.logger.info("============Next Page==============")
            return connects

    def endorse(self, profile_link, sleep_delay):
        """Open `profile_link` and endorse the person's first listed skill.

        Scrolls the profile in ten increments so LinkedIn's lazily-loaded
        "Skills & Endorsements" section renders, then clicks the first
        skill's "+" button by dispatching a synthesized MouseEvent
        (presumably because a plain .click() is unreliable on these list
        items -- TODO confirm).

        :param profile_link: absolute URL of the LinkedIn profile
        :param sleep_delay: base delay in seconds; the actual pause is a
            random value in roughly [0.85, 1.14] * sleep_delay
        """
        try:
            web_address_navigator(Settings, self.browser, profile_link)

            # scroll down the page in 10 steps so deferred sections load
            for jc in range(1, 10):
                sleep(1)
                self.browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight*" +
                    str(jc) + "/10);")

            skills_pane = self.browser.find_element_by_css_selector(
                "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section"
            )
            # first text line of the pane is its heading
            if (skills_pane.text.split('\n')[0] == 'Skills & Endorsements'):
                try:
                    first_skill_button_icon = self.browser.find_element_by_css_selector(
                        "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section > ol > li > div > div > div > button > li-icon"
                    )
                    # 'plus-icon' means not yet endorsed; anything else is
                    # treated as already endorsed / not endorsable
                    button_type = first_skill_button_icon.get_attribute("type")
                    if button_type == 'plus-icon':
                        first_skill_button = self.browser.find_element_by_css_selector(
                            "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section > ol > li > div > div > div > button"
                        )
                        # dispatch a synthetic click event on the button
                        self.browser.execute_script(
                            "var evt = document.createEvent('MouseEvents');" +
                            "evt.initMouseEvent('click',true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0,null);"
                            + "arguments[0].dispatchEvent(evt);",
                            first_skill_button)
                        first_skill_title = self.browser.find_element_by_css_selector(
                            "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section > ol > li > div > div > p > a > span"
                        )
                        print(first_skill_title.text, "clicked")
                        # randomized pause to look less bot-like
                        delay_random = random.randint(ceil(sleep_delay * 0.85),
                                                      ceil(sleep_delay * 1.14))
                        sleep(delay_random)
                    else:
                        self.logger.info(
                            'button_type already {}'.format(button_type))
                except Exception as e:
                    self.logger.error(e)
            else:
                self.logger.info('Skill & Endorsements pane not found')
        except Exception as e:
            self.logger.error(e)

    def search_and_endorse(self,
                           query,
                           city_code,
                           school_code,
                           random_start=True,
                           max_pages=3,
                           max_endorsements=25,
                           sleep_delay=6):
        """ search linkedin and endose few first connections

        Searches 1st-degree connections matching `query` (optionally
        filtered by LinkedIn facet codes) and endorses the first listed
        skill of each profile found, via self.endorse().

        :param query: keyword string for the people search
        :param city_code: LinkedIn `facetGeoRegion` code, or falsy to skip
        :param school_code: LinkedIn `facetSchool` code, or falsy to skip
        :param random_start: probe a random results page (1-3) to start from
        :param max_pages: number of result pages to walk
        :param max_endorsements: stop after this many endorsements
        :param sleep_delay: base delay (seconds) passed to endorse()
        """

        if quota_supervisor(Settings, "connects") == "jump":
            return  #False, "jumped"

        print("Searching for: ", query, city_code, school_code)
        search_url = "https://www.linkedin.com/search/results/people/?"
        if city_code:
            search_url = search_url + "&facetGeoRegion=" + city_code
        if school_code:
            search_url = search_url + "&facetSchool=" + school_code

        # restrict results to 1st-degree connections ("F" network facet)
        search_url = search_url + "&facetNetwork=%5B%22F%22%5D"
        search_url = search_url + "&keywords=" + query
        search_url = search_url + "&origin=" + "FACETED_SEARCH"

        if random_start:
            # probe up to 3 random pages until one actually has results
            trial = 0
            while trial < 3:
                st = random.randint(1, 3)
                temp_search_url = search_url + "&page=" + str(st)
                web_address_navigator(Settings, self.browser, temp_search_url)
                # BUGFIX: format string previously had no {} placeholder,
                # so the probed page number was never logged
                self.logger.info("Testing page: {}".format(st))
                result_items = self.browser.find_elements_by_css_selector(
                    "div.search-result__wrapper")
                if len(result_items) > 0:
                    break
                trial = trial + 1
        else:
            st = 1

        connects = 0
        # BUGFIX: previously iterated range(st, st + 1), silently ignoring
        # the max_pages parameter; now walks max_pages pages, matching the
        # paging behavior of search_and_apply
        for page_no in list(range(st, st + max_pages)):
            collected_profile_links = []
            try:
                temp_search_url = search_url + "&page=" + str(page_no)
                # NOTE(review): when st == 1 this never navigates and seems
                # to rely on the random-start probe having loaded a page --
                # confirm before using random_start=False
                if page_no > st and st > 1:
                    web_address_navigator(Settings, self.browser,
                                          temp_search_url)
                self.logger.info("Starting page: {}".format(page_no))

                # incremental scroll so lazy-loaded results render
                for jc in range(2, 11):
                    sleep(1)
                    self.browser.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight/" +
                        str(jc) + "-100);")

                result_items = self.browser.find_elements_by_css_selector(
                    "div.search-result__wrapper")

                for result_item in result_items:
                    try:
                        link = result_item.find_element_by_css_selector(
                            "div > a")
                        self.logger.info("Profile : {}".format(
                            link.get_attribute("href")))
                        collected_profile_links.append(
                            link.get_attribute("href"))
                        name = result_item.find_element_by_css_selector(
                            "h3 > span > span > span")
                        self.logger.info("Name : {}".format(name.text))
                    except Exception as e:
                        self.logger.error(e)
            except Exception as e:
                self.logger.error(e)

            for collected_profile_link in collected_profile_links:
                self.endorse(collected_profile_link, sleep_delay=sleep_delay)
                connects = connects + 1
                if connects >= max_endorsements:
                    self.logger.info(
                        "max_endorsements({}) for this iteration reached , Returning..."
                        .format(max_endorsements))
                    return

            self.logger.info("============Next Page==============")

    def dump_connect_restriction(self, profile_name, logger, logfolder):
        """ Dump connect restriction data to a local human-readable JSON

        Reads every connectRestriction row for this profile from the
        SQLite DB and merges it into `{logfolder}connectRestriction.json`
        under the `profile_name` key, preserving other profiles' entries.
        Errors are logged, never raised.
        """
        # BUGFIX: pre-bind conn so the `finally` below cannot raise a
        # NameError when get_database()/connect() fails before assignment
        conn = None
        try:
            # get a DB and start a connection
            db, id = get_database(Settings)
            conn = sqlite3.connect(db)

            with conn:
                conn.row_factory = sqlite3.Row
                cur = conn.cursor()

                cur.execute(
                    "SELECT * FROM connectRestriction WHERE profile_id=:var",
                    {"var": id})
                data = cur.fetchall()

            if data:
                # get the existing data
                filename = "{}connectRestriction.json".format(logfolder)
                if os.path.isfile(filename):
                    with open(filename) as connectResFile:
                        current_data = json.load(connectResFile)
                else:
                    current_data = {}

                # pack the new data
                # (presumably row = (profile_id, username, times) -- confirm
                # against the connectRestriction table schema)
                connect_data = {
                    user_data[1]: user_data[2]
                    for user_data in data or []
                }
                current_data[profile_name] = connect_data

                # dump the fresh connect data to a local human readable JSON
                with open(filename, 'w') as connectResFile:
                    json.dump(current_data, connectResFile)

        except Exception as exc:
            logger.error(
                "Pow! Error occurred while dumping connect restriction data to a "
                "local JSON:\n\t{}".format(str(exc).encode("utf-8")))

        finally:
            if conn:
                # close the open connection
                conn.close()

    def end(self):
        """Closes the current session

        Teardown order: quit the browser, stop the virtual display (when
        running headless), persist connect-restriction data and the
        connected counter, then print a final live report.  Runs inside
        interruption_handler so a Ctrl-C cannot abort cleanup halfway.
        """

        # IS_RUNNING = False
        close_browser(self.browser, False, self.logger)

        with interruption_handler():
            # close virtual display
            if self.nogui:
                self.display.stop()

            # write useful information
            self.dump_connect_restriction(self.username, self.logger,
                                          self.logfolder)
            # dump_record_activity(self.username,
            #                      self.logger,
            #                      self.logfolder,
            #                      Settings)

            # persist the sessional connect counter for later inspection
            with open('{}connected.txt'.format(self.logfolder), 'w') \
                    as connectFile:
                connectFile.write(str(self.connected))

            # output live stats before leaving
            self.live_report()

            message = "Session ended!"
            highlight_print(Settings, self.username, message, "end", "info",
                            self.logger)
            print("\n\n")

    def set_quota_supervisor(self,
                             Settings,
                             enabled=False,
                             sleep_after=[],
                             sleepyhead=False,
                             stochastic_flow=False,
                             peak_likes=(None, None),
                             notify_me=False,
                             peak_comments=(None, None),
                             peak_connects=(None, None),
                             peak_unconnects=(None, None),
                             peak_server_calls=(None, None)):
        """
         Sets aside QS configuration ANY time in a session

        Each peak_* argument is an (hourly, daily) tuple whose entries are
        non-negative ints or None (meaning "no limit").  If any peak is
        malformed, QS is turned off for the rest of the session.
        NOTE: sleep_after keeps its historical mutable default ([]); it is
        never mutated here, only rebound, so the shared default is safe.
        """
        # take a reference of the global configuration
        configuration = Settings.QS_config

        # strong type checking on peaks entered
        peak_values_combined = [
            peak_likes, peak_comments, peak_connects, peak_unconnects,
            peak_server_calls
        ]
        peaks_are_tuple = all(
            type(item) is tuple for item in peak_values_combined)

        # BUGFIX: pre-declare the flags so the eligibility test below can
        # never hit a NameError (previously only safe via accidental
        # short-circuiting of the `and` chain)
        peaks_are_provided = peaks_are_valid = peaks_are_good = False

        if peaks_are_tuple:
            peak_values_merged = [
                i for sub in peak_values_combined for i in sub
            ]
            integers_filtered = filter(lambda e: isinstance(e, int),
                                       peak_values_merged)

            # every peak has exactly (hourly, daily) ...
            peaks_are_provided = all(
                len(item) == 2 for item in peak_values_combined)
            # ... each entry is a plain int or None (bools rejected) ...
            peaks_are_valid = all(
                type(item) is int or type(item) is type(None)
                for item in peak_values_merged)
            # ... and no negative limits
            peaks_are_good = all(item >= 0 for item in integers_filtered)

        # set QS if peak values are eligible
        if (peaks_are_tuple and peaks_are_provided and peaks_are_valid
                and peaks_are_good):

            peaks = {
                "likes": {
                    "hourly": peak_likes[0],
                    "daily": peak_likes[1]
                },
                "comments": {
                    "hourly": peak_comments[0],
                    "daily": peak_comments[1]
                },
                "connects": {
                    "hourly": peak_connects[0],
                    "daily": peak_connects[1]
                },
                "unconnects": {
                    "hourly": peak_unconnects[0],
                    "daily": peak_unconnects[1]
                },
                "server_calls": {
                    "hourly": peak_server_calls[0],
                    "daily": peak_server_calls[1]
                }
            }

            if not isinstance(sleep_after, list):
                sleep_after = [sleep_after]

            rt = time.time()
            latesttime = {"hourly": rt, "daily": rt}
            orig_peaks = deepcopy(peaks)  # original peaks always remain static
            stochasticity = {
                "enabled": stochastic_flow,
                "latesttime": latesttime,
                "original_peaks": orig_peaks
            }

            if (platform.startswith("win32") and python_version() < "2.7.15"):
                # UPDATE ME: remove this block once plyer is
                # verified to work on [very] old versions of Python 2
                notify_me = False

            # update QS configuration with the fresh settings
            configuration.update({
                "state": enabled,
                "sleep_after": sleep_after,
                "sleepyhead": sleepyhead,
                "stochasticity": stochasticity,
                "notify": notify_me,
                "peaks": peaks
            })

        else:
            # turn off QS for the rest of the session
            # since peak values are ineligible
            # BUGFIX: previously set state="False" -- a truthy *string* --
            # so any truthiness check on the state would read QS as ON
            configuration.update(state=False)

            # user should be warned only if has had QS turned on
            if enabled is True:
                self.logger.warning("Quota Supervisor: peak rates are misfit! "
                                    "Please use supported formats."
                                    "\t~disabled QS")

    def live_report(self):
        """ Report live sessional statistics """

        print('')

        # counters that decide whether there is anything worth reporting
        session_counters = [
            self.liked_img, self.already_liked, self.commented, self.connected,
            self.already_connected, self.unconnected, self.inap_img,
            self.not_valid_users
        ]

        owner_relationship_info = ''
        if self.connecting_num and self.connected_by:
            owner_relationship_info = (
                "On session start was connectING {} users"
                " & had {} connectERS".format(self.connecting_num,
                                              self.connected_by))

        # human-friendly duration: seconds < 1 min < hours
        elapsed = self.run_time()
        if elapsed < 60:
            run_time_info = "{} seconds".format(elapsed)
        elif elapsed < 3600:
            run_time_info = "{} minutes".format(
                truncate_float(elapsed / 60, 2))
        else:
            run_time_info = "{} hours".format(
                truncate_float(elapsed / 60 / 60, 2))
        run_time_msg = "[Session lasted {}]".format(run_time_info)

        if any(session_counters):
            self.logger.info(
                "Sessional Live Report:\n"
                "\t|> LIKED {} images  |  ALREADY LIKED: {}\n"
                "\t|> COMMENTED on {} images\n"
                "\t|> connected {} users  |  ALREADY connected: {}\n"
                "\t|> UNconnected {} users\n"
                "\t|> LIKED {} comments\n"
                "\t|> REPLIED to {} comments\n"
                "\t|> INAPPROPRIATE images: {}\n"
                "\t|> NOT VALID users: {}\n"
                "\n{}\n{}".format(self.liked_img, self.already_liked,
                                  self.commented, self.connected,
                                  self.already_connected, self.unconnected,
                                  self.liked_comments,
                                  self.replied_to_comments, self.inap_img,
                                  self.not_valid_users,
                                  owner_relationship_info, run_time_msg))
        else:
            self.logger.info("Sessional Live Report:\n"
                             "\t|> No any statistics to show\n"
                             "\n{}\n{}".format(owner_relationship_info,
                                               run_time_msg))

    def run_time(self):
        """ Get the time session lasted in seconds """

        # wall-clock seconds since session start, trimmed to 2 decimals
        elapsed = time.time() - self.start_time
        return truncate_float(elapsed, 2)

    def search_and_apply(self):
        """Interactively search LinkedIn jobs for a hard-coded title/location.

        NOTE(review): this definition is shadowed by the later
        `search_and_apply(self, job_title, ...)` defined further down the
        class, so it is effectively dead code -- consider removing or
        renaming it.  It fills the jobs search form with fixed values
        ("Python Developer" / San Jose), clicks search, then blocks on
        input() for manual inspection.
        """
        usualjobslink = "https://www.linkedin.com/jobs"
        web_address_navigator(Settings, self.browser, usualjobslink)

        job_title_XP = '//input[contains(@id,"jobs-search-box-keyword-id")]'
        txt_job_title = self.browser.find_element_by_xpath(job_title_XP)
        print('Entering Job Title')
        (ActionChains(self.browser).move_to_element(
            txt_job_title).click().send_keys("Python Developer").perform())

        job_location_XP = '//input[contains(@id,"jobs-search-box-location-id")]'
        txt_job_location = self.browser.find_element_by_xpath(job_location_XP)
        print('Entering Job Location')
        (ActionChains(
            self.browser).move_to_element(txt_job_location).click().send_keys(
                "San Jose, California, United States").perform())

        # update server calls for both 'click' and 'send_keys' actions
        for i in range(2):
            update_activity(Settings)

        sleep(1)
        print("Clicking Search Button")
        job_search_XP = '//button[contains(@class,"jobs-search-box__submit-button")]'
        btn_job_search = self.browser.find_element_by_xpath(job_search_XP)
        print(btn_job_search)
        (ActionChains(
            self.browser).move_to_element(btn_job_search).click().perform())

        # update server calls
        update_activity(Settings)

        sleep(10)
        input("Press Enter to continue...")

    def search_and_apply(self,
                         job_title,
                         job_location,
                         distance=50,
                         random_start=True,
                         max_pages=20,
                         max_connects=25,
                         sleep_delay=6):
        """Search LinkedIn job listings and page through the results.

        Currently only walks result pages and prints result indices --
        the actual "apply" step is not implemented, and max_connects /
        sleep_delay are accepted for interface parity but unused.

        :param job_title: keywords for the search
        :param job_location: location string for the search
        :param distance: search radius in miles
        :param random_start: probe random result pages to pick a start page
        :param max_pages: number of result pages to walk
        :return: 0 when the very first page has no results
        """

        self.logger.info(
            "Searching for: job_title={}, job_location={}, radius={}".format(
                job_title, job_location, distance))
        connects = 0  # TODO: never incremented; max_connects is unused so far
        # https://www.linkedin.com/jobs/search/?keywords=python%20developer&location=San%20Jose%2C%20California%2C%20United%20States&distance=50
        job_search_url = "https://www.linkedin.com/jobs/search/?"
        if job_title:
            job_search_url = job_search_url + "keywords=" + job_title
        if job_location:
            job_search_url = job_search_url + "&location=" + job_location
        if distance:
            job_search_url = job_search_url + "&distance=" + str(distance)

        temp_job_search_url = job_search_url + "&start=0"
        print(temp_job_search_url)
        time.sleep(10)
        if self.test_page(
                search_url=temp_job_search_url,
                page_no=1,
                css_selector_identifier="div.jobs-search-results ") == False:
            self.logger.info(
                "============Definitely no Result, Next Query==============")
            return 0
        if random_start:
            # probe random pages; LinkedIn jobs paginate via &start=N*25
            trial = 0
            st = 5
            while trial < 5 and st > 1:
                st = random.randint(1, st - 1)
                temp_job_search_url = job_search_url + "&start=" + str(st * 25)
                if self.test_page(temp_job_search_url, st,
                                  "div.jobs-search-results"):
                    break
                trial = trial + 1
        else:
            st = 1
        for page_no in list(range(st, st + max_pages)):
            try:
                # BUGFIX: the offset must be in items (25 per page) to match
                # the random-start probe above; previously the raw page
                # number was passed, which kept re-serving page-1 results
                temp_job_search_url = job_search_url + "&start=" + str(
                    page_no * 25)
                if page_no > st and st > 1:
                    web_address_navigator(Settings, self.browser,
                                          temp_job_search_url)
                self.logger.info("Starting page: {}".format(page_no))

                # incremental scroll so lazy-loaded results render
                for jc in range(2, 11):
                    sleep(1)
                    self.browser.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight/" +
                        str(jc) + "-100);")
                if len(
                        self.browser.find_elements_by_css_selector(
                            "div.jobs-search-results")) == 0:
                    self.logger.info(
                        "============Last Page Reached or asking for Premium membership=============="
                    )
                    break
                for i in range(
                        0,
                        len(
                            self.browser.find_elements_by_css_selector(
                                "div.jobs-search-results"))):
                    print(i)
            except Exception as e:
                self.logger.error(e)
        input("Press Enter to continue...")
# Example #54 ("예제" = example; scraped-sample separator)
# 0
class CraigslistBot:
    """Automate posting 'for sale by owner' listings on Craigslist.

    Drives a Selenium Firefox session (wrapped in a pyvirtualdisplay
    Display on non-Windows hosts) to log in, fill out the posting form,
    upload images, and — when Craigslist requires it — verify the post
    through a ProtonMail inbox.
    """

    @staticmethod
    def debug(inString):
        # Console logging with a " [BOT] - " prefix.
        # NOTE(review): formatting a bytes value with %s prints its repr
        # under Python 3 (" [BOT] - b'...'"); this reads like Python 2
        # code — confirm the target interpreter.
        print(" [BOT] - %s" % inString.encode('utf-8').strip())

    def __init__(self,
                 protonLogin="",
                 protonPassword="",
                 loginEmail="",
                 loginPass="",
                 contactNumber="",
                 contactName="",
                 postCode="",
                 listingsFile="",
                 waitTime=10,
                 waitTimeBetweenPosts=30):
        # Placeholder so __del__ can reference self.display even when no
        # virtual display is created (Windows branch below).
        self.display = ""

        if not os.name == 'nt':
            # NOTE(review): visible=1 keeps the Xvfb-backed display
            # visible; visible=0 is the usual fully-headless setting —
            # confirm which is intended.
            self.display = Display(visible=1, size=(1248, 1000))  # 800x600
            self.display.start()

        self.client = webdriver.Firefox()
        self.isLoggedIn = False

        # ProtonMail credentials, used to read verification emails.
        self.protonLogin = protonLogin
        self.protonPassword = protonPassword
        # Craigslist account credentials.
        self.loginEmail = loginEmail
        self.loginPass = loginPass
        # Contact details filled into each posting form.
        self.contactNumber = contactNumber
        self.contactName = contactName
        self.postCode = postCode
        self.listingsFile = listingsFile
        # Seconds to pause around slow page transitions / uploads.
        self.waitTime = waitTime
        self.waitTimeBetweenPosts = waitTimeBetweenPosts

        self.locationCode = "chi"  #nyc asks for more location data not implement yet s

    def __del__(self):
        # Tear down the virtual display (non-Windows only) and the browser
        # when the bot is garbage-collected.
        # NOTE(review): Python ignores __del__'s return value, so the
        # `return 0` below has no effect.
        if not os.name == 'nt':
            self.display.stop()

        self.client.quit()
        return 0

    def login(self, oneTimeLoginLink=""):
        """Log in to Craigslist; sets self.isLoggedIn on success.

        `oneTimeLoginLink` — optional one-time login URL (e.g. from a
        verification email) used instead of the normal login page.
        """
        self.debug("Logging in...")

        if oneTimeLoginLink == "":
            self.client.get("https://accounts.craigslist.org/login")
        else:
            self.client.get(oneTimeLoginLink)

        self.waitForId("inputEmailHandle")

        #self.debug("Inputing information to login screen")

        self.client.find_element_by_css_selector(
            "#inputEmailHandle").send_keys(self.loginEmail)

        self.client.find_element_by_css_selector("#inputPassword").send_keys(
            self.loginPass)

        self.client.find_element_by_id("login").click()

        # if need activation:
        # otl = self.validatePostInEmail()
        # self.login(otl)
        # return

        # The logged-in account page shows a '.tab' element; absence of it
        # means authentication failed.
        try:
            self.client.find_element_by_css_selector('.tab')
        except NoSuchElementException:
            self.debug("Not logged in")
            return

        self.debug("Successfully logged in!")

        self.isLoggedIn = True

    def createpost(self, listing):
        """Post one `listing` (expects .name, .description, .imagePathList).

        Walks the multi-step Craigslist posting flow, uploads images, and
        handles the email-verification detour when required.  Returns the
        new post's manage URL, or 0 when not logged in.
        """
        if not self.isLoggedIn:
            self.debug("ERROR: You're not logged in!")
            return 0

        #self.debug("Attempting to post this listing:")
        #self.debug(listing.tostring() + "\n")

        #self.debug("Navigating to post page")

        #self.debug("locationCode: " + self.locationCode)
        initialPostUrl = "https://post.craigslist.org/c/" + self.locationCode
        #self.debug("navigating to " + initialPostUrl)
        self.client.get(initialPostUrl)

        self.waitForCss("input[value='1']")

        self.client.find_element_by_css_selector("input[value='1']").click()

        # fso = for sale by owner
        # so  = service offered
        self.client.find_element_by_css_selector("input[value='fso']").click()
        time.sleep(self.waitTime)

        # 199 = computer parts
        # 7   = computers
        # 96  = electronics
        self.client.find_element_by_css_selector("input[value='96']").click()
        time.sleep(self.waitTime)
        """
        self.debug("Trying to fill in email")
        try:
            self.client.find_element_by_css_selector(
                '#FromEMail').send_keys(self.loginEmail)
        except NoSuchElementException:
            self.debug("Not avaliable")
        try:
            self.client.find_element_by_css_selector(
                '#FromEMail').send_keys(self.loginEmail)
        except NoSuchElementException:
            self.debug("Not avaliable")
        """

        #self.debug("Checking 'Okay to contact by phone'")
        self.waitForName("show_phone_ok")
        self.client.find_element_by_name("show_phone_ok").click()
        self.client.find_element_by_name("contact_phone_ok").click()

        #self.debug("Checking 'Okay to contact by text'")
        self.client.find_element_by_name("contact_text_ok").click()

        #self.debug("Filling in contact phone number")
        self.client.find_element_by_name("contact_phone").send_keys(
            self.contactNumber)

        #self.debug("Filling in contact name")
        self.client.find_element_by_name("contact_name").send_keys(
            self.contactName)

        #self.debug("Filling in post title")
        # Spintax lets the listing title/body vary between posts.
        spinName = spintax.spin(listing.name)
        self.client.find_element_by_name("PostingTitle").send_keys(spinName)

        #self.debug("Filling in zip code")
        self.client.find_element_by_id("postal_code").send_keys(self.postCode)

        #self.debug("Filling in post content")
        spinDescription = spintax.spin(listing.description)
        self.client.find_element_by_name("PostingBody").send_keys(
            spinDescription)

        #self.debug("Checking 'Okay to contact for other offers'")
        self.waitForName("contact_ok")
        self.client.find_element_by_name("contact_ok").click()

        # self.debug("Unchecking 'Want a map' if checked")
        # try:
        #    self.client.find_element_by_css_selector("#wantamap:checked")
        # except NoSuchElementException:
        #    self.debug("Not checked")
        # finally:
        #    self.client.find_element_by_css_selector("#wantamap:checked").click()
        # time.sleep(self.waitTime)

        #self.debug("Clicking continue")
        self.client.find_element_by_name("go").click()

        # if "editimage" in self.client.current_url:  # FIX tHIS
        #   self.debug("Clicking continue")
        #   self.client.find_element_by_css_selector('button.done').click()
        # else:
        #   self.debug(
        #      "Could not submit. Maybe a bad email address or phone number")

        #self.debug("Clicking publish")
        self.waitForClass("bigbutton")
        self.client.find_element_by_class_name('bigbutton').click()

        # determine if we need to switch to classic uploading
        time.sleep(self.waitTime)
        if len(self.client.find_elements_by_id('classic')) != 0:
            #self.debug("clicking use classic image uploader")
            self.waitForId("classic")
            time.sleep(self.waitTime)
            self.client.find_element_by_id('classic').click()
            time.sleep(self.waitTime
                       )  # must wait for classic to pop into the viewport

        #self.debug("uploading images")
        self.waitForName("file")
        for imagePath in listing.imagePathList:
            self.debug("Attempting to upload image: " + os.getcwd() + "/" +
                       imagePath)
            self.client.find_element_by_name("file").send_keys(os.getcwd() +
                                                               "/" + imagePath)
            time.sleep(self.waitTime)

        self.debug("Clicking done with images")
        self.waitForClass("bigbutton")
        self.client.find_element_by_class_name('bigbutton').click()

        self.debug("Click publish (again)")
        self.waitForName("go")
        self.client.find_element_by_name('go').click()

        # check if we need to verify the post
        self.debug("Check if the post needs verified")
        time.sleep(self.waitTime)
        htmlText = self.client.find_element_by_css_selector("body").text
        # self.debug(htmlText)
        if "FURTHER ACTION REQUIRED" in htmlText:
            # wait for the email to come through and then verify it
            self.debug("must verify post")
            time.sleep(45)
            self.validatePostInEmail()

        # First link on the confirmation page is the post's manage URL.
        return self.client.find_element_by_css_selector(
            "ul.ul").find_elements_by_css_selector("a")[0].get_attribute(
                "href")

    # region WaitFor methods
    # Crude polling waits: up to 30 attempts, 2 s apart (~60 s max).
    # They break early when found and fall through silently on timeout.

    def waitForName(self, name):
        """Poll until an element with name attribute `name` exists."""
        for i in range(0, 30):
            #self.debug("waiting for id \"" + name + "\"...")
            if len(self.client.find_elements_by_name(name)) != 0:
                break
            time.sleep(2)

    def waitForId(self, idName):
        """Poll until an element with id `idName` exists."""
        for i in range(0, 30):
            #self.debug("waiting for id \"" + idName + "\"...")
            if len(self.client.find_elements_by_id(idName)) != 0:
                break
            time.sleep(2)

    def waitForCss(self, css):
        """Poll until an element matching CSS selector `css` exists."""
        for i in range(0, 30):
            #self.debug("waiting for css selector \"" + css + "\"...")
            if len(self.client.find_elements_by_css_selector(css)) != 0:
                break
            time.sleep(2)

    def waitForClass(self, className):
        """Poll until an element with class `className` exists."""
        for i in range(0, 30):
            #self.debug("waiting for class \"" + className + "\"...")
            if len(self.client.find_elements_by_class_name(className)) != 0:
                break
            time.sleep(2)

    # endregion

    def validatePostInEmail(self):
        """Open ProtonMail, follow the Craigslist verification link found
        in the newest message of the bot's label folder, and return what
        should be the new post's link.
        """
        self.debug("NOW, WE VALIDATE!")
        self.client.get("https://mail.protonmail.com/login")

        self.waitForId("username")
        self.client.find_element_by_id("username").send_keys(self.protonLogin)
        self.client.find_element_by_id("password").send_keys(
            self.protonPassword)
        self.client.find_element_by_id("login_btn").click()

        # we're looking for the first link (our craigslistBot email folder) in the first "menuItem-label" list
        self.waitForClass("menuLabel-item")
        labelItem = self.client.find_elements_by_class_name(
            "menuLabel-item")[0]
        labelLink = labelItem.find_elements_by_css_selector(
            "a")[0].get_attribute('href')
        self.client.get(labelLink)

        # click the newest email
        self.waitForClass("conversation")
        self.client.find_elements_by_class_name("conversation")[0].click()

        # find the newest message in that email
        self.waitForClass("message")
        correctMessage = self.client.find_elements_by_class_name("message")[-1]

        # get the one time link, typically the last link in the list
        self.waitForCss("a")
        oneTimeLink = correctMessage.find_elements_by_css_selector(
            "a")[-1].get_attribute('href')

        # if the last link is a support page, select the second to last link which should be our verification link
        if oneTimeLink == "https://www.craigslist.org/about/scams?lang=en&cc=us":
            oneTimeLink = correctMessage.find_elements_by_css_selector(
                "a")[-2].get_attribute('href')

        # navigate to the verification link
        self.client.get(oneTimeLink)

        # get the new post link. This may be the incorrect link, look into this.
        # NOTE(review): labelItem was located on the ProtonMail page and is
        # likely a stale reference after the navigations above — confirm
        # this lookup actually returns the intended link.
        self.waitForCss("a")
        newPostLink = labelItem.find_elements_by_css_selector(
            "a")[0].get_attribute('href')

        time.sleep(2)

        return newPostLink
예제 #55
0
    def parse(self, response):
        """Scrapy callback that scrapes Public Storage unit prices.

        Ignores `response`: it starts its own virtual display and Firefox
        instance pointed at a hard-coded facility URL, then yields a
        date/name header item followed by one item per previously-unseen
        outdoor unit size.

        Written for Python 2 (bare `print` statement below).
        """
        display = Display(visible=0, size=(800, 600))
        display.start()

        url = 'https://www.publicstorage.com/missouri/self-storage-st-charles-mo/63303-self-storage/918?PID=PSLocalSearch&CID=1341&CHID=LL'
        driver = webdriver.Firefox()
        driver.get(url)

        #url2='http://www.a1lockerrental.com/self-storage/mo/st-louis/4427-meramec-bottom-rd-facility/unit-sizes-prices#/units?category=all'
        #driver2 = webdriver.Firefox()
        #driver2.get(url2)
        #html2 = driver.page_source
        #soup2 = BeautifulSoup(html2, 'html.parser')
        #soup.append(soup2)
        #print soup
        # `items`, `inside`, `outside` and `inside_units` are set up but
        # never read below — only `outside_units` filters the results.
        items = []
        inside = "Indoor"
        outside = "Outdoor"
        inside_units = ["5 x 5", "5 x 10"]
        outside_units = [
            "5' x 5'", "5' x 10'", "5' x 15'", "8' x 10'", "10' x 10'",
            "10' x 20'", "10' x 25'", "10' x 30'", "10' x 24'", "10' x 15'"
        ]

        #print soup.findAll('span',{"class":"sss-unit-size"})

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE(review): this sleep runs after page_source was already
        # captured, so it gives the page no extra render time — confirm
        # whether it was meant to precede the page_source read.
        time.sleep(3)
        # Parallel element lists: index n refers to the same unit row.
        sizeTagz = soup.findAll('div', {"class": "srp_label srp_font_14"})

        rateTagz = soup.findAll('div', {"class": "srp_label alt-price"})

        specialTagz2 = soup.findAll('div', {"class": "srp_res_clm srp_clm90"})
        specialTagz = soup.findAll('div', {"class": "srp_v-space_10"})
        typesTagz = soup.findAll(
            'ul',
            {"class": "srp_list"},
        )

        # Header item identifying the source and scrape date.
        yield {
            'date': datetime.datetime.now().strftime("%m-%d"),
            'name': "Public Storage"
        }
        size = []  # unit sizes already emitted (dedupe)
        for n in range(len(sizeTagz)):
            #print len(sizeTagz)
            print(specialTagz2[n]).get_text()
            #print (rateTagz[n]).get_text()

            if "Outside" in (typesTagz[n]).get_text():
                if (sizeTagz[n]).get_text() in outside_units:
                    if (sizeTagz[n]).get_text() not in size:

                        size.append((sizeTagz[n]).get_text())
                        #size.append(re.findall(r'\d+',(sizeTagz2[n]).get_text()))
                        print "logic hit"

                        yield {
                            #soup.findAll('p',{"class":"icon-bg"})
                            #'name': soup.find('strong', {'class':'high'}).text

                            #.replace('\n', '')
                            "special":
                            "Incomplete",  #re.sub(r"(?<=[a-z])\r?\n"," ",(specialTagz[n]).get_text()),
                            "rate": (rateTagz[n]).get_text(),
                            'size': ((sizeTagz[n]).get_text()),
                            "types": "Outside"
                        }
        driver.close()
예제 #56
0
def parseData(linkedinProfile):
    """Scrape a public LinkedIn profile and append one row to data.csv.

    The row holds: name, summary text, a space-prefixed skill string, a
    list of work-experience dicts, and a list of project dicts.  Runs a
    headless Chrome inside a virtual display to fetch the page.
    """
    # Headless browsing: virtual framebuffer + Chrome.
    display = Display(visible=0, size=(800, 600))
    display.start()
    driver = webdriver.Chrome('/usr/local/share/chromedriver')

    driver.get(linkedinProfile)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    with open('data.csv', 'a') as out_file:
        # Name of the profile owner.
        row = [soup.find("h1", {"id": "name"}).text]

        # Profile summary section.
        summary_section = soup.find("section", {"id": "summary"})
        row.append(summary_section.find("div", {"class": "description"}).text)

        # Skills: every pill except the see-more / see-less toggles,
        # accumulated with a leading space before each entry.
        skill_text = ''
        for pill in soup.find("ul", {"class": "pills"}).find_all('li'):
            if 'see-less' in pill.get("class"):
                continue
            if 'see-more' in pill.get("class"):
                continue
            skill_text += " " + pill.a.span.text
        row.append(skill_text)

        # Work experience: one dict per listed position.
        positions = soup.find("ul", {"class": "positions"}).find_all('li')
        row.append([{
            'role': pos.header.find("h4", {"class": "item-title"}).text,
            'organisation': pos.header.find("h5", {"class": "item-subtitle"}).text,
            'description': pos.find("p", {"class": "description"}).text,
        } for pos in positions])

        # Projects: one dict per project entry.
        row.append([{
            'title': proj.header.text,
            'description': proj.p.text,
        } for proj in soup.find_all("li", {"class": "project"})])

        csv.writer(out_file).writerow(row)
def gethtml1(url_root):
    """Fetch `url_root` with the module-global Chrome driver.

    Retries up to 10 times, restarting Chrome/Xvfb on failure, and solves
    the site's image captcha (via the external `main()` solver) whenever
    the anti-robot page appears.  Returns the page HTML as a str, or ''
    when the product page is missing or every retry fails.
    """
    global driver
    int1 = 0  # retry counter
    html=''
    while int1 < 10:

        # Works with a timer function: open a signal-based timer so a
        # timeout raises an error, the `except` below re-crawls this URL,
        # and after enough failures the URL is skipped.
        # (The emulated browser sometimes hangs without raising an error;
        # this signal trick detects the timeout, forces an error manually,
        # and re-crawls after catching it.)
        #signal.signal(signal.SIGALRM, timed_out)
        #signal.setitimer(signal.ITIMER_REAL, 10, 0)

        try:
            # Emulate Chrome with no visible window so this can run on a
            # server (Display/Xvfb is restarted in the except handler).
            int1 = int1 + 1




            driver.get(url_root.replace('amp;', '').replace('&th=1', ''))

            # Print the current URL (may differ after redirects).
            print(driver.current_url)
            html = driver.page_source.encode('utf-8','ignore').decode()



            if html.find('您输入的网址在我们的网站上无法正常显示网页')!= -1:
                # "URL cannot be displayed" page — the listing is gone.
                print('good is missing')
                html=""
            elif len(html) < 1024 * 5:
                # Suspiciously small page: treat as blocked and retry.
                print('block 2...',len(html))
                raise RuntimeError()
            else:
                # Anti-robot interstitial ("confirm you are not a bot"):
                # keep solving captchas until the real page comes back.
                while(html.find('抱歉,我们只是想确认一下当前访问者并非自动程序')) != -1:
                    print('block 1...')
                    f= open('tmp/fp.txt','w')
                    f.write(html)
                    f.close()

                    # Regex that locates the captcha image URL.
                    pi = r'<img src="(.*?)" />'
                    pi_url = re.findall(pi, str(html), re.S | re.M)     # findall returns a list
                    # Download the captcha image to disk for the solver.
                    if len(pi_url) > 0:
                        data = urllib.request.urlopen(pi_url[0]).read()
                        address = 'pi/pi.jpg'
                        w = open(address, 'wb')
                        w.write(data)
                        w.close()

                        de  = main(address)
                    else:
                        print('pi_url is null')
                    # Type the captcha answer into the form.
                    # NOTE(review): when pi_url is empty, `de` was never
                    # assigned, so the next line raises NameError (caught
                    # by the outer except, triggering a browser restart).
                    driver.find_element_by_id("captchacharacters").send_keys(str(de))
                    # driver.find_element_by_name("继续购物").click()
                    driver.find_element_by_class_name('a-button-text').click()
                    time.sleep(0.5)

                    driver.get(driver.current_url.replace('amp;', '').replace('&th=1', ''))

                    # Print the current URL again after re-fetching.
                    print(driver.current_url)
                    html = driver.page_source.encode('utf-8', 'ignore').decode()

            # index = random.randint(0, 15)
            # user_agent = user_agents[index]
            # head = {'User-Agent': user_agent}
            # req = request.Request(url_root, headers=head)
            # response = urllib.request.urlopen(req)
            # html = response.read().decode('utf-8')
            break
        except Exception as e:
            print('time exceeded', int1,e)
            # Hard reset: kill stale browser/Xvfb processes and temp
            # profiles, then start a fresh virtual display and driver.
            try:
                os.system('rm -rf /tmp/.com.google.Chrome*')
                os.system('rm -rf /tmp/.org.chromium*')
                os.system('pkill -9 chrome')
                os.system('pkill -9 Xvfb')
                os.system('pkill -9 chromedriver')
                os.system('pkill -9 geckodriver')
                print('sleep...')
                time.sleep(random.randint(1,3))
                print('starting')
                display = Display(visible=0, size=(800, 800))
                display.start()
                #driver = webdriver.Firefox()
                driver = webdriver.Chrome()
                driver.delete_all_cookies()
                print('started')
            except Exception as e:
                print('!', e)
    return html
예제 #58
0
class WebAssay:
    """
    This is a base class that is built on top of a Selenium driver.
    
    Inherit from this class to
    1. parse web pages, 
    2. calculate the area and position of elements, and 
    3. stain HTML pages for parsed elements.
    
    It can be used as a base class for variants of WebAssay.
    You must implement a `run` function to use the base class.
    """
    def __init__(self,
                 user_agent: str,
                 window_size: tuple,
                 headless=False,
                 parser_functions: List = [],
                 color_palette: Dict = {},
                 warpped_height_px: int = 700,
                 reset_driver_after: int = 50):
        """
        `headless` should be set to True if you want a headless web browser.
        `color_palette` is a dictionary that maps from element category to a 
          hex color.
        `parser_functions` a list of parser functions. 
          Where a parser function takes bs4, and returns a list of dictionaries. 
          Be sure to make one of those keys contains `category`, 
          if you're using a `color_pallette` and want to stain images.
        `warpped_height_px` is the minimum y-distance in pixels to consider 
          an element warpped.

        NOTE(review): `parser_functions=[]` and `color_palette={}` are
        mutable default arguments shared across instances — confirm no
        caller mutates them after construction.
        """
        # functions that take bs4 and return a list of dicts.
        self.parser_functions = parser_functions
        if len(self.parser_functions) == 0:
            raise ValueError("Please assign parser_functions!")

        # browser params
        self.window_size = window_size
        self.width, self.height = window_size
        self.user_agent = user_agent
        self.headless = headless
        self._init_browser()

        # optional params
        self.color_palette = color_palette  # dictionary of category to color.
        self.warpped_height = warpped_height_px  # skip elements whose height exceeds.

        # friends we make along the way
        self.error_files = []  # which files are not parsed correctly?
        self.element_metadata = pd.DataFrame(
        )  # the most recent element metadata.
        self.driver_reset_counter = 0  # driver will reset at `reset_driver_after`.
        self.reset_driver_after = reset_driver_after

    def _init_browser(self):
        """
        Initializes a selenium browser with proper `user_agent` and window `size`.
        Set `headless` to True to have a headless browser. 
        Keep the default as False to help debug.
        """
        # self.display is False when visible, or a running virtual
        # display when headless.
        self.display = False
        if self.headless:
            self.display = Display(visible=0,
                                   size=(self.width + 10, self.height + 10))
            self.display.start()

        # Set up user agent
        # NOTE(review): FirefoxProfile and DesiredCapabilities are
        # deprecated in Selenium 4 — confirm the pinned selenium version.
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", self.user_agent)
        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
        firefox_capabilities['marionette'] = True

        driver = webdriver.Firefox(profile, capabilities=firefox_capabilities)
        driver.set_window_size(*self.window_size)
        self.driver = driver

    def close_driver(self):
        """Closes the driver and, when headless, stops the virtual display."""
        self.driver.quit()
        if not isinstance(self.display, bool):
            self.display.stop()

    def restart_driver(self):
        """Restarts drivers and display"""
        self.close_driver()
        self._init_browser()
        self.driver_reset_counter = 0
        time.sleep(2)

    def save_source(self, fn: str):
        """Saves the source code of a page."""
        with open(fn, 'w') as f:
            f.write(self.driver.page_source)

    def screenshot_full(self, fn: str):
        """
        Takes a full screenshot. There are other methods that work
        better with a headless browser (such as expanding the window).
        
        The screenshot is resized to the original dimensions.
        For whatever reason, I get higher res images by the default
        screenshot.
        
        The standard size allows us to mark up the screenshot with the
        element metadata in `paint_abstract_representation`.
        """
        body = self.driver.find_element_by_tag_name('body')
        body.screenshot(fn)

        # resize image
        # NOTE(review): Image.ANTIALIAS was removed in Pillow 10 (use
        # Image.LANCZOS) — confirm the pinned Pillow version.
        img = Image.open(fn)
        img.thumbnail((body.rect['width'], 1e6), Image.ANTIALIAS)
        img.save(fn)

    def identify_elements(
            self, body: Union[element.Tag, element.NavigableString]) -> List:
        """ 
        Runs every parser in `self.parser_functions` through the web page.
        The results are appended to the `data` output.
        """
        data = []
        for parser in self.parser_functions:
            results = parser(body)
            data.extend(results)
        return data

    def stain_element(self,
                      xpath: str,
                      category: str,
                      color: str = '#ffffff',
                      opacity: float = 0.7) -> bool:
        """
        Alters the HTML of a page.
        Stains elements located in `xpath` with `color` by overwriting 
        the style attribute.
        Also sets a new param of markup_category = `category`.

        Returns True when the element was found, displayed, and stained.
        """
        try:
            elm = self.driver.find_element_by_xpath(xpath)
        except:  # couldn't find element
            return False
        if not elm.is_displayed():
            return False
        style = elm.get_attribute('style')
        if elm.tag_name == 'img':
            # Images: multiply-blend the stain onto the image itself and
            # also tint the nearest ancestor div.
            custom_style = f"background-color: {color} !important; " \
                            "transition: all 0.5s linear;"\
                            "mix-blend-mode: multiply !important;"
            if style:
                style += '; ' + custom_style
            else:
                style = custom_style
            self.driver.execute_script(
                f"arguments[0].setAttribute('markup_category','{category}')",
                elm)
            self.driver.execute_script(
                f"arguments[0].setAttribute('style','{style}')", elm)
            parent = elm.find_element_by_xpath('ancestor::div[1]')
            style_parent = parent.get_attribute('style')
            custom_style = f"background-color: {color} !important; "
            if style_parent:
                style_parent += '; ' + custom_style
            else:
                style_parent = custom_style
            self.driver.execute_script(
                f"arguments[0].setAttribute('style','{style_parent}')", parent)
        else:
            # Non-image elements: tag and tint the element, then stain any
            # visible images/videos it contains so they blend with it.
            self.driver.execute_script(
                f"arguments[0].setAttribute('markup_category','{category}')",
                elm)
            custom_style = f"background-color: {color} !important; "\
                            "transition: all 0.5s linear;"
            if style:
                style += '; ' + custom_style
            else:
                style = custom_style
            self.driver.execute_script(
                f"arguments[0].setAttribute('style','{style}')", elm)
            all_images = elm.find_elements_by_tag_name('img')
            for img in all_images:
                if img.is_displayed():
                    style = img.get_attribute('style')
                    custom_style = f"background-color: {color} !important; " \
                                    "mix-blend-mode: multiply !important; z-index:99 !important;"
                    if style:
                        style += '; ' + custom_style
                    else:
                        style = custom_style
                    self.driver.execute_script(
                        f"arguments[0].setAttribute('style','{style}')", img)
            all_videos = elm.find_elements_by_tag_name('video')
            for vid in all_videos:
                if vid.is_displayed():
                    style = vid.get_attribute('style')
                    custom_style = f"background-color: {color} !important; " \
                                    "mix-blend-mode: multiply !important; z-index:99 !important;"
                    if style:
                        style += '; ' + custom_style
                    else:
                        style = custom_style
                    self.driver.execute_script(
                        f"arguments[0].setAttribute('style','{style}')", vid)
            if elm.tag_name == 'a':
                all_children_by_xpath = elm.find_elements_by_tag_name("div")
                for child in all_children_by_xpath:
                    if child.is_displayed():
                        # NOTE(review): reads the style of `elm`, not
                        # `child`, before writing to `child` — confirm
                        # this is intended.
                        style = elm.get_attribute('style')
                        custom_style = f"background-color: {color} !important; "
                        if style:
                            style += '; ' + custom_style
                        else:
                            style = custom_style
                        self.driver.execute_script(
                            f"arguments[0].setAttribute('style','{style}')",
                            child)
        return True

    def calculate_element_area(self, xpath: str) -> Dict:
        """
        Selenium will try to find an element based on the `xpath`.
        If it is found, calculate the `area` that element occupies 
        on first screen (`area`) and whole page (`area_page`).
        
        If the element is warpped or empty, return an empty dict.
        """
        # get the element based on the xpath
        try:
            elm = self.driver.find_element_by_xpath(xpath)
        except:  # couldn't find element
            return {}

        # get dimensions of element
        rect = elm.rect
        # skip warped elements
        if rect['height'] >= self.warpped_height:
            return {'is_warpped': True}

        # adjust the dimensions by clipping if necessary. "Area" is the first screen
        # NOTE(review): when the element exists but is not displayed this
        # falls through and implicitly returns None, not the empty dict
        # the docstring promises — callers should handle both.
        if elm.is_displayed():
            area = calc_area(rect,
                             location=rect,
                             width=self.width,
                             height_bottom=self.height)
            area_page = calc_area(rect, location=rect, width=self.width)
            meta = {
                'xpath': xpath,
                'dimensions': elm.size,
                'location': elm.location,
                'area': area,
                'area_page': area_page,
            }

            return meta

    def open_local_html(self, fn):
        """Opens a local HTML page in the emulator."""
        local_file = 'file://' + os.path.abspath(fn)
        if self.driver.current_url != local_file:
            self.driver.get(local_file)

    def run(self):
        """
        This function must be overwritten in the inherited class.
        
        Should contain the following steps:
        1. Read either the current page on the driver or a local HTML file 
           `fn` into bs4...
           
        2. Identify elements by sending the contents of the HTML through each 
           parser in `parser_functions`. 
           Do this by calling `self.identify_elements()` on the page.
           
        3. For each element, `self.calculate_element_area()`, 
           and optionally `self.stain_element()` if self.stain = True.
           
        4. Assign `self.element_metadata` with the latest element metadata.
        
        And then anything else is up to you.
        """
        raise NotImplementedError
예제 #59
0
def crawl_meta(meta_hdf5=None,
               write_meta_name='data.hdf5',
               crawl_review=False):
    """Crawl ICLR submission metadata from OpenReview, or load it locally.

    Args:
        meta_hdf5: Path to a previously written HDF5 file. When given, the
            crawl is skipped entirely and the data is read from disk.
        write_meta_name: Filename the freshly crawled data is written to.
        crawl_review: When True, also record the word count of each review.

    Returns:
        List of ``PaperMeta`` objects, one per URL listed in ``urls.txt``.
    """
    if meta_hdf5 is not None:
        # Load previously crawled metadata from a local file.
        return read_meta(meta_hdf5)

    # OpenReview pages are rendered dynamically, so a real (headless)
    # browser on a virtual display is required to crawl them.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from pyvirtualdisplay import Display
    import time

    display = Display(visible=0, size=(800, 800))
    display.start()

    executable_path = '/usr/local/bin/chromedriver'
    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Chrome(options=options,
                               executable_path=executable_path)

    try:
        # One URL per line; one page per ICLR submission.
        with open('urls.txt') as f:
            urls = [url.strip() for url in f.readlines()]

        meta_list = []
        wait_time = 0.25   # seconds between render polls
        max_try = 1000     # give up on the abstract after this many polls

        for i, url in enumerate(urls):
            browser.get(url)
            time.sleep(wait_time)

            # Field names ("Abstract:", "Keywords:", ...) and their values
            # appear in two parallel element lists on the page.
            key = [k.text for k in browser.find_elements_by_class_name(
                "note_content_field")]
            value = [v.text for v in browser.find_elements_by_class_name(
                "note_content_value")]
            withdrawn = 'Withdrawal Confirmation:' in key
            desk_reject = 'Desk Reject Comments:' in key

            # title / authors
            title = string.capwords(
                browser.find_element_by_class_name("note_content_title").text)
            author = string.capwords(
                browser.find_element_by_class_name("meta_row").text).split(
                    ', ')

            # The page may not have finished rendering; poll until the
            # abstract field shows up or max_try polls have elapsed.
            tries = 0
            while 'Abstract:' not in key and tries < max_try:
                time.sleep(wait_time)
                tries += 1
                key = [k.text for k in browser.find_elements_by_class_name(
                    "note_content_field")]
                value = [v.text for v in browser.find_elements_by_class_name(
                    "note_content_value")]
                withdrawn = 'Withdrawal Confirmation:' in key
                desk_reject = 'Desk Reject Comments:' in key

            if 'Abstract:' in key:
                abstract = ' '.join(value[key.index('Abstract:')].split('\n'))
            else:
                # BUG FIX: previously key.index('Abstract:') raised
                # ValueError here when the field never appeared; fall back
                # to an empty abstract instead of crashing the whole crawl.
                print('Reached max try: {} ({})'.format(title, url))
                abstract = ''

            # keywords: strip, drop empties, CamelCase each (also across '-')
            if 'Keywords:' in key:
                raw = value[key.index('Keywords:')].split(',')
                keyword = [k.strip(' ') for k in raw]
                keyword = [
                    ''.join(string.capwords(k).split(' ')) for k in keyword
                    if not k == ''
                ]
                keyword = [
                    ''.join(string.capwords(part) for part in kw.split('-'))
                    if '-' in kw else kw
                    for kw in keyword
                ]
            else:
                keyword = []

            # reviewer ratings: leading integer of every "Rating:" value
            rating = [
                int(value[idx].split(":")[0])
                for idx, field in enumerate(key) if field == "Rating:"
            ]

            # optional per-review word counts
            review_len = None
            if crawl_review:
                review_len = [
                    len([w for w in value[idx].replace('\n', ' ').split(' ')
                         if not w == ''])
                    for idx, field in enumerate(key) if field == "Review:"
                ]

            # decision and meta-review (the value right after "Decision:")
            if 'Decision:' in key:
                decision = value[key.index('Decision:')]
                meta_review = value[key.index('Decision:') + 1]
            else:
                decision = 'N/A'
                meta_review = ''
            meta_review_len = len([
                w for w in meta_review.replace('\n', ' ').split(' ')
                if not w == ''
            ])

            # progress log
            log_str = '[{}] ratings: {}'.format(i + 1, rating)
            log_str += ', meta review len: {}'.format(meta_review_len)
            if not decision == 'N/A':
                log_str += ', decision: {}'.format(decision)
            log_str += '] {}'.format(title)
            log_str += ' by {}'.format(', '.join(author))
            if withdrawn:
                log_str += ' (withdrawn)'
            if desk_reject:
                log_str += ' (desk_reject)'
            print(log_str)

            meta_list.append(
                PaperMeta(
                    title,
                    abstract,
                    keyword,
                    rating,
                    url,
                    withdrawn,
                    desk_reject,
                    decision,
                    author,
                    review_len,
                    meta_review_len,
                ))

        # Persist the crawled data so subsequent runs can skip the crawl.
        write_meta(meta_list, write_meta_name)
    finally:
        # BUG FIX: the browser and virtual display were never shut down,
        # leaking a chromedriver process and an Xvfb display per call.
        browser.quit()
        display.stop()

    return meta_list
예제 #60
0
class UITestCase(LiveServerTestCase):
    def use_xvfb(self):
        """Start a virtual (Xvfb) display and attach a fresh driver to it."""
        from pyvirtualdisplay import Display
        display = Display('xvfb', visible=1, size=(1280, 1024))
        display.start()
        self.display = display
        self.driver = WebDriver()

    def setUp(self):
        """Start a webdriver (on a virtual display if needed), seed test data."""
        try:
            self.driver = WebDriver()
        except WebDriverException:
            # No usable display for a real browser; retry on Xvfb.
            self.use_xvfb()

        self.driver.implicitly_wait(10)

        clear_caches()
        setup_for_ui_test()

        super(UITestCase, self).setUp()

    def tearDown(self):
        """Shut down the browser session and reset cached global state."""
        # Quit the driver before stopping the virtual display (which only
        # exists if setUp had to fall back to use_xvfb).
        self.driver.quit()
        if hasattr(self, 'display'):
            self.display.stop()

        # Clear cached ContentType lookups so state does not leak between tests.
        ContentType.objects.clear_cache()

        super(UITestCase, self).tearDown()

    def click(self, selector):
        """Click the element matched by the CSS *selector*."""
        element = self.find(selector)
        element.click()

    def click_when_visible(self, selector):
        """Wait for the matched element to become visible, then click it."""
        self.wait_until_visible(self.find(selector)).click()

    def find(self, selector):
        """Look up a single element by CSS *selector*."""
        driver = self.driver
        return driver.find_element_by_css_selector(selector)

    def find_name(self, name):
        """Look up a single element by its ``name`` attribute."""
        driver = self.driver
        return driver.find_element_by_name(name)

    def find_id(self, id):
        """Look up a single element by its ``id`` attribute."""
        driver = self.driver
        return driver.find_element_by_id(id)

    def process_login_form(self, username, password):
        """Fill in and submit the login form on the current page."""
        # The username field doubles as the "form is loaded" signal.
        self.wait_until_present('[name="username"]').send_keys(username)
        self.find_name('password').send_keys(password)

        self.click('form * button')

    def browse_to_url(self, url):
        """Navigate to *url*, taken relative to the live test server."""
        target = self.live_server_url + url
        self.driver.get(target)

    def browse_to_instance_url(self, url, instance=None):
        """Navigate to *url* under an instance's URL prefix.

        Args:
            url: Path relative to the instance root.
            instance: Instance whose ``url_name`` prefixes the path;
                defaults to ``self.instance``.
        """
        instance = instance if instance is not None else self.instance
        # BUG FIX: this previously used self.instance.url_name directly,
        # silently ignoring an explicitly passed ``instance`` argument.
        self.driver.get('%s/%s/%s' %
                        (self.live_server_url, instance.url_name, url))

    def find_anchor_by_url(self, url):
        """Return the anchor element whose ``href`` equals *url*."""
        selector = "[href='%s']" % url
        return self.find(selector)

    def wait_until_present(self, selector, timeout=10):
        """
        Wait until an element with CSS 'selector' exists on the page.
        Useful for detecting that an operation loads the page you're expecting.
        """
        # The inner predicate stashes the element in a dict cell so it can
        # be returned after the wait completes.
        result = {'element': None}

        def is_present(driver):
            result['element'] = self.find(selector)
            return result['element'] is not None

        WebDriverWait(self.driver, timeout).until(is_present)
        return result['element']

    def wait_until_text_present(self, text, timeout=10):
        """
        Wait until 'text' exists on the page.
        Useful for detecting that an operation loads the page you're expecting.
        """
        def has_text(driver):
            return text in driver.page_source

        WebDriverWait(self.driver, timeout).until(has_text)

    def wait_until_enabled(self, element_or_selector, timeout=10):
        """
        Wait until 'element_or_selector' is enabled; return the element.
        """
        element = self._get_element(element_or_selector)

        def is_enabled(driver):
            # An enabled element carries no "disabled" attribute at all.
            return element.get_attribute("disabled") is None

        WebDriverWait(self.driver, timeout).until(is_enabled)
        return element

    def wait_until_visible(self, element_or_selector, timeout=10):
        """
        Wait until 'element_or_selector' (known to already exist on the page)
        is displayed; return the element.
        """
        element = self._get_element(element_or_selector)

        def is_shown(driver):
            return element.is_displayed()

        WebDriverWait(self.driver, timeout).until(is_shown)
        return element

    def wait_until_invisible(self, element_or_selector, timeout=10):
        """
        Wait until 'element_or_selector' (known to already exist on the page)
        is not displayed; return the element.
        """
        element = self._get_element(element_or_selector)

        def is_hidden(driver):
            try:
                return not element.is_displayed()
            except StaleElementReferenceException:
                # Removed from the DOM entirely — certainly not visible.
                return True

        WebDriverWait(self.driver, timeout).until(is_hidden)
        return element

    def _get_element(self, element_or_selector):
        """Resolve a CSS selector string to an element; pass elements through."""
        if not isinstance(element_or_selector, basestring):
            return element_or_selector
        return self.find(element_or_selector)