Exemplo n.º 1
0
# Drop the two header/placeholder rows and limit the crawl to five recipes.
urls_df = all_urls_df.drop([0, 1])
urls_df = urls_df[:5]

app_path = "/usr/local/bin/chromedriver"
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = Chrome(executable_path=app_path, chrome_options=chrome_options)

for url in urls_df['recipe-url']:
    driver.get(url)

    time.sleep(2)  # give the page time to render before scraping

    # Scrape the title of the recipe
    recipe_title = driver.find_element_by_tag_name('h1').text

    # Scrape the creator-provided description.
    # BUG FIX: find_element_by_class_name does not accept compound class
    # names ("print-only recipe-layout__description" raises); use a CSS
    # selector that requires both classes instead.
    descrip = driver.find_element_by_css_selector(
        '.print-only.recipe-layout__description').text

    # Scrape the recipe author (same compound-class fix as above)
    author_name = driver.find_element_by_css_selector(
        '.recipe-details__author-link.theme-color').text
    # BUG FIX: the stray "need href" line was a syntax error. Pull the href
    # out of the author block's anchor explicitly.
    # NOTE(review): assumes the author block contains an <a> — confirm
    # against the live page markup.
    author_url = driver.find_element_by_class_name('recipe-details__author')\
        .find_element_by_tag_name('a').get_attribute('href')

    # Scrape the star rating from the style and the number of reviews from the
    # text
    star_rating = driver.find_element_by_class_name("stars-rate__filler")
    rating_val = star_rating.get_attribute("style")
Exemplo n.º 2
0
import selenium
from selenium import webdriver 
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains

# Credentials for the Athena library system (fill in your own).
password_arquias = "suasenha"
login_arquias = "seucpf"

browser = Chrome()
browser.get("https://www.athena.biblioteca.unesp.br/F/RR61ETIMN453GKPD568TA7PDXE6NR6T665RJLJ6XB8FG34BTGV-43921?func=BOR-INFO")
time.sleep(5)

# Tab 17 times to reach the login link, then press ENTER to follow it.
for _ in range(17):
    browser.find_element_by_tag_name("body").send_keys(Keys.TAB)
time.sleep(5)

ActionChains(browser).key_down(Keys.ENTER).perform()
time.sleep(5)

# Fill in the login form and submit with ENTER.
browser.find_element_by_id("pat_id").send_keys(login_arquias)
browser.find_element_by_id("pat_password").send_keys(password_arquias)
browser.find_element_by_id("pat_password").send_keys(Keys.ENTER)
time.sleep(5)

# Tab through the account page the same way once logged in.
for _ in range(17):
    browser.find_element_by_tag_name("body").send_keys(Keys.TAB)
Exemplo n.º 3
0
class NewVisitorTest(LiveServerTestCase):
    """Functional (browser-driven) tests for the to-do list app."""

    def setUp(self):
        options = ChromeOptions()
        options.add_argument("--no-sandbox")
        self.browser = Chrome(options=options)

    def tearDown(self):
        # BUG FIX: the cleanup hook must be named `tearDown` (lower-case t).
        # The original `TearDown` was never invoked by unittest, so a
        # Chrome process leaked after every test.
        self.browser.quit()

    def wait_for_row_in_list_table(self, row_text):
        """Poll the list table until `row_text` appears, or raise after MAX_WAIT."""
        start_time = time.time()
        while True:
            try:
                table = self.browser.find_element_by_id('id_list_table')
                rows = table.find_elements_by_tag_name('tr')
                self.assertIn(row_text, [row.text for row in rows])
                return
            except (AssertionError, WebDriverException) as e:
                # Keep retrying until the deadline; the page may still be
                # loading or mid-refresh.
                if time.time() - start_time > MAX_WAIT:
                    raise e
                time.sleep(0.5)

    def test_can_start_a_list_for_one_user(self):
        # Edith has heard about a cool new online to-do app. She goes
        # to check out its homepage
        self.browser.get(self.live_server_url)

        # She notices the page title and header mention to-do lists
        self.assertIn('To-Do', self.browser.title)
        header_text = self.browser.find_element_by_tag_name('h1').text
        self.assertIn('To-Do', header_text)

        # She is invited to enter a to-do item straight away
        inputbox = self.browser.find_element_by_id('id_new_item')
        self.assertEqual(inputbox.get_attribute('placeholder'),
                         'Enter a to-do item')

        # She types "Buy peacock feathers" into a text box (Edith's hobby
        # is tying fly-fishing lures)
        inputbox.send_keys('Buy peacock feathers')

        # When she hits enter, the page updates, and now the page lists
        # "1: Buy peacock feathers" as an item in a to-do list table
        inputbox.send_keys(Keys.ENTER)
        self.wait_for_row_in_list_table('1: Buy peacock feathers')

        # There is still a text box inviting her to add another item. She
        # enters "Use peacock feathers to make a fly" (Edith is very
        # methodical)
        inputbox = self.browser.find_element_by_id('id_new_item')
        inputbox.send_keys('Use peacock feathers to make a fly')
        inputbox.send_keys(Keys.ENTER)

        # The page updates again, and now shows both items on her list
        self.wait_for_row_in_list_table(
            '2: Use peacock feathers to make a fly')
        self.wait_for_row_in_list_table('1: Buy peacock feathers')

        # Satisfied, she goes back to sleep

    def test_multiple_users_can_start_lists_at_different_urls(self):
        # Edith starts a new to-do list
        self.browser.get(self.live_server_url)
        inputbox = self.browser.find_element_by_id('id_new_item')
        inputbox.send_keys('Buy peacock feathers')
        inputbox.send_keys(Keys.ENTER)
        self.wait_for_row_in_list_table('1: Buy peacock feathers')

        # She notices that her list has a unique URL
        edith_list_url = self.browser.current_url
        self.assertRegex(edith_list_url, '/lists/.+')

        # Now a new user, Francis, comes along to the site.

        ## We use a new browser session to make sure that no information
        ## of Edith's is coming through from cookies etc
        self.browser.quit()
        options = ChromeOptions()
        options.add_argument("--no-sandbox")
        self.browser = Chrome(options=options)

        # Francis visits the home page.  There is no sign of Edith's
        # list
        self.browser.get(self.live_server_url)
        page_text = self.browser.find_element_by_tag_name('body').text
        self.assertNotIn('Buy peacock feathers', page_text)
        self.assertNotIn('make a fly', page_text)

        # Francis starts a new list by entering a new item. He
        # is less interesting than Edith...
        inputbox = self.browser.find_element_by_id('id_new_item')
        inputbox.send_keys('Buy milk')
        inputbox.send_keys(Keys.ENTER)
        self.wait_for_row_in_list_table('1: Buy milk')
        # Francis gets his own unique URL
        francis_list_url = self.browser.current_url
        self.assertRegex(francis_list_url, '/lists/.+')
        self.assertNotEqual(francis_list_url, edith_list_url)
        # Again, there is no trace of Edith's list
        page_text = self.browser.find_element_by_tag_name('body').text
        self.assertNotIn('Buy peacock feathers', page_text)
        self.assertIn('Buy milk', page_text)

        # Satisfied, they both go back to sleep

    def test_layout_and_styling(self):
        # Edith goes to the home page
        self.browser.get(self.live_server_url)
        self.browser.set_window_size(1024, 768)

        # She notices the input box is nicely centered
        inputbox = self.browser.find_element_by_id('id_new_item')
        self.assertAlmostEqual(inputbox.location['x'] +
                               inputbox.size['width'] / 2,
                               512,
                               delta=10)
Exemplo n.º 4
0
# with open('./kbo_link.json','wt') as f:
#     json.dump(kbo_link,f)

### 유투브 댓글 크롤링 ###
### YouTube comment crawling ###
kbo_bonki = []  # accumulator for scraped comments (filled beyond this excerpt)
for li in kbo_commentlink:
    # One fresh browser per comment-page link.
    delay = 2
    browser = Chrome()
    browser.implicitly_wait(delay)

    start_url = "https://www.youtube.com" + li
    browser.get(start_url)
    browser.maximize_window()
    print(start_url)
    time.sleep(3)
    body = browser.find_element_by_tag_name('body')

    pagedowns = 2  # page down twice so YouTube lazy-loads the comment section
    while pagedowns:
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(2)
        pagedowns -= 1
    time.sleep(3)
    print('@@@@@@@여기까지 정상')

    # Parse the rendered DOM with BeautifulSoup; grab up to five comments.
    html0 = browser.page_source
    soup = BeautifulSoup(html0, 'lxml')

    comment_list = soup.find_all('yt-formatted-string',
                                 id='content-text',
                                 limit=5)
Exemplo n.º 5
0
# Form data used to fill the page under test.
dados = {
    'nome': 'Wagner',
    'email': '*****@*****.**',
    'senha': '123456',
    'telefone': '(00)00000-0000',
}

# Minimal percent-decoding table for the characters this form produces.
dict_elementos = {'%40': '@', '%28': '(', '%29': ')'}

dict_url = {}

preencher_formulario(chrome, **dados)
sleep(5)

# Rebuild the submitted values from the GET query string.
url_parseada = urlparse(chrome.current_url)
list_query = url_parseada.query.split('&')

for texto in list_query:
    # BUG FIX: split on the first '=' only, so values that themselves
    # contain '=' no longer raise "too many values to unpack".
    atributo, valor = texto.split('=', 1)
    if atributo != 'btn':
        dict_url[atributo] = valor

# Undo percent-encoding for the known escapes.
for cod, decod in dict_elementos.items():
    for chave, valor in dict_url.items():
        dict_url[chave] = valor.replace(cod, decod)

# The page echoes the parsed form data into a <textarea> as a dict literal;
# normalize the quotes so json can parse it, then compare.
textarea = chrome.find_element_by_tag_name('textarea')
dict_text = json.loads(textarea.text.replace('\'', '\"'))

assert dict_text == dict_url
Exemplo n.º 6
0
from selenium.webdriver import Chrome

from time import sleep

# Lesson page: clicking the link appends a new <p> counter element.
url = 'https://curso-python-selenium.netlify.app/aula_03.html'

browser = Chrome()
browser.get(url)

sleep(3)  # let the page finish loading

# The first (only) link on the page.
a = browser.find_element_by_tag_name('a')

for click in range(10):
    # NOTE(review): the <p> list is captured *before* the click, so p[-1]
    # reflects the previous iteration's state — confirm that is intended.
    p = browser.find_elements_by_tag_name('p')
    a.click()
    print(f'Valor de p: {p[-1].text} valor do clock: {click}')
    print(f'Valor de p: {p[-1].text == str(click)}')

print(f'Texto de A: {a.text}')
#browser.quit()
Exemplo n.º 7
0
class InstaBot(object):
    """Selenium-driven Instagram bot: log in, follow users, like posts by tag."""

    base_url = 'https://www.instagram.com'

    def __init__(self, implicit_wait=20, page_load_timeout=30):
        # Start a virtual framebuffer for headless servers; silently skip
        # environments where Xvfb is unavailable.
        try:
            Xvfb().start()
        except EnvironmentError:
            pass

        options = ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-setuid-sandbox')

        self.driver = Chrome(settings.CHROMEDRIVER_PATH,
                             chrome_options=options)
        self.driver.implicitly_wait(implicit_wait)
        self.driver.set_page_load_timeout(page_load_timeout)

        self.wait = WebDriverWait(self.driver, settings.WEB_DRIVER_WAIT_SEC)

        # Session counters.
        self.liked = 0
        self.liked_total_samples = 0
        self.followed = 0

    def close(self):
        """Best-effort shutdown of the driver and helper processes."""
        try:
            self.driver.delete_all_cookies()
            self.driver.close()

            from subprocess import call
            call(['killall', 'Xvfb'])
            call(['killall', 'chromedriver'])
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; cleanup remains deliberately best-effort.
            pass

    def login(self, username=None, password=None):
        """Log in, falling back to INSTABOT_IG_USERNAME/PASSWORD env vars.

        Raises InvalidUsernamePasswordError when neither source provides
        credentials.
        """
        username = username or os.environ.get('INSTABOT_IG_USERNAME')
        password = password or os.environ.get('INSTABOT_IG_PASSWORD')

        if not username or not password:
            raise InvalidUsernamePasswordError

        logger.info("Logging in as: %s" % username)

        self.driver.get(self.base_url)
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, xpath.login))).click()
        self.driver.find_element_by_name('username').send_keys(username)
        self.driver.find_element_by_name('password').send_keys(password)
        self.driver.find_element_by_xpath(xpath.submit_login).click()

    def follow_users(self, usernames=None):
        """
        Follow all the users (don't pass `@')
        """
        for username in usernames:
            time.sleep(settings.FOLLOW_USER_SLEEP_SEC)
            self.driver.get('%s/%s' % (self.base_url, username))
            try:
                elem = self.wait.until(
                    EC.element_to_be_clickable((By.XPATH, xpath.follow)))
                # The button reads "Following" when we already follow them.
                if elem.text.lower() != 'following':
                    elem.click()
                    self.followed += 1
                    logger.info("Started following %s" % username)
                else:
                    logger.info("Already following %s" % username)

            except NoSuchElementException as e:
                logger.info(e)

            except Exception as e:
                logger.error(e)

    def like_tags(self, tags, num=100):
        """
        Like `num' number of posts when exploring hashtag (don't pass `#')

        A random sample of posts will be liked for a given tag
        Return the usernames of the posts liked
        """
        usernames = []
        for tag in tags:
            time.sleep(settings.LIKE_TAG_SLEEP_SEC)
            logger.info("Liking posts with tag: %s" % tag)
            self.driver.get('%s/explore/tags/%s/' % (self.base_url, tag))
            time.sleep(settings.LIKE_TAG_SLEEP_SEC)
            # BUG FIX: `num / 10` is a float in Python 3 and `range()` in
            # _load_more would raise TypeError; use floor division.
            self._load_more(max(1, num // 10))

            # get the actual url's of images to like
            try:
                main = self.driver.find_element_by_tag_name('main')
            except NoSuchElementException as e:
                logger.info(e)
                continue

            links = main.find_elements_by_tag_name('a')
            urls = [link.get_attribute('href') for link in links]

            sample = random.sample(urls, min(num, len(links)))
            self.liked_total_samples += len(sample)
            logger.info("Like sample size: %d" % len(sample))
            for url in sample:
                time.sleep(settings.LIKE_TAG_SLEEP_SEC)
                try:
                    self.driver.get(url)
                    elem = self.driver.find_element_by_link_text('Like')
                    username = self.driver.find_element_by_xpath(
                        xpath.profile_username).text

                    elem.click()
                    self.liked += 1
                    usernames.append(username)

                except NoSuchElementException as e:
                    logger.info(e)

            logger.info("Liked %d/%d" % (self.liked, self.liked_total_samples))

        return usernames

    def _load_more(self, n=10):
        """
        Press "end" key `n' times to load more images
        """
        body = self.driver.find_element_by_tag_name('body')
        for _ in range(n):
            body.send_keys(Keys.END)
            time.sleep(settings.LOAD_MORE_SLEEP_SEC)
Exemplo n.º 8
0
import requests
from bs4 import BeautifulSoup
from time import sleep
import csv
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

# Scrape the video titles from a YouTube channel's /videos page.
url = "https://www.youtube.com/user/whatsg/videos"
chromedriver = '/Users/mclaren/Downloads/CodingFor/chromedriver'

driver = Chrome(chromedriver)
driver.get(url)

# Press END twice (sleeping in between) so YouTube lazy-loads more videos.
page = driver.find_element_by_tag_name('html')
for _ in range(2):
    page.send_keys(Keys.END)
    sleep(3)

title_elements = driver.find_elements_by_id('video-title')

titles = [element.text for element in title_elements]
print(titles)
print("----------------------Number of titles {} ---------------------------".
      format(len(titles)))

driver.close()
Exemplo n.º 9
0
class Scraper:
    """
    Class for an OkCupid Scraper

    Attributes: 
        name (str): alias for the account that will be used to access OKC for scraping
        driver (WebDriver): tool used to get and navigate web pages
        db (pymongo database): the `okc` database on the local MongoClient, used to store data
        email (str): email of the scraper account
        pw (str): password of the scraper account
        version (str): date string of the datetime when current version was completed.
    """

    def __init__(self, name, headless = True, driverpath=f'{os.getcwd()}/src/chromedriver'):
        """
        Constructor for the Scraper class

        Parameters:
            name (str): alias for the account that will be used to access okc for scraping
            headless (bool): run Chrome without a visible window when True
            driverpath (str): path to the web driver file
        """
        self.name = name
        opt = Options()
        opt.headless = headless
        self.driver = Chrome(executable_path=driverpath, options=opt)
        self.db = MongoClient('localhost', 27017).okc

        #get email and password from file
        user = pd.read_csv('src/okc_account_credentials', index_col=0).loc[name]
        self.email = user.email
        self.pw = user.pw


        #fetch current version
        record = self.db.scrapers.find_one({'_id': name})
        if record != None:
            self.version = record['current_version']
        else:
            self.version = None

    def login(self):
        """
        Logs in to the scraper's account

        """
        self.driver.get('https://www.okcupid.com/login')
        time.sleep(2)

        self.driver.find_element_by_class_name('accept-cookies-button')\
            .click()
        time.sleep(1)

        try:
            self.driver.find_element_by_class_name('login-username')\
                .send_keys(self.email)

            self.driver.find_element_by_class_name('login-password')\
                .send_keys(self.pw)

            self.driver.find_element_by_class_name('login-actions-button')\
                .click()

        #sometimes it's a different login form
        except NoSuchElementException:
            self.driver.find_element_by_name('username')\
                .send_keys(self.email)
            self.driver.find_element_by_name('password')\
            .send_keys(self.pw)
            self.driver.find_element_by_class_name('login2017-actions-button')\
                .click()
        time.sleep(2)


    def logout(self):
        """
        Logs the current scraper out.
        """
        self.driver.get('https://www.okcupid.com/logout')
        self.driver.close()


    def set_first_version(self, question_data):
        """
        Store `question_data` in mongo as this scraper's first version and
        mark it as the current version.
        """
        #qd = self.getScraperQuestionData()
        dt_now = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.db.scrapers.insert_one({    \
             '_id': self.name,           \
             'current_version': dt_now,  \
             'versions':{                \
                dt_now: question_data    \
        }})
        self.version = dt_now


    def add_questions_update_version(self, new_question_data):
        """
        Merge `new_question_data` into the previous version's question data
        and record the merged list as a new current version.
        """
        dt_now = datetime.now().strftime('%Y%m%d_%H%M%S')

        versions = self.db.scrapers.find_one({'_id':self.name})['versions']
        prev_version = versions[self.version]
        versions[dt_now] = Scraper._merge_question_data_versions(prev_version, new_question_data)
        # NOTE(review): pymongo's Collection.update is deprecated in favor of
        # update_one — confirm the installed pymongo version before changing.
        self.db.scrapers.update({'_id':self.name}, {'$set':
            {'versions': versions, 'current_version': dt_now}})
        self.version = dt_now

    # NOTE(review): no `self` — invoked as Scraper._merge_question_data_versions(...),
    # which works in Python 3 without @staticmethod.
    def _merge_question_data_versions(prev_qd, new_qd):
        '''
        Returns a complete question-data-list that is the union of the
        two lists, where old versions of the same questions are replaced with new
        versions.
        '''
        ret_dict = {text:question for text, question in map(
            lambda q: (q['q_text'], q), new_qd)}

        for question in prev_qd:
            text = question['q_text']
            if ret_dict.get(text) == None:
                ret_dict[text] = question

        return list(ret_dict.values())


    def get_scraper_question_data(self, wait=1):
        """
        Walk the scraper's own answered-questions list on the profile page,
        scrolling to load more as needed, and return a list of question-data
        dicts (one per answered question).
        """
        self.driver.get('https://www.okcupid.com/profile')
        time.sleep(wait*3)
        sameq = lambda q1, q2: q1.find_element_by_xpath('button/h3').text ==\
            q2.find_element_by_xpath('button/h3').text
        self.driver.find_element_by_class_name('profile-selfview-questions-more').click()
        time.sleep(wait*3)

        questions = self.driver.find_elements_by_class_name('profile-question')
        i = 0
        current = questions[i]
        datalist = []

        while not sameq(current, questions[-1]):
            for j in range(i,len(questions)):

                qdatum = Scraper.get_data_from_answer_stub(questions[j])
                datalist.append(qdatum)

                '''#open question detail overlay
                questions[j].click()
                #needs a moment to load
                time.sleep(0.7)

                #scrape question overlay. 
                overlay = self.driver.find_element_by_class_name('questionspage')

                #get question text
                text = overlay.find_element_by_tag_name('h1').text

                #get the choices and what our answer was
                our_answer_buttons = overlay.find_element_by_class_name('pickonebutton-buttons')\
                    .find_elements_by_class_name('pickonebutton-button')
                choices = [b.text for b in our_answer_buttons]
                our_answer = list(map(lambda x: x.get_attribute('class')\
                    .endswith('--selected'),our_answer_buttons)).index(True)

                #get which of their answers we will accept
                their_answer_buttons = overlay.find_element_by_class_name('pickmanybuttons')\
                    .find_elements_by_tag_name('input')
                acceptable = list(map(lambda x: x.get_property('checked'), their_answer_buttons))

                #get how important the question is to our scraper
                #should all be 1:somewhat important
                importance_buttons = overlay.find_element_by_class_name('importance-pickonebutton')\
                    .find_elements_by_tag_name('button')
                importance = list(map(lambda b: b.get_attribute('class').endswith('--selected'),\
                    importance_buttons)).index(True)

                #package up the question data
                datalist.append({             \
                    'q_text': text,           \
                    'choices': choices,       \
                    'our_answer': our_answer, \
                    'acceptable': acceptable, \
                    'importance': importance  \
                })

                #exit overlay
                self.driver.find_element_by_class_name('reactmodal-header-close').click()'''

            #adjust loop conditions
            current = questions[j]
            current.location_once_scrolled_into_view
            time.sleep(wait)
            questions = self.driver.find_elements_by_class_name('profile-question')
            for i in range(len(questions)):
                if sameq(current, questions[i]):
                    break

        return datalist



    def collect_usernames(self, softlimit=np.inf):
        """
        Scroll the match page, accumulating usernames from the user cards,
        until `softlimit` names are collected or the page signals the end.

        Returns (usernames, exit_stat): exit_stat is 1 when the blank-state
        marker was hit, 0 when the results-error marker was hit.
        """
        self.driver.get('https://www.okcupid.com/match')
        time.sleep(2)
        usernames = set()
        while len(usernames) < softlimit:
            try:
                self.driver.find_element_by_class_name('blank-state-wrapper')
                exit_stat = 1
                break

            except NoSuchElementException:
                try:
                    self.driver.find_element_by_class_name('match-results-error')
                    exit_stat = 0
                    break
                except NoSuchElementException:
                    pass
                try:
                    matchcards = self.driver.find_elements_by_class_name('usercard-thumb')
                    last = matchcards[-1]
                    usernames = usernames.union(set(map(\
                        lambda card: card.get_attribute('data-username'), matchcards)))
                    last.location_once_scrolled_into_view
                except StaleElementReferenceException:
                    time.sleep(0.5+np.random.exponential())
        return (usernames, exit_stat)


    def scrape_user(self, img_save_dir, username, wait=1.5):
        """
        Scrape one user's questions, profile HTML, and images; return a dict
        ready to insert into mongo.
        """
        #TODO need try-accept block for when user isn't found

        #scrape questions first
        self.driver.get(f'https://www.okcupid.com/profile/{username}/questions')
        time.sleep(wait)

        #if there are any unanswered questions, answer them so we can scrape
        #ALL the user's answered questions later.
        if self.get_num_questions_by_filter('FIND OUT') > 0:
            qdata = self.answer_unanswered_questions()
            self.add_questions_update_version(qdata)

        #scrape the questions the user has answered
        questions = self.scrape_user_questions(username)

        #scrape their main profile contents
        self.driver.get(f'https://www.okcupid.com/profile/{username}')
        time.sleep(wait*np.random.exponential())
        try:
            self.driver.find_element_by_class_name('profile-essays-expander').click()
        except NoSuchElementException: #short profiles
            pass
        html = self.driver.find_element_by_tag_name('HTML').get_attribute('innerHTML')

        #scrape images
        img_count = self.save_images(img_save_dir, username)

        # NOTE(review): the trailing comma makes `dtime` a 1-tuple, so the
        # stored 'time' field is a tuple, not a string — likely unintended;
        # confirm against the mongo schema before changing.
        dtime = datetime.now().strftime('%Y%m%d_%H%M%S'),

        #package it all up
        return{                                 \
            '_id': username,                    \
            'html': html,                       \
            'img_count': img_count,             \
            'questions': questions,             \
            'metadata':{                        \
                'time': dtime,                  \
                'scraper': self.name,           \
                'scraper_version': self.version \
            }                                   \
        }


    def answer_question_overlay(self, importance_answer=1, wait=0.1):
        """
        Answer the currently open question overlay with a uniformly random
        choice, accept only that same choice from others, set the given
        importance, submit, and return the answered question's data dict.
        """
        overlay = self.driver.find_element_by_class_name('questionspage')

        #get button arrays
        our_answer_buttons = overlay.find_element_by_class_name('pickonebutton-buttons')\
            .find_elements_by_class_name('pickonebutton-button')
        their_answer_buttons = overlay.find_element_by_class_name('pickmanybuttons')\
            .find_elements_by_tag_name('button')
        importance_buttons = overlay.find_element_by_class_name('importance-pickonebutton')\
            .find_elements_by_tag_name('button')

        #get data to store
        text = overlay.find_element_by_tag_name('h1').text
        choices = [b.text for b in our_answer_buttons]
        answer = int(np.random.uniform() * len(choices))
        acceptable_arr = [False]*len(choices)
        acceptable_arr[answer] = True

        #click the appropriate buttons
        our_answer_buttons[answer].click()
        time.sleep(wait)
        their_answer_buttons[answer].click()
        time.sleep(wait)
        importance_buttons[importance_answer].click()
        time.sleep(wait)

        #submit form
        self.driver.find_element_by_class_name('questionspage-buttons-button--answer')\
            .click()

        return{                             \
            'q_text': text,                 \
            'choices': choices,             \
            'our_answer': answer,           \
            'acceptable': acceptable_arr,   \
            'importance': importance_answer \
        }



    def answer_unanswered_questions(self, wait=1, importance_answer=1):
        """
        Answer every question in the 'FIND OUT' filter via the overlay,
        backing off by 0.1s on element misses; return the answered data.
        """
        qdata = []
        while self.get_num_questions_by_filter('FIND OUT') > 0:
            try:
                self.driver.find_element_by_class_name('profile-questions-filter-icon--findOut')\
                    .click()
                time.sleep(wait)

                self.driver.find_element_by_class_name('profile-question')\
                    .click()
                time.sleep(wait)

                qdata.append(self.answer_question_overlay(importance_answer))
                time.sleep(wait)

            except NoSuchElementException:
                wait += 0.1
                time.sleep(wait)
                continue

        return qdata


    def get_num_questions_by_filter(self, filterstr):
        """
        Return the question count shown next to `filterstr` in the
        profile-questions filter bar.
        """
        arr = self.driver.find_element_by_class_name('profile-questions-filters')\
            .text.split('\n')
        return int(arr[arr.index(filterstr)+1])


    def scroll_to_bottom(self, wait):
        """
        Press END repeatedly (sleeping `wait` between presses) until the
        page's vertical offset stops changing.
        """
        body = self.driver.find_element_by_tag_name('body')
        y = [0,1]
        while y[0] != y[1]:
            y[0] = y[1]
            body.send_keys(Keys.END)
            time.sleep(wait)
            y[1] = self.driver.execute_script('return window.pageYOffset;')


    def scrape_user_questions(self, username):
        """
        Return {'AGREE': [...], 'DISAGREE': [...]} of question HTML for the
        currently open user's questions page.
        """
        q=dict()
        for filterstr in ['AGREE', 'DISAGREE']:
            time.sleep(1)
            q[filterstr] = self.scrape_user_questions_by_filter(filterstr)
        return q


    def scrape_user_questions_by_filter(self, filterstr, wait=0.3):
        """
        Select the given question filter, scroll until all of its questions
        have loaded, and return each question's innerHTML.
        """
        self.driver.find_element_by_tag_name('body')\
            .send_keys(Keys.HOME)
        time.sleep(0.7+wait)
        self.driver.find_element_by_class_name(f'profile-questions-filter-icon--{filterstr.lower()}')\
            .click()
        time.sleep(0.7+wait)
        numQsToScrape = self.get_num_questions_by_filter(filterstr) 

        self.scroll_to_bottom(wait)

        questions = self.driver.find_elements_by_class_name('profile-question')
        while len(questions) != numQsToScrape:
            wait += 0.1
            self.scroll_to_bottom(wait)
            questions = self.driver.find_elements_by_class_name('profile-question')
        return [q.get_attribute('innerHTML') for q in questions]


    # NOTE(review): no `self` — used as map(Scraper.get_src, images); works in
    # Python 3 without @staticmethod.
    def get_src(img):
        """
        Return an <img> element's 'src' attribute, falling back to 'data-src'
        for lazily loaded images.
        """
        src = img.get_attribute('src')
        if src is None:
            src = img.get_attribute('data-src')
        return src


    def save_images(self, save_dir, username):
        """
        Download every profile-thumb and essay image on the current profile
        page into `save_dir` (created if missing), prefixing filenames with
        `username`; return the number of images found.
        """
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        images = self.driver.find_element_by_class_name('profile-thumb')\
            .find_elements_by_tag_name('img')
        images.extend(self.driver.find_element_by_class_name('profile-essays')\
            .find_elements_by_tag_name('img'))

        for url in map(Scraper.get_src,images):
            i = requests.get(url).content
            name = urlparse(url).path.split('/')[-1]
            with open(f'{save_dir}/{username}_{name}', 'wb') as f:
                f.write(i)
        return len(images)


    def answer_all_questions(self, importance_answer=1, wait=1):
        """
        Answer questions via the profile's answer flow until the
        no-questions marker appears or an unexpected error occurs; return
        (answered question data, exit status string).
        """
        self.driver.get('https://www.okcupid.com/profile')
        time.sleep(2*wait+np.random.exponential())
        self.driver.find_element_by_class_name('profile-selfview-questions-more')\
            .click()
        time.sleep(2*wait+np.random.exponential())

        qdata=[]
        self.driver.find_element_by_class_name('profile-questions-next-actions-button--answer')\
            .click()
        count=0

        while True:
            try:
                time.sleep(wait)
                qdatum = self.answer_question_overlay(importance_answer)
                qdata.append(qdatum)
                count += 1
            except NoSuchElementException:
                try:
                    self.driver.find_element_by_id('no-questions-blank-state')
                    exit_stat = 'reached end of questions.'
                    break
                except NoSuchElementException:
                    wait += 0.1
                    time.sleep(wait)
                    continue
            except Exception as e:
                exit_stat = f'Error: {str(e)}'
                break
        return (qdata, exit_stat)


    def answer_initial_question(self, wait=1):
        """
        Answer one onboarding-conversation question with a uniformly random
        choice (accepting the same choice from others) and return its data.
        """
        qtext = self.driver.find_element_by_class_name('convoanswers-text')\
            .text
        choicebuttons = self.driver\
            .find_element_by_class_name('convoanswers-answers')\
            .find_elements_by_tag_name('button')
        choicestext = [b.text for b in choicebuttons]
        answer = int(np.random.uniform() * len(choicestext))
        choicebuttons[answer]\
            .click()
        time.sleep(wait)

        choicebuttons = self.driver\
            .find_element_by_class_name('convoanswers--theirs')\
            .find_elements_by_tag_name('button')
        choicebuttons[answer]\
            .click()
        time.sleep(wait)
        acceptable = [False]*len(choicestext)
        acceptable[answer] = True

        self.driver.find_element_by_class_name('convoquestion-continue')\
            .click()
        return {                      \
            'q_text': qtext,          \
            'choices': choicestext,   \
            'our_answer': answer,     \
            'acceptable': acceptable, \
            'importance': 1           \
        }
        #TODO verify the assumed importance answer is right


    def answer_all_initial_questions(self, wait=1):
        """
        Answer the remaining onboarding questions, as counted by
        get_progress(); return the collected question data.
        """
        qdata = []
        current_q, num_qs = self.get_progress()
        for i in range(num_qs - current_q+1):
            qdata.append(self.answer_initial_question(wait))
            time.sleep(wait*2)
        return qdata


    def get_progress(self):
        """
        Return (current, total) parsed from the onboarding progress text
        ("X of Y").
        """
        return tuple(map(int, self.driver\
            .find_element_by_class_name('obqconvo-progress-text')\
            .text.split(' of ')))

    def save_usernames_to_mongo(self, usernames):
        """Insert each username into the `usernames` collection as its _id."""
        self.db.usernames.insert_many(map(lambda u: {'_id':u}, usernames))


    # NOTE(review): no `self` — called as Scraper.get_data_from_answer_stub(...);
    # works in Python 3 without @staticmethod.
    def get_data_from_answer_stub(stub):
        """
        Parse one profile-question stub element into a question-data dict
        (text, choices, our answer index, acceptability flags, importance).
        """
        text = stub.find_element_by_tag_name('h3').text
        answer_divs = stub.find_elements_by_class_name('profile-question-self-answers-answer')
        choices = [a.text for a in answer_divs]
        our_answer = [a.get_attribute('class').endswith('--isYourAnswer')\
                for a in answer_divs].index(True)
        acceptable = [not a.get_attribute('class').endswith('--isUnacceptable') for a in answer_divs]

        return {                       \
            'q_text':text,             \
            'choices': choices,        \
            'our_answer': our_answer,  \
            'acceptable': acceptable,  \
            'importance': 1            \
        } 
Exemplo n.º 10
0
from selenium.webdriver import Chrome

# mode headless

url = 'http://selenium.dunossauro.live/exercicio_01.html'
navegador = Chrome()
navegador.get(url)

# Grab the page heading and every paragraph element.
h1 = navegador.find_element_by_tag_name('h1').text
ps = navegador.find_elements_by_tag_name('p')

# Pair each paragraph's 'atributo' attribute with its visible text.
attrs = [p.get_attribute('atributo') for p in ps]
texts = [p.text for p in ps]

print({h1: dict(zip(attrs, texts))})
Exemplo n.º 11
0

# Helper: locate an anchor on the page by its visible text.
def find_a_by_content(browser, content):
    """Return the first <a> element whose text equals *content*, or None."""
    anchors = browser.find_elements_by_tag_name('a')
    return next((a for a in anchors if a.text == content), None)


browser = Chrome()
# NOTE(review): `url` is defined elsewhere in this file -- confirm it points
# at the first exercise page before running.
browser.get(url)

# Page 1: follow the first link inside <main>.
sleep(3)
main = browser.find_element_by_tag_name('main')
main.find_element_by_tag_name('a').click()

# Page 2: click the anchor whose 'attr' attribute equals 'errado'.
# NOTE(review): find_a_by_attr is defined elsewhere in this file.
sleep(3)
main = browser.find_element_by_tag_name('main')
find_a_by_attr(main, 'attr', 'errado').click()

# Page 3: wait out the countdown, refresh, then click the anchor whose text
# matches the page title.
sleep(10)
browser.refresh()
sleep(2)
main = browser.find_element_by_tag_name('main')
titulo = browser.title
find_a_by_content(main, browser.title).click()
Exemplo n.º 12
0
# Return to the start screen
# NOTE(review): `driver` is created elsewhere in this file.
back = driver.find_element_by_class_name("start")
back.click()

# Navigate to the picture-book screen
picbook = driver.find_element_by_class_name("picture_book")
picbook.click()

# Pick a specific creature in the picture book and open its page
#setumei = driver.find_element_by_xpath("//input[@value='1']").click()
setumei = driver.find_element_by_xpath(
    "//input[@src='../static/imgs/1.png']").click()

# Go back via the "start" button
back = driver.find_element_by_class_name("start")
back.click()

# Return to the start page through the first anchor
last = driver.find_element_by_tag_name("a")
last.click()

# Wait until the title is displayed
time.sleep(3)

# Take a screenshot
driver.save_screenshot('chrome_abunator_last.png')

# Close the browser
driver.quit()
Exemplo n.º 13
0
from selenium.webdriver import Chrome

browser = Chrome()

browser.get('https://selenium.dunossauro.live/aula_04_a.html')

lista_nao_ordenada = browser.find_element_by_tag_name('ul') # 1: fetch the <ul>


lis = browser.find_elements_by_tag_name('li') #2: fetch every <li>

lis[0].find_element_by_tag_name('a').text #3: text of the <a> in the first <li> (value discarded -- demo only)

""" 
1 . buscamos ul
2. buscamos todos li
3. no primeiro li, buscamos  `a` e pegamos o seu  texto

ul 
    li
        a
            texto
    li
        a
            texto
            
"""

Exemplo n.º 14
0
from selenium.webdriver import Chrome
from time import sleep

url = 'https://curso-python-selenium.netlify.app/exercicio_02.html#'

navegador = Chrome()
navegador.get(url)

sleep(3)

a = navegador.find_element_by_tag_name('a')
p = navegador.find_elements_by_tag_name('p')
a.click()
# p[1] == p[-1] holds exactly when the page has two paragraphs.
if p[1] == p[-1]:
    print(a.text)
    # BUG FIX: `p` is a list of elements and has no `.text` attribute -- the
    # original `print(p.text)` always raised AttributeError.  Print each
    # paragraph's text instead.
    for paragraph in p:
        print(paragraph.text)
Exemplo n.º 15
0
def scroll_page(driver: Chrome, height: str):
    """Scroll to the bottom of the page until its height stops growing.

    Presses END on <body>, waits for lazily-loaded content, and repeats while
    ``document.body.scrollHeight`` keeps changing.

    Rewritten iteratively: the original recursed once per scroll step and
    could hit Python's recursion limit on very long / infinite-scroll pages.

    :param driver: active Chrome driver.
    :param height: last known scroll height to compare against the live value.
    """
    while True:
        driver.find_element_by_tag_name("body").send_keys(Keys.END)
        sleep(3)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if height == new_height:
            break
        height = new_height
Exemplo n.º 16
0
def test_input(browser: Chrome, enable_percy=False):
    """Test the input widgets::

        run template.basic_input()
        actions(['Continue'])
        template.background_input() # or: await template.coro_background_input() / flask_coro_background_input

    Drives every form the template emits, strictly in order -- each submit
    below must line up with the template's question sequence.  When
    *enable_percy* is true, Percy snapshots are taken at key states.
    """
    browser.find_element_by_css_selector('input').send_keys("22")
    browser.find_element_by_tag_name('form').submit()

    time.sleep(0.5)
    browser.find_element_by_css_selector('input').send_keys("secret")
    browser.find_element_by_tag_name('form').submit()

    time.sleep(0.5)
    browser.find_element_by_tag_name('form').submit()

    # checkbox -- clicked via JS because a direct .click() can be intercepted
    time.sleep(0.5)
    browser.execute_script("arguments[0].click();",
                           browser.find_element_by_css_selector('input'))
    browser.find_element_by_tag_name('form').submit()

    # Text Area
    time.sleep(0.5)
    browser.find_element_by_css_selector('textarea').send_keys(" ".join(
        str(i) for i in range(20)))
    browser.find_element_by_tag_name('form').submit()

    # file -- send_keys with a local path fills the file input
    time.sleep(0.5)
    img_path = path.join(here_dir, 'assets', 'img.png')
    browser.find_element_by_css_selector('input').send_keys(img_path)
    browser.find_element_by_tag_name('form').submit()

    # text
    time.sleep(0.5)
    browser.find_element_by_css_selector('input').send_keys("text")
    browser.find_element_by_tag_name('form').submit()

    # cancel the form
    time.sleep(0.5)
    browser.execute_script(
        "arguments[0].click();",
        browser.find_element_by_css_selector('.pywebio_cancel_btn'))

    # valid func, age in [10, 60]: try a too-small value, a too-large value,
    # then an accepted one
    time.sleep(0.5)
    browser.find_element_by_css_selector('input').send_keys("1")
    browser.find_element_by_tag_name('form').submit()
    time.sleep(0.5)
    browser.find_element_by_css_selector('input').clear()
    browser.find_element_by_css_selector('input').send_keys("90")
    browser.find_element_by_tag_name('form').submit()
    time.sleep(0.5)
    browser.find_element_by_css_selector('input').clear()
    browser.find_element_by_css_selector('input').send_keys("23")
    browser.find_element_by_tag_name('form').submit()

    # code
    time.sleep(0.5)
    # browser.find_element_by_css_selector('textarea').send_keys(" ".join(str(i) for i in range(20)))
    browser.find_element_by_tag_name('form').submit()

    # Cancelable from group: first submit with an invalid age, snapshot, then fix
    time.sleep(0.5)
    browser.find_element_by_name('name').send_keys("name")
    time.sleep(1)
    browser.find_element_by_name('age').send_keys("90")
    browser.find_element_by_tag_name('form').submit()
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(0.5)
    enable_percy and percySnapshot(browser=browser, name='input group invalid')

    time.sleep(0.5)
    browser.find_element_by_name('age').clear()
    browser.find_element_by_name('age').send_keys("23")
    browser.find_element_by_tag_name('form').submit()

    # callback actions
    time.sleep(0.5)
    browser.execute_script(
        "arguments[0].click();",
        browser.find_element_by_css_selector('form button[type="button"]'))
    time.sleep(0.4)

    # input action
    time.sleep(0.5)
    browser.execute_script(
        "arguments[0].click();",
        browser.find_element_by_css_selector('form button[type="button"]'))
    time.sleep(0.4)
    browser.find_element_by_tag_name('form').submit()

    # Input group: fill one widget of each kind, then submit
    time.sleep(0.5)
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(0.5)
    enable_percy and percySnapshot(browser=browser, name='input group all')
    browser.find_element_by_name('text').send_keys("name")
    browser.find_element_by_name('number').send_keys("20")
    browser.find_element_by_name('float').send_keys("3.1415")
    browser.find_element_by_name('password').send_keys("password")
    browser.find_element_by_name('textarea').send_keys(" ".join(
        str(i) for i in range(20)))
    # browser.find_element_by_css_selector('[name="code"]').send_keys(" ".join(str(i) for i in range(10)))
    Select(browser.find_element_by_name('select-multiple')).select_by_index(0)
    # browser. find_element_by_css_selector('[name="select"]'). send_keys("name")
    # browser. find_element_by_css_selector('[name="checkbox-inline"]'). send_keys("name")
    # browser. find_element_by_css_selector('[name="checkbox"]'). send_keys("name")
    # browser. find_element_by_css_selector('[name="radio-inline"]'). send_keys("name")
    # browser. find_element_by_css_selector('[name="radio"]'). send_keys("name")
    browser.find_element_by_name('file_upload').send_keys(
        path.join(here_dir, 'assets', 'helloworld.txt'))

    browser.execute_script("$('form button').eq(1).click()")
    time.sleep(1)
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(0.5)
    enable_percy and percySnapshot(browser=browser,
                                   name='input group all invalid')

    browser.find_element_by_name('password').clear()
    browser.find_element_by_name('password').send_keys("123")
    browser.execute_script("$('form button').eq(1).click()")
    time.sleep(1)
    browser.execute_script(
        '$("html, body").scrollTop( $(document).height()+100);')
    time.sleep(1)
    enable_percy and percySnapshot(browser=browser,
                                   name='input group all submit')

    browser.find_element_by_css_selector('form').submit()

    # background
    time.sleep(3)
    get_visible_form(browser).find_element_by_css_selector('input').send_keys(
        "background")
    get_visible_form(browser).find_element_by_tag_name('form').submit()
    # front
    time.sleep(0.5)
    get_visible_form(browser).find_element_by_css_selector('input').send_keys(
        "front")
    get_visible_form(browser).find_element_by_tag_name('form').submit()
def son():
    # Crawls YouTube for "KBO 레전드" videos, saving titles, embed links and
    # the top comments of each video as timestamped JSON files.
    # NOTE(review): the heading comment below sits at column 0, which
    # suggests this body may once have been module-level script code --
    # confirm the indentation under this function is intended.
    print('월클손흥민')
## Crawl video links and titles for YouTube playback ##
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome('./chromedriver.exe', options=chrome_options)
    driver.get("https://www.youtube.com/")
    time.sleep(1)


    driver.find_element_by_xpath('//*[@id="search"]')
    driver.find_element_by_xpath('//*[@id="search"]').send_keys('KBO 레전드')
    driver.find_element_by_xpath('//*[@id="search"]').send_keys(Keys.ENTER)
    time.sleep(1)

    url = driver.current_url
    # url = "https://www.youtube.com/results?search_query=KBO+%EB%A0%88%EC%A0%84%EB%93%9C"
    # print(url)
    response = urllib.request.urlopen(url)
    soup = BeautifulSoup(response, 'lxml')
    # print(response)

    results = soup.select('h3 > a')
    # print(type(results))
    result = results[0:10]
    # print(results)
    kbo_title=[]
    kbo_link=[]
    kbo_commentlink=[]  ## watch-page paths kept for the comment-crawling pass
    for video in result:
        # print(video)
        # link = video.attrs['href'].replace('/watch?v=','/embed/')
        link = video.attrs['href']  # crawl the link
        title = video.attrs['title'] # crawl the title
        # print(link, title)
        kbo_commentlink.append(link)
        kbo_link.append(link.replace('/watch?v=','/embed/'))
        kbo_title.append(title)
    # print(kbo_link)
    # print(kbo_commentlink)    
    driver.close()

    with open('./kbo_title_'+ str(time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))) +'.json','wt') as f:
        json.dump(kbo_title,f)    
    with open('./kbo_link_'+ str(time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))) +'.json','wt') as f:
        json.dump(kbo_link,f)    


### Crawl YouTube comments ###
    kbo_comments=[]
    for li in kbo_commentlink:
        delay = 3
        browser = Chrome()
        browser.implicitly_wait(delay)
        start_url="https://www.youtube.com" + li
        browser.get(start_url)
        browser.maximize_window()
        print(start_url)
        time.sleep(3)
        body = browser.find_element_by_tag_name('body')

        pagedowns = 2  # scroll down twice so comments lazy-load
        while pagedowns:
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(2)
            pagedowns -= 1
        time.sleep(3)    
        # print('@@@@@@@여기까지 정상')


        html0 = browser.page_source
        soup = BeautifulSoup(html0, 'lxml')

        comment_list = soup.find_all('yt-formatted-string', id='content-text', limit=5)
        comments=[]
        # NOTE(review): the loop variable `list` shadows the builtin --
        # left unchanged in this documentation-only pass.
        for list in comment_list:
            comment = list.text
            comments.append(comment) # first-level list: comments for one video
        print(comments,'@@@1차리스트@@')
        kbo_comments.append(comments) # second-level list: all videos combined
        browser.close()
        
    print(kbo_comments, '####2차리스트####')


    with open('./kbo_comments_'+ str(time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))) +'.json','wt') as f:
        json.dump(kbo_comments,f)