Example #1
 def reset(self):
     self.counter = 0
     for bot in self.bots_list:
         webdriver = bot.state.form_element.parent  # a WebElement's .parent is the WebDriver that located it
         webdriver.close()
     self.bots_list = []
     self.create_bots()
Example #2
def RespCode(domain):
    webdriver = Chrome()
    domain = "http://www." + str(domain)
    response = webdriver.request('GET', domain)
    response = str(response)[11:][:-2]
    webdriver.close()
    return response
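A note on Example #2: slicing str(response) to recover "200" from "<Response [200]>" is brittle. A less fragile sketch, assuming the selenium-requests package (whose Chrome driver's request() returns a standard requests.Response):

from seleniumrequests import Chrome

def resp_code(domain):
    driver = Chrome()
    try:
        response = driver.request('GET', 'http://www.' + str(domain))
        return response.status_code  # the integer status code directly, no string slicing
    finally:
        driver.quit()  # quit() tears down the whole session, not just the window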
Example #3
def menu():
    menu = {}
    menu['1'] = "\033[0;33mWatch Instagram Stories\033[m"
    menu['2'] = "\033[0;33mLike Hashtagged posts\033[m"
    menu['3'] = "\033[0;33mEdit hashtag list\033[m"
    menu['4'] = "\033[0;33mExit\033[m"
    while True:
        options = menu.keys()
        for entry in options:
            print(entry, menu[entry])

        selection = str(input("What would you like to do? "))
        if selection == '1':
            os.system('clear')
            watchstories()
        elif selection == '2':
            os.system('clear')
            likes()
        elif selection == '3':
            os.system('clear')
            hashtag_menu()
        elif selection == '4':
            os.system('clear')
            webdriver.close()  # assumes a module-level webdriver instance
            sys.exit()
        else:
            # no recursive menu() call needed; the while loop re-prompts
            print("\nYou have to choose an option between 1 and 4. \n")
Example #4
    def scrape(self):

        try:
            print(
                "scraping www.cnn.com , please wait as it might take a while :)"
            )

            webdriver.get('https://edition.cnn.com/')
            element = WebDriverWait(webdriver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "cn__title")))  # no trailing space; By.CLASS_NAME takes a single class name
            source = webdriver.page_source
            soup = BeautifulSoup(source, 'html.parser')

            article_links = []
            for ul in soup.find_all('ul'):
                if (
                        ul.h2 and ul.h2.string and 'Top' in ul.h2.string
                ):  # the ul has an h2 child whose text contains 'Top'; h2.string can be None, hence the extra guard
                    for li in ul.find_all('li'):
                        article_links.append(li.find('a').get('href'))

            self.scrape_from_articles(article_links)

        except TimeoutException as exception:
            print('timed out, check internet connection and try again')

        finally:
            print('done')
            webdriver.close()
            return ''  # note: returning from finally suppresses any in-flight exception
Example #5
def scrape_page(webdriver, links, username):
        '''This function will go to all links provided
        and scrape each picture for the number of likes
        and the caption. If the link is a video no information is recorded. 
        The function will only save the caption if the title is the 
        identified user
        
        Parameters: the active webdriver, a set of picture links, 
        the username of the page you are scraping

        Returns: a list of lists with the number of likes and caption
        '''
        picture_info = []

        for link in links:
                # Open new tab
                webdriver.execute_script("window.open('');")
                time.sleep(3)

                # Switch to the new window
                webdriver.switch_to.window(webdriver.window_handles[1])
                webdriver.get(link)
                time.sleep(5)
                try:
                        likes_list = webdriver.find_elements_by_class_name('zV_Nj')

                        if len(likes_list) != 0: #If the length is 0, then it is a video
                                
                                if len(likes_list) == 1: #No common friends liked the photo
                                        num_likes = webdriver.find_elements_by_class_name('Nm9Fw')[0].text.split(' ')[0]
                                
                                else:
                                        num_likes = int(likes_list[1].text.split(' ')[0]) + 1

                                try:
                                        title = webdriver.find_element_by_class_name('_6lAjh').text
                                        if title == username:
                                                caption_list = webdriver.find_elements_by_xpath("//div[@class='C4VMK']//span")
                                                
                                                '''This code works but is inactive since the information was not used
                                                num_of_comments = len(caption_list)'''
                                                
                                                caption = caption_list[0].text
                                        else:
                                                caption = None #if the user was not the title
                                except:
                                        caption = None #photo does not have a caption or any comments
                                        

                                picture_info.append([num_likes, caption])
                except:
                        pass
                webdriver.close()
                
                # Switch focus back to main tab
                webdriver.switch_to.window(webdriver.window_handles[0])    
                time.sleep(5)        
       
        return picture_info
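The tab-juggling pattern above, distilled into a minimal sketch (the driver setup and URL are illustrative, not from the original):

from selenium import webdriver as wd

driver = wd.Chrome()
driver.execute_script("window.open('');")          # open a blank tab
driver.switch_to.window(driver.window_handles[1])  # focus the new tab
driver.get('https://example.com')                  # load the target page
driver.close()                                     # close only that tab
driver.switch_to.window(driver.window_handles[0])  # return to the main tab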
Example #6
def clean_up(head=None, webdriver=None):
    print("Cleaning up...")

    if head:
        print("Restoring original head (%s)" % head)
        head.checkout()

    if webdriver:
        print("Closing browser")
        webdriver.close()


def take_screenshot(url):
    try:
        webdriver = selenium.webdriver.PhantomJS('vendor/phantomjs/bin/phantomjs')
        webdriver.get(url)
        webdriver.set_window_size(1280, 800)
        imagedata = webdriver.get_screenshot_as_base64()
        webdriver.close()
        webdriver.quit()
        return imagedata  # was computed but never returned
    except Exception:
        raise
Example #8
def wait_for_content(webdriver, id, timer=15.0, poll_frequency=0.5):
    """ (WebDriver object, str[, float, float]) -> NoneType
    Wait for the page to load the content with the id passed as the function's argument.
    Stop the script if time runs out.
    """
    try:
        WebDriverWait(webdriver, timer, poll_frequency).until(EC.presence_of_element_located((By.ID, id)))
    except TimeoutException:
        print('Content on the page has not been loaded... Stopping script execution.')
        webdriver.close()
        sys.exit()
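A minimal usage sketch for this helper; the URL and element id here are hypothetical:

from selenium import webdriver as wd

driver = wd.Chrome()
driver.get('https://example.com/players')            # hypothetical page
wait_for_content(driver, 'player-data', timer=10.0)  # continue only once #player-data exists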
Example #9
 def make_decorator():
     print('Decoration starting')
     try:
         func()
         stepresult = 'Pass'
     except Exception:
         stepresult = 'Error'
     finally:
         print('Decoration finished')
         print(stepresult)
         webdriver.close()
Example #10
def main():
    webdriver = setup_webdriver()
    evaluate_trade(webdriver, 'Alteration', buyAlteration, sellAlteration)
    evaluate_trade(webdriver, 'Fusing', buyFusing, sellFusing)
    evaluate_trade(webdriver, 'Scouring', buyScouring, sellScouring)
    evaluate_trade(webdriver, 'Vaal', buyVaal, sellVaal)
    evaluate_trade(webdriver, 'Regret', buyRegret, sellRegret)
    evaluate_trade(webdriver, 'Chiesel', buyChiesel, sellChiesel)
    evaluate_trade(webdriver, 'Alchemy', buyAlchemy, sellAlchemy)
    evaluate_trade(webdriver, 'Jeweller', buyJeweller, sellJeweller)
    webdriver.close()
Example #11
def session_end(driver, sessions_to_do, file, questions, answers):
    sessions_to_do = sessions_to_do - 1
    if sessions_to_do == 0:
        write_words(questions, answers, file)
        driver.close()  # the function receives the driver as 'driver', not 'webdriver'
        exit()
    print(str(sessions_to_do) + " sessions left\n")
    driver.find_element(By.ID, "return_mainpage").click()
    WebDriverWait(driver, 8).until(
        ec.presence_of_element_located((By.CLASS_NAME, "btn-session")))
    driver.find_element(By.CLASS_NAME, "btn-session").click()
    return sessions_to_do
Example #12
def timerlock():
    # =====================================
    speak("For how many seconds?")
    seconds = int(takeCommand().lower())
    speak("timer set..")
    for i in range(seconds):
        print(str(seconds - i) + " remaining")

        time.sleep(1)
    # ==========================================

    subprocess.Popen("rundll32.exe user32.dll,LockWorkStation")
    webdriver.close()  # assumes a module-level webdriver instance
    exit()
Example #13
def cleanUpAfterError(error=None, webdriver=None):
    """
    Overview
        例外エラー後の一連処理を行う。
    Args
        webdriver: 終了させるwebdriverインスタンス。未起動時は引数なし
    Return
        なし
    """
    if webdriver is not None:
        webdriver.close()
    if error is not None:
        logger.exception(str(getCurLineNo()) + ' ' + str(error))
    sys.exit()
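A sketch of how such a cleanup helper might be invoked; the Chrome setup and URL are assumptions:

from selenium.webdriver import Chrome

driver = None
try:
    driver = Chrome()
    driver.get('https://example.com')
except Exception as e:
    cleanUpAfterError(error=e, webdriver=driver)  # handles driver=None if Chrome() itself failed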
Example #14
    def do_testharness(self, webdriver, url, timeout):
        format_map = {
            "abs_url": url,
            "url": strip_server(url),
            "window_id": self.window_id,
            "timeout_multiplier": self.timeout_multiplier,
            "timeout": timeout * 1000
        }

        parent = webdriver.current_window_handle
        handles = [item for item in webdriver.window_handles if item != parent]
        for handle in handles:
            try:
                webdriver.switch_to_window(handle)
                webdriver.close()
            except exceptions.NoSuchWindowException:
                pass
        webdriver.switch_to_window(parent)

        webdriver.execute_script(self.script % format_map)
        try:
            # Try this, it's in Level 1 but nothing supports it yet
            win_s = webdriver.execute_script("return window['%s'];" %
                                             self.window_id)
            win_obj = json.loads(win_s)
            test_window = win_obj["window-fcc6-11e5-b4f8-330a88ab9d7f"]
        except Exception:
            after = webdriver.window_handles
            if len(after) == 2:
                test_window = next(iter(set(after) - set([parent])))
            elif after[0] == parent and len(after) > 2:
                # Hope the first one here is the test window
                test_window = after[1]
            else:
                raise Exception("unable to find test window")
        assert test_window != parent

        handler = CallbackHandler(webdriver, test_window, self.logger)
        while True:
            result = webdriver.execute_async_script(self.script_resume %
                                                    format_map)
            done, rv = handler(result)
            if done:
                break
        return rv
Example #15
    def do_testharness(self, webdriver, url, timeout):
        format_map = {"abs_url": url,
                      "url": strip_server(url),
                      "window_id": self.window_id,
                      "timeout_multiplier": self.timeout_multiplier,
                      "timeout": timeout * 1000}

        parent = webdriver.current_window_handle
        handles = [item for item in webdriver.window_handles if item != parent]
        for handle in handles:
            try:
                webdriver.switch_to_window(handle)
                webdriver.close()
            except exceptions.NoSuchWindowException:
                pass
        webdriver.switch_to_window(parent)

        webdriver.execute_script(self.script % format_map)
        try:
            # Try this, it's in Level 1 but nothing supports it yet
            win_s = webdriver.execute_script("return window['%s'];" % self.window_id)
            win_obj = json.loads(win_s)
            test_window = win_obj["window-fcc6-11e5-b4f8-330a88ab9d7f"]
        except Exception:
            after = webdriver.window_handles
            if len(after) == 2:
                test_window = next(iter(set(after) - set([parent])))
            elif after[0] == parent and len(after) > 2:
                # Hope the first one here is the test window
                test_window = after[1]
            else:
                raise Exception("unable to find test window")
        assert test_window != parent

        handler = CallbackHandler(webdriver, test_window, self.logger)
        while True:
            result = webdriver.execute_async_script(
                self.script_resume % format_map)
            done, rv = handler(result)
            if done:
                break
        return rv
Example #16
def create_user(first_name,
                last_name,
                email,
                user_name,
                type,
                profile,
                role=''):
    browser.get(baseurl)
    try:
        browser.implicitly_wait(5)
        open_new_record()
    except Exception:
        print("Unexpected error 1:", sys.exc_info()[0])
        # Wait for page to load.
        browser.implicitly_wait(10)
        browser.find_element_by_name("new").click()

    try:
        fill_out_form(first_name,
                      last_name,
                      email,
                      user_name,
                      type,
                      profile,
                      role='')
    except Exception:
        print("Unexpected error 2:", sys.exc_info()[0])

    try:
        browser.implicitly_wait(15)
        displayed = browser.find_element_by_id('errorDiv_ep').is_displayed()
        print(displayed)
        if displayed:
            the_error = browser.find_element_by_class_name('errorMsg').text
            print('Oh no there is an error! \n')
            print(the_error)
        else:
            print('We are good!')
            browser.close()  # 'browser' is the driver instance; the webdriver module has no close()
    except common.exceptions.NoSuchElementException as e:
        print('3', e)
Example #17
def get_table_rows(webdriver, csvwriter):
    """ (WebDriver object, _csv.writer object) -> NoneType
    Search the page content for the 'player-data' table. Collect the table's rows and write them to the csv file.
    """
    global PAGE_COUNT
    try:
        soup = BeautifulSoup(webdriver.page_source, 'html.parser')
    except AttributeError:
        print('Was not able to parse content of the page. Stopping script execution.')
        webdriver.close()
        sys.exit()
    salary_table = soup.find('table', {'id': 'player-data'})
    rows = salary_table.find('tbody').findAll('tr')
    for row in rows:
        row_data = []
        for cell in row.findAll('td'):
            row_data.append(cell.get_text().strip())
        csvwriter.writerow(row_data)
    print('Page %s is collected.' % PAGE_COUNT)
    PAGE_COUNT += 1
    return
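A minimal loop that could drive this helper, assuming a hypothetical salary-listing URL and the wait_for_content helper from Example #8:

import csv
from selenium import webdriver as wd

driver = wd.Chrome()
driver.get('https://example.com/salaries')  # hypothetical URL
wait_for_content(driver, 'player-data')     # wait until the table is present
with open('salaries.csv', 'w', newline='') as f:
    get_table_rows(driver, csv.writer(f))
driver.close()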
Example #18
def report():
    print("Script finished!")
    time_end = datetime.datetime.now()
    sleep(1)
    print("Script Started at: {}".format(
        time_start.strftime("%Y-%m-%d %H:%M:%S")))
    sleep(1)
    print("Script Ended at: {}".format(time_end.strftime("%Y-%m-%d %H:%M:%S")))
    sleep(1)
    print("Total Stories watched from {}'s followers: {}".format(
        limits.user_watch_followers_stories, stories_bf_watched))
    sleep(1)
    print("Total Stories watched from feed: {}".format(total_stories_watched))
    sleep(1)
    print("Total suggestions followed: {}".format(total_suggestion_followed))
    sleep(1)
    print("Total number of likes: {}".format(total_likes))
    sleep(1)
    print("Total number of comments: {}".format(total_comments))
    sleep(1)
    print("closing browser...")
    sleep(1)
    webdriver.close()
Example #19
def open_website_and_quit(website, browser, webdriver):
    sleep(3)
    try:
        webdriver.get(website)
        sleep(30)
        webdriver.close()
        sleep(2)
        browser.terminate()
    except TimeoutException as e:
        logging.warning(f"TIMEOUT {website}")
        sleep(90)
        webdriver.close()
        browser.kill()
    except Exception as e:
        webdriver.close()
        browser.kill()
        raise e
Example #20
import time
from selenium import webdriver

n = 0
while n < 30:
    driver = webdriver.Chrome(executable_path="D:/appium/chromedriver.exe")
    driver.get("https://www.youtube.com/watch?v=Q-sgJ3xMJmg")
    time.sleep(10)
    driver.close()  # close each instance; the original `if n > 30: webdriver.close()` was unreachable and leaked every browser
    n = n + 1
Example #21
 def cleanup_webdriver(self, webdriver=None):
     try:
         webdriver.close()  # close the current window first...
         webdriver.quit()   # ...then quit; the original quit-then-close order made close() always fail
     except Exception:
         pass
Example #22
def quit_driver(webdriver):
    webdriver.close()
    webdriver.quit()
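Many of the examples on this page pair close() with quit(), and the distinction matters: close() closes only the current window, while quit() ends the WebDriver session and shuts the browser down. For final teardown, quit() alone is sufficient:

from selenium import webdriver as wd

driver = wd.Chrome()
driver.get('https://example.com')
driver.quit()  # ends the session and closes every window it opened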
Example #23
def user7():
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from time import sleep
    from user8 import user8

    # Open the Chrome browser and go to the Instagram login page.
    driverPath = r'D:\Programming\Python Projects\Personal Projects\chromedriver.exe'  # raw string for the Windows path
    webdriver = webdriver.Chrome(executable_path=driverPath)  # shadows the module name; later webdriver.* calls hit the instance
    webdriver.get(
        'https://www.instagram.com/accounts/login/?source=auth_switcher')
    sleep(2)

    # Holds the user's username and password that will be used.
    username = ''
    password = ''

    # Find the username and password elements and fill them in with the user's login.
    userElement = webdriver.find_element_by_name('username')
    userElement.send_keys(username)
    passElement = webdriver.find_element_by_name('password')
    passElement.send_keys(password)

    # Find the login button element and click it to log in to the user's account.
    login = webdriver.find_element_by_css_selector(
        '#react-root > section > main > div > article > div > div:nth-child(1) > div > form > div:nth-child(4) > button > div'
    )
    login.click()
    sleep(3)

    # Find the "Not Now" element of the notification popup and click it to make it go away.
    notNow = webdriver.find_element_by_css_selector(
        'body > div.RnEpo.Yx5HN > div > div > div.mt3GC > button.aOOlW.HoLwm')
    notNow.click()
    sleep(1)

    # Direct the browser to Blanson's Chick-fil-a page.
    webdriver.get('https://www.instagram.com/p/B2epau2FUiI/')
    sleep(1)

    # Find the comment box on the page and click on it.
    commentBox = webdriver.find_element_by_css_selector(
        '#react-root > section > main > div > div > article > div.eo2As > section.sH9wk._JgwE > div > form > textarea'
    )
    commentBox.click()

    # This will be the comment that will be posted.
    comment = 'Blanson Bold! Blanson Gold!'

    # Comment infinitely (Instagram will inevitably block it, however).
    while True:
        # Find the comment box again to let the program know we are working with it again.
        commentBox = webdriver.find_element_by_css_selector(
            '#react-root > section > main > div > div > article > div.eo2As > section.sH9wk._JgwE > div > form > textarea'
        )
        # Input the comment in the comment box.
        commentBox.send_keys(comment)
        # Enter to post the comment.
        commentBox.send_keys(Keys.ENTER)
        sleep(.5)
        # Try to scan the page for the popup that blocks the user from commenting.
        try:
            webdriver.find_element_by_css_selector(
                'body > div.Z2m7o > div > div > div > p')
            sleep(2)
            # If it gets to this point then it has blocked the user, and we close this browser.
            webdriver.close()
            # Call the next function to start another browser and user.
            user8()
        # If the popup is not there, the lookup raises an exception and the loop continues normally.
        except Exception:
            pass
        # Wait 7 seconds to give the comment time to be uploaded.
        sleep(7)
Example #24
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome("C://chromedriver.exe")
driver.get("http://www.google.ru")
search_box = driver.find_element_by_name("q")
search_box.send_keys("kek")
time.sleep(5)
driver.close()
Example #25
def exchange_walutomat(username, password, transaction_type, first_currency, second_currency, amount, rate):
    webdriver.implicitly_wait(10)  # 'webdriver' here is assumed to be a module-level driver instance
    webdriver.get('https://panel.walutomat.pl/moj-walutomat')
    webdriver.find_element_by_id('username').send_keys(username)
    webdriver.find_element_by_id('password').send_keys(password)
    webdriver.find_element_by_class_name('bem-button__inner-text').click()
    time.sleep(5)
    webdriver.get('https://user.walutomat.pl/#/order-placement')
    element = webdriver.find_element_by_id('order-volume')
    element.clear()
    element.send_keys(str(amount))     #send amount
    time.sleep(3)


    #TODO: choose transaction type from a dropdown menu. Buy is by default.
    '''            
    webdriver.find_element_by_id('order-type').click()       #click on buy/sell
    time.sleep(2)
    
    # element from a dropdown menu is wrongly selected. To be fixed 
    if transaction_type == 'buy':       #choose buy/sell
        webdriver.find_element_by_class_name('select2-results__option select2-results__option--highlighted')
    elif transaction_type == 'sell':
        webdriver.find_element_by_link_text('Chcę sprzedać')
    '''

    #TODO: find a way to select element for a different currencies. USD/PLN is by default.
    # element selector from a dropdown menu doesn't work
    '''
    element.send_keys(Keys.TAB, Keys.SPACE)      #click to choose first currency
    time.sleep(2)
    webdriver.find_element_by_class_name('icon-{}'.format(first_currency)).click()        #choose first currency
    time.sleep(2)
    webdriver.send_keys(Keys.TAB)      #click on second currency
    time.sleep(2)
    webdriver.send_keys(Keys.SPACE)
    webdriver.find_element_by_class_name('icon-{}'.format(second_currency)).click()     #choose second currency
    time.sleep(2)
    webdriver.find_element_by_id('price-type-fixed').click()       #choose custom exchange rate
    time.sleep(2)
    '''

    webdriver.find_element_by_id('order-at-price').send_keys(str(rate))     #send custom exchange rate
    time.sleep(3)
    webdriver.find_element_by_id('order-preliminary-submit').click()        #confirm transaction parameters
    time.sleep(3)
    element = webdriver.find_elements_by_class_name('content')
    podsumowanie = element[3].text.split('\n')
    podsumowanie = '{}, kurs {} {}\n{}\n'.format(' '.join(podsumowanie[1:3]), podsumowanie[4].lower(), podsumowanie[5], ' '.join(podsumowanie[6:8]))
    print(podsumowanie)
    confirmation = input('Do you confirm? ')
    if confirmation in ['T', 't', 'Tak', 'tak', 'Y', 'y', 'Yes', 'yes']:
        try:
            webdriver.find_element_by_id('confirm-exchange').click()
            print('Order has been placed.')
        except Exception:
            print('Something went wrong. Laaambaada!')
    else:
        print('Operation cancelled.')
    webdriver.close()
    return
Example #26
def close_up(request):
    webdriver = Firefox()

    user_link = request.user.profile.link

    # list needed for the reverse below
    reversed_list = []
    with open(f'files_of_users/links_of_books_{user_link}.txt', 'r', encoding='utf-8') as f:
        if not os.path.exists(f'files_of_users/list_of_books_{user_link}.txt'):
            open(f'files_of_users/list_of_books_{user_link}.txt', 'w', encoding='utf-8').close()
        with open(f'files_of_users/list_of_books_{user_link}.txt', 'r', encoding='utf-8') as d:
            list_of_books = d.read()
            # the reverse is needed because new books are added to the link list first, not last
            for link in f:
                reversed_list.append(link)
            for link in reversed(reversed_list):
                link = link.replace('\n', '')
                if link not in list_of_books:
                    r = webdriver.request('GET', link)
                    soup = BeautifulSoup(r.content, 'lxml')

                    overview = [link]
                  
                    book = soup.find('div', class_='block-border card-block')
                    author = []
                    if book.find('h2', class_='author-name unreg'):
                        authors = book.find('h2', class_='author-name unreg')
                        names = authors.find_all('a')    
                        for name in names:
                            author.append(name.text)
                        overview.append(author)
                    else:
                        author.append('Сборник')  # 'Сборник' ("Anthology"): no individual author listed
                        overview.append(author)
                    title = book.span.text
                    overview.append(title)
                    tags = book.find_all('a', class_='label-genre')
                    list_of_tags = []
                    for tag in tags:
                        if tag.text.startswith('№'):
                            tag = tag.text.split('в\xa0')[1]
                            list_of_tags.append(tag)
                        else:
                            list_of_tags.append(tag.text)
                    overview.append(list_of_tags)
                    cover = book.find('img', id='main-image-book')['src']
                    overview.append(cover)
                    if book.find('span', itemprop='ratingValue'):
                        rating = book.find('span', itemprop='ratingValue').text
                    else:
                        rating = 0
                    overview.append(rating)
                    description = book.p.text
                    overview.append(description)

                    data = []
                    if os.stat(f'files_of_users/list_of_books_{user_link}.txt').st_size != 0:
                        with open(f'files_of_users/list_of_books_{user_link}.txt', 'r') as f:
                            old = json.load(f)
                            for i in old:
                                data.append(i)

                    data.append(overview)
                    with open(f'files_of_users/list_of_books_{user_link}.txt', 'w') as f:
                        json.dump(data, f)

    webdriver.close()
    return render(request, 'liv/test.html')
Example #27
def fill_forms(email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket, failfile, furl):
    """Finds a newsletter form on the page. If not found, visits <num_links>
    internal links and scans those pages for a form. Submits the form if found.
    """
    # skipping: load the site
    # skipping: connecting to logger

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params):
        return

    # otherwise, scan more pages
    print("couldn't find form, going to click around")
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for i in range(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)

        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue

                link_text = link.text.lower()

                # skip links with blacklisted text
                blacklisted = False
                for bl_text in _LINK_TEXT_BLACKLIST:
                    if bl_text in link_text:
                        blacklisted = True
                        break
                if blacklisted:
                    continue

                # should we click this link?
                link_rank = 0
                for type, s, rank, flags in _LINK_TEXT_RANK:
                    if (type == _TYPE_TEXT
                            and s in link_text) or (type == _TYPE_HREF
                                                    and s in href):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL already matches too
                            if type == _TYPE_HREF and s in current_url:
                                continue

                        # link matches!
                        link_rank = rank
                        match_links.append(
                            (link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:  # good enough, stop looking
                    break
            except Exception:
                print("ERROR while looping through links...")
                sys.exit(1)

            # quit if too much time passed (for some reason, this is really slow...)
            if match_links and timeit.default_timer(
            ) - start_time > _LINK_MATCH_TIMEOUT:
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        match_links.sort(key=lambda l: l[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            print("clicking on link '%s' - %s" % (next_link[2], next_link[3]))
            next_link[0].click()
            time.sleep(_PAGE_LOAD_TIME)
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            # if browser_params['bot_mitigation']:
            #     bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params):
                return

            # should we stay on this page?
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer,
                                               visit_id, debug, browser_params,
                                               manager_params):
                            form_found_in_popup = True

                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)

                if form_found_in_popup:
                    return
        except Exception:
            pass

    # if you reach here, signup wasn't successful -- save the information
    with open(failfile, 'a') as wh:
        wh.write(furl + '\n')
Example #28
def _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                        browser_params, manager_params):
    """Finds and fills a form, and returns True if accomplished."""
    current_url = webdriver.current_url
    current_site_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    in_iframe = False

    # debug: save before/after screenshots and page source
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    # try to find newsletter form on landing page
    newsletter_form = _find_newsletter_form(webdriver)
    if newsletter_form is None:
        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name('iframe')
        for iframe in iframes:
            # switch to the iframe
            webdriver.switch_to_frame(iframe)

            # is there a form?
            newsletter_form = _find_newsletter_form(webdriver)
            if newsletter_form is not None:
                if debug:
                    dump_page_source(debug_page_source_initial, webdriver,
                                     _SRC_DUMP_PATH)
                in_iframe = True
                break  # form found, stay on the iframe

            # switch back
            webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, _SRC_DUMP_PATH)

    email = email_producer(current_url, current_site_title)
    user_info = _get_user_info(email)
    _form_fill_and_submit(newsletter_form, user_info, webdriver, False,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    print('submitted form on [%s] with email [%s]' % (current_url, email))
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)
    # if debug: save_screenshot(debug_form_post_initial, webdriver, browser_params, manager_params)

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        form_found_in_popup = False
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(webdriver)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_initial,
                                             webdriver, _SRC_DUMP_PATH)
                        _form_fill_and_submit(
                            follow_up_form, user_info, webdriver, True,
                            browser_params, manager_params,
                            debug_form_pre_followup if debug else None)
                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        # if debug: save_screenshot(debug_form_post_followup, webdriver, browser_params, manager_params)

                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        follow_up_form = _find_newsletter_form(webdriver)
        if follow_up_form is not None:
            if debug:
                dump_page_source(debug_page_source_initial, webdriver,
                                 _SRC_DUMP_PATH)
            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)
            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            # if debug: save_screenshot(debug_form_post_followup, webdriver, browser_params, manager_params)

    # switch back
    if in_iframe:
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
Example #29
def tearDownBrowser(browser, webdriver):
    print("-------------------------------------")
    print("Run " + browser + " Completed at :" + str(datetime.datetime.now()))
    webdriver.close()
    webdriver.quit()
Example #30
 def close_browser(cls, webdriver):
     '''Closes the browser after a delay. This method will be called in data_parser.py'''
     time.sleep(delay)  # WebDriverWait(webdriver, delay) alone never pauses; assumes a module-level delay and the time import
     webdriver.close()
def retrieve_image(search_query, webdriver, dir_name, img_name):

    try:

        logger.log("image_scraping function start")
        image_name = ''

        # Variable that holds the number of images to fetch
        number_of_images_to_fetch = 1
        index = 0

        # Scroll down the webpage to load more images
        scroll_down(webdriver)

        time.sleep(5)

        # Save all of the html image elements from our google search
        # 'rg_i' is the class name that the images have
        image_elements = webdriver.find_elements_by_class_name('rg_i')

        target_dir = basest_dir + "/" + dir_name

        # Check if the directory that we want to put our images in already exists
        if not os.path.exists(target_dir):

            # If not, make that directory
            os.mkdir(target_dir)

        found_image_count = 0
        attempt_count = 0
        logger.log("begin finding images")
        for element in image_elements:

            attempt_count += 1

            try:

                # Check if you've downloaded all the images you want
                if found_image_count == number_of_images_to_fetch:
                    break

                # Click on the image you want to download
                element.click()

                # Give the browser some time to catch up
                time.sleep(2)

                # After clicking on the image, get the larger version
                found_image = webdriver.find_element_by_class_name('n3VNCb')

                # find the source of the image, it's url
                image_url = found_image.get_attribute('src')

                logger.log("attempt " + str(attempt_count) + ": " +
                           image_url[0:10])

                # Make sure that the image url is a valid source
                if 'http' in image_url:

                    logger.log("successful image found")

                    # Download this image as a BytesIO object
                    image_file = io.BytesIO(requests.get(image_url).content)

                    # Convert our BytesIO object into an actual image
                    image = Image.open(image_file).convert('RGB')

                    # Create the name of the image we've downloaded
                    image_name = img_name + '.jpg'

                    logger.log(image_name)

                    # Save the path that we want to save the image to
                    # The directory will be the same name as the search query
                    image_path = target_dir + '/' + image_name

                    # Save the image
                    image.save(image_path, 'JPEG', quality=85)

                    found_image_count += 1

                # endif statement

            # end try block

            except Exception:
                logger.log("couldn't find enhanced images")

            # end except block

        # End for loop

        # close the web browser
        #webdriver.close()

        if attempt_count > 3:
            logger.log("multiple attempts: " + search_query + "<=======")

        else:
            logger.log(image_name)
        return image_name

    except Exception:
        logger.log("retrieve image crash")
        webdriver.close()
Example #32
def close_up(request):
    print('start close_up')
    webdriver = Firefox()

    userlink = request.user.profile.link

    # list for the reverse below
    ll = []
    with open(f'files_of_users/links_of_books_{userlink}.txt',
              'r',
              encoding='utf-8') as f:
        if not os.path.exists(f'files_of_users/list_of_books_{userlink}.txt'):
            open(f'files_of_users/list_of_books_{userlink}.txt',
                 'w',
                 encoding='utf-8').close()
        with open(f'files_of_users/list_of_books_{userlink}.txt',
                  'r',
                  encoding='utf-8') as d:
            list_of_books = d.read()
            # the reverse is needed because new books are added to the link list first, not last
            for link in f:
                ll.append(link)
            for link in reversed(ll):
                link = link.replace('\n', '')
                print('\n', link)
                if link not in list_of_books:
                    print('Processing', link)
                    # sleep to avoid the captcha
                    time.sleep(5)

                    r = webdriver.request('GET', link)
                    soup = BeautifulSoup(r.content, 'lxml')

                    # saved for error diagnosis
                    with open('files_of_users/current_book.txt',
                              'w',
                              encoding='utf-8') as f:
                        f.write(soup.prettify())

                    overview = [link]

                    book = soup.find('div', class_='block-border card-block')
                    author = []
                    if book.find('h2', class_='author-name unreg'):
                        authors = book.find('h2', class_='author-name unreg')
                        names = authors.find_all('a')
                        for name in names:
                            author.append(name.text)
                        overview.append(author)
                    else:
                        author.append('Сборник')  # 'Сборник' ("Anthology"): no individual author listed
                        overview.append(author)
                    title = book.span.text
                    overview.append(title)
                    tags = book.find_all('a', class_='label-genre')
                    list_of_tags = []
                    for tag in tags:
                        if tag.text.startswith('№'):
                            tag = tag.text.split('в\xa0')[1]
                            list_of_tags.append(tag)
                        else:
                            list_of_tags.append(tag.text)
                    overview.append(list_of_tags)
                    cover = book.find('img', id='main-image-book')['src']
                    overview.append(cover)
                    if book.find('span', itemprop='ratingValue'):
                        rating = book.find('span', itemprop='ratingValue').text
                    else:
                        rating = 0
                    overview.append(rating)
                    description = book.p.text
                    overview.append(description)

                    data = []
                    if os.stat(f'files_of_users/list_of_books_{userlink}.txt'
                               ).st_size != 0:
                        with open(
                                f'files_of_users/list_of_books_{userlink}.txt',
                                'r') as f:
                            old = json.load(f)
                            for i in old:
                                data.append(i)

                    data.append(overview)
                    with open(f'files_of_users/list_of_books_{userlink}.txt',
                              'w') as f:
                        json.dump(data, f)
                    print('Processed')

                else:
                    print('Already processed', link)

    webdriver.close()
    print('finish close_up')
    return render(request, 'liv/test.html')
Example #33
    def crawler(self):
        url = "https://www.instagram.com/explore/tags/무신사/"
        # list to hold the content found in posts
        tagList = []
        # page scroll counter
        pagedowns = 0
        # dict(hashtag, cnt)
        hashtag = {}
        # data for the Excel export
        feedList = []
        # return data
        returnList = {}
        # crawl result data
        crawlingList = {}
        # Chrome option settings
        # options = webdriver.ChromeOptions()
        # print(options)
        # # headless mode
        # options.add_argument('headless')
        # options.add_argument('window-size=1920x1080')
        # options.add_argument('disable-gpu')
        # # avoid headless detection: set the language and patch plugins so the browser does not look headless
        # options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        # options.add_argument("lang=ko_KR") # Korean!
        # print(options)
        # driver = webdriver.Chrome('chromedriver',chrome_options=options)
        # set up the navigator so it looks like a legitimate browser environment
        #driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
        # language
        #driver.execute_script("Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})")
        # disguise the rendering acceleration blocked above with fake values
        #driver.execute_script("const getParameter = WebGLRenderingContext.getParameter;WebGLRenderingContext.prototype.getParameter = function(parameter) {if (parameter === 37445) {return 'NVIDIA Corporation'} if (parameter === 37446) {return 'NVIDIA GeForce GTX 980 Ti OpenGL Engine';}return getParameter(parameter);};")
        # the browser launches and navigates to the given url

        # Firefox option settings
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.socks", "127.0.0.1")
        profile.set_preference("network.proxy.socks_port", 9150)  # was "network.proxy.type" again; 9150 is the Tor SOCKS port
        profile.set_preference('general.useragent.override', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0')
        profile.update_preferences()

        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")

        try:
            driver = webdriver.Firefox(executable_path='/crawler/repo/blog/geckodriver.exe',firefox_profile=profile,firefox_options=options)
        except WebDriverException:
            raise  # there is no driver to close if startup failed; webdriver.close() was a call on the module

        # script start time
        start = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")
        print(start)

        driver.get(url)
        # wait for web resources
        driver.implicitly_wait(1)
        # find the total post count element by its class name
        ttlFeed = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.CLASS_NAME,"g47SY")))
        print("Total posts:", ttlFeed.text)
        # find the body tag by tag name
        time.sleep(1)
        # click the first post on the page
        driver.find_elements_by_class_name("eLAPa")[0].click()
        # failCnt
        failCnt = 0
        count = self.count
        # start scraping the data
        while pagedowns < count:
            # wait after calling the page
            #driver.implicitly_wait(5)

            # post body
            try:
                post = WebDriverWait(driver,20).until(EC.presence_of_element_located((By.CLASS_NAME,"C4VMK")))

                try:
                    driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)     
                    driver.find_element_by_class_name('XQXOT').find_element_by_xpath("//ul/li/div/button").click()
                    driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)  

                except (NoSuchElementException,ElementNotInteractableException):
                    pass
                
                # a post's text is capped at 160 characters
                # at most 30 items including comments
                # up to 100 characters per hashtag

                #id = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('a')[0].innerText")
                
                #content = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('span')[0].innerText")
                
                req = driver.page_source
                soup = BeautifulSoup(req,'html.parser')
                replyCount = soup.find_all("div",class_="C4VMK")
                tagCount = replyCount[0].select('span>a')
                id = replyCount[0].find_all(class_="_6lAjh")[0].select("a")[0].text
                content = replyCount[0].select('span')[0].text
                like = '0'
                tags=[]  
                feedRow = {}        

                try:
                    #like = driver.find_element_by_class_name("Nm9Fw").find_element_by_tag_name("span").text
                    like = soup.find_all("div",class_="Nm9Fw")[0].select("span")[0].text

                except (NoSuchElementException,IndexError):
                    try:
                        like = soup.find_all("span",class_="vcOH2")[0].select("span")[0].text
                    except IndexError:
                        pass
            
                # data processing
                emoji_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)

                content = emoji_pattern.sub('',content)
                # TODO: once the tag logic is done, add a positive/negative sentiment check method

                # hashtags in the post body
                if len(tagCount) > 0:

                    for i in range(0,len(tagCount)):
                        tag = tagCount[i].text

                        if "#" in tag:
                            tag = tag.replace("#","").replace(" ","")
                            tags.append(tag)

                # hashtags in the comments
                if len(replyCount) > 0:

                    for i in range(1,len(replyCount)):
                        #replyid = "document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('a')[0].innerText"
                        replyid = replyCount[i].find_all("a")[0].text

                        if id == replyid:
                            #replyTagCount = driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a').length")
                            replyTagCount = replyCount[i].find_all("a")

                            if len(replyCount) > 1:

                                for j in range(0,len(replyTagCount)):
                                    #reply =  driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a')["+j+"].innerText")
                                    reply = replyTagCount[j].text

                                    if "#" in reply:
                                        reply = reply.replace("#","").replace(" ","")
                                        tags.append(reply)
                
                # remove duplicates
                tags = list(set(tags))
                tagList.append(tags)
                print("=======================================================================================")
                print("====================================pagedowns : ",pagedowns,"====================================")
                print("=======================================================================================")
                print("id===============================",id)
                print("content==========================",content)
                print("like=============================",like)
                print("finaltag=========================",tags)
                feedRow["id"] = id
                feedRow["content"] = content
                feedRow["tag"] = tags
                feedRow["like"] = like
                feedList.append(feedRow)

                time.sleep(1)           
                
                # click the next post
                try:
                    driver.find_element_by_class_name("HBoOv").click()

                except NoSuchElementException:
                    # wait for web resources
                    driver.get(url)
                    driver.implicitly_wait(1) 

                    for i in range(0,pagedowns):
                        driver.find_elements_by_class_name("eLAPa")[0].click()
                        #html = driver.find_element_by_tag_name("html")
                        #html.send_keys(Keys.DOWN)
                
                pagedowns += 1
                print("=======================================================================================")
                print("=======================================================================================")
            except (NoSuchElementException,StaleElementReferenceException,TimeoutException):
                failCnt += 1
                print("=======================================================================================")
                print("====================================failcount : ",failCnt,"=====================================")
                print("=======================================================================================")
                if failCnt > 3:
                    driver.find_element_by_class_name("HBoOv").click()
                    
                time.sleep(120)
                pass
                
        print("끝!!")
                
        # dedupe each post's hashtags and reassign as a list of tuples
        tagList = list([tuple(set(tag)) for tag in tagList])

        # count the hashtags
        for htags in tagList:
            for htag in htags:
                # increment the hashtag count
                if not (htag in hashtag):
                    hashtag[htag] = 1
                else:
                    hashtag[htag] += 1

        # sort by count, descending
        keys = sorted(hashtag.items(), key = lambda x:x[1], reverse = True)

        # print up to rank n
        for k, v in keys[:15]:
            print("{}({})".format(k, v))

        end = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")

        print("start======",start)
        print("end======",end)

        print("enddivision=========",datetime.datetime.strptime(end,"%Y_%m_%d %H:%M:%S")-datetime.datetime.strptime(start,"%Y_%m_%d %H:%M:%S"))
        # result = pd.DataFrame(feedList)
        # result.columns = ['id','content','tag','like']
        # result.head()

        # shut down the browser
        driver.close()
        
        crawlingList["ttlfeed"] = ttlFeed.text
        crawlingList["crwfeed"] = len(tagList)
        crawlingList["succnt"] = pagedowns
        crawlingList["failcnt"] = failCnt
        crawlingList["created_at"] = start
        crawlingList["updated_at"] = end
        crawlingList["working_while"] = str(datetime.datetime.strptime(end,"%Y_%m_%d %H:%M:%S")-datetime.datetime.strptime(start,"%Y_%m_%d %H:%M:%S"))

        returnList["crawlingList"] = crawlingList
        returnList["tagList"] = keys
        returnList["excelList"] = feedList

        return returnList