Example #1
File: app.py Project: sure79/AWS_coupang
    def makecafe24Modify():
        options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        driver = webdriver.Chrome(chromedriver_path, chrome_options=options)
        # label.config(text="btn2, Clicked!")
        href = "https://eclogin.cafe24.com/Shop/"
        # chromedriver_path = 'C:/python-program/webcrawling_1230-r2/chromedriver'
        # driver = webdriver.Chrome(chromedriver_path)
        driver.get(href)
        time.sleep(2)

        # Enter the Cafe24 ID / password
        driver.find_element_by_id('mall_id').send_keys('kkwjkd')
        driver.find_element_by_id('userpasswd').send_keys('zmffldh123')
        driver.find_element_by_class_name('btnSubmit').click()
        time.sleep(7)

        yield "data:10\n\n"
        # Dismiss the popup
        try:
            driver.find_element_by_xpath(
                '//*[@id="admngLayerWrapper29"]/form/div/button').click()
            time.sleep(2)
        except:
            pass

        # Click the product management button
        driver.find_element_by_xpath('//*[@id="QA_Gnb_product2"]').click()
        time.sleep(2)

        # Click the inventory management button
        driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2049"]').click()
        time.sleep(2)

        driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2050"]').click()
        time.sleep(2)

        # Set the category to cosmetics
        driver.find_element_by_xpath('//*[@id="eCategory1"]/option[2]').click()
        time.sleep(2)

        # Check 'include subcategories' in the search
        driver.find_element_by_xpath(
            '//*[@id="submitSearchBox"]/table/tbody/tr[2]/td/div/span/label[1]/input'
        ).click()
        time.sleep(1)

        # Click to open the advanced search
        driver.find_element_by_xpath(
            '//*[@id="QA_list1"]/div[4]/div/span/button').click()
        time.sleep(1)

        # Check inventory management 'not in use'
        driver.find_element_by_xpath(
            '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[3]/input'
        ).click()
        time.sleep(1)

        # Select 'view 100 per page'
        driver.find_element_by_xpath(
            '//*[@id="QA_list2"]/div[2]/div[2]/select[2]/option[5]').click()
        time.sleep(2)

        # Click the search button
        driver.find_element_by_xpath('//*[@id="eBtnSearch"]/span').click()
        time.sleep(2)

        yield "data:30\n\n"
        # Loop: switch 'not in use' to 'in use' and enable the sold-out option
        i = 1
        while True:
            try:

                # paging = driver.find_element_by_xpath('//*[@id="QA_list2"]/div[6]/ol/li['+str(i)+']')
                # print(paging.text)

                # Check the select-all checkbox
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[5]/input'
                ).click()

                # Click bulk inventory settings
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[3]/div[1]/a[1]').click()

                # In the popup, select 'in use'
                driver.find_element_by_xpath(
                    '//*[@id="eManageStockBatchForm"]/table/tbody/tr[1]/td[1]/select/option[1]'
                ).click()

                # Check 'allow sold-out'
                driver.find_element_by_xpath(
                    '//*[@id="eManageStockBatchForm"]/table/tbody/tr[1]/td[6]/input'
                ).click()

                # Click the confirm button
                driver.find_element_by_xpath(
                    '//*[@id="layerBatchSet"]/div[2]/a[1]').click()
                time.sleep(2)
                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(2)

                # Check the alert text
                # poptext = driver.switch_to_alert().text
                # if poptext == "처리할 품목이 없습니다.":  # "There are no items to process."
                #     break

                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(3)

                # driver.switch_to_window
                # time.sleep(3)
                continue
                # i += 1
                # if i == 11:
                #     driver.find_element_by_css_selector('#QA_list2 > div.mPaginate > a.next').click()
                #     i = 1
                #     continue

            except:
                break

        # After everything is set to 'in use', click close
        driver.find_element_by_xpath(
            '//*[@id="layerBatchSet"]/div[2]/a[2]/span').click()
        time.sleep(3)

        # In the advanced search panel, check inventory 'in use'
        driver.find_element_by_xpath(
            '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[2]/input'
        ).click()
        time.sleep(1)

        # Stock range: from 0
        driver.find_element_by_class_name('fText.right.eSearchText').send_keys(
            '0')
        time.sleep(1)

        # Stock range: to 0
        driver.find_element_by_css_selector(
            '#eSearchFormStock > li > input:nth-child(3)').send_keys('0')
        time.sleep(1)

        # Check sale status 'on sale'
        driver.find_element_by_xpath(
            '//*[@id="submitSearchBox"]/table/tbody/tr[4]/td[2]/label[2]/input'
        ).click()
        time.sleep(1)

        # Check display status 'displayed'
        driver.find_element_by_xpath(
            '//*[@id="QA_list1"]/div[3]/table/tbody/tr[5]/td[1]/label[2]/input'
        ).click()
        time.sleep(1)

        # Click the search button
        driver.find_element_by_xpath('//*[@id="eBtnSearch"]').click()

        yield "data:60\n\n"
        # Loop: set zero-stock products to inventory management 'not in use'
        while True:
            try:
                # Select all checkboxes
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[5]/input'
                ).click()
                time.sleep(2)

                # Bulk inventory settings
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[3]/div[1]/a[1]').click()
                time.sleep(2)

                # Check inventory management 'not in use'
                driver.find_element_by_xpath(
                    '//*[@id="eManageStockBatchForm"]/table/tbody/tr[1]/td[1]/select/option[2]'
                ).click()
                time.sleep(2)

                # Check the display toggle
                driver.find_element_by_xpath(
                    '//*[@id="eManageStockBatchForm"]/table/tbody/tr[2]/td/div/table/tbody/tr[1]/th/label/input'
                ).click()
                time.sleep(1)

                # Check the sale toggle
                driver.find_element_by_xpath(
                    '//*[@id="eManageStockBatchForm"]/table/tbody/tr[2]/td/div/table/tbody/tr[2]/th/label/input'
                ).click()
                time.sleep(1)

                # Click the OK button
                driver.find_element_by_xpath(
                    '//*[@id="layerBatchSet"]/div[2]/a[1]/span').click()
                time.sleep(2)

                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(2)

                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(2)
                # driver.switch_to_window
                time.sleep(2)
                continue

            except:
                break

        # After everything is set to 'not in use', click close
        driver.find_element_by_xpath(
            '//*[@id="layerBatchSet"]/div[2]/a[2]/span').click()
        time.sleep(3)

        try:
            # Dismiss the popup
            driver.find_element_by_xpath(
                '//*[@id="layerBatchSet"]/div[2]/a[2]').click()
            time.sleep(1)

        except:
            pass

        # Mark all zero-stock products as sold out

        yield "data:80\n\n"
        # Click the product management menu
        driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2036"]').click()
        time.sleep(1)

        # Click the product list menu
        driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2037"]').click()
        time.sleep(1)

        # Click advanced search
        driver.find_element_by_xpath(
            '//*[@id="QA_list1"]/div[4]/div/span/button').click()
        time.sleep(2)

        # Set the category to cosmetics
        driver.find_element_by_xpath('//*[@id="eCategory1"]/option[2]').click()
        time.sleep(2)

        # Check 'include subcategories' in the search
        driver.find_element_by_xpath(
            '//*[@id="submitSearchBox"]/table/tbody/tr[3]/td/div/span/label[1]/input'
        ).click()
        time.sleep(1)

        # Check inventory management 'not in use'
        driver.find_element_by_xpath(
            '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[3]/input'
        ).click()
        time.sleep(2)

        # Check sale status 'on sale'
        driver.find_element_by_xpath(
            '//*[@id="submitSearchBox"]/table/tbody/tr[5]/td[2]/label[2]/input'
        ).click()
        time.sleep(2)

        # Show 100 products per page
        driver.find_element_by_xpath(
            '//*[@id="QA_list2"]/div[2]/div[2]/select[2]/option[5]').click()
        time.sleep(2)

        # Click the search button
        driver.find_element_by_xpath('//*[@id="eBtnSearch"]/span').click()
        time.sleep(2)

        # Set every product marked inventory 'not in use' and 'on sale' to 'not on sale'

        while True:
            try:

                # Select the product checkboxes
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[1]/input'
                ).click()
                time.sleep(2)

                # Click the 'not on sale' button
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[3]/div[1]/a[4]/span').click()
                time.sleep(2)

                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(2)

                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(2)
                # driver.switch_to_window
                time.sleep(2)
                continue
            except:
                break

        time.sleep(3)
        try:
            driver.switch_to_alert().accept()
            time.sleep(2)

        except:
            pass

        # Set restocked products to 'on sale'

        # Select inventory management 'all'
        driver.find_element_by_xpath(
            '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[1]/input'
        ).click()
        time.sleep(2)

        # Select sale status 'not on sale'
        driver.find_element_by_xpath(
            '//*[@id="submitSearchBox"]/table/tbody/tr[5]/td[2]/label[3]/input'
        ).click()
        time.sleep(2)

        # Enter 1 as the stock lower bound
        driver.find_element_by_css_selector(
            '#eSearchFormStock > li > input:nth-child(2)').send_keys('1')
        time.sleep(2)

        # Click the search button
        driver.find_element_by_xpath('//*[@id="eBtnSearch"]/span').click()
        time.sleep(4)

        yield "data:90\n\n"

        # Set all 'not on sale' products back to 'on sale'
        while True:
            try:

                # Select the checkboxes
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[1]/input'
                ).click()
                time.sleep(2)

                # Click the 'on sale' button
                driver.find_element_by_xpath(
                    '//*[@id="QA_list2"]/div[3]/div[1]/a[3]/span').click()
                time.sleep(2)

                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(2)

                # Accept the alert
                driver.switch_to_alert().accept()
                time.sleep(2)
                time.sleep(2)
                continue
            except:
                break

        driver.close()
        yield "data:100\n\n"
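Note: this example is written against the legacy Selenium 3 API (find_element_by_*, the chrome_options= keyword, switch_to_alert()), all removed in Selenium 4. A minimal sketch of the same login step under Selenium 4, assuming chromedriver is resolvable on PATH; the element ids come from the example above and the credentials are placeholders:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

options = Options()
# options.add_argument('--headless=new')
driver = webdriver.Chrome(options=options)  # Selenium Manager locates the driver
driver.get('https://eclogin.cafe24.com/Shop/')
driver.find_element(By.ID, 'mall_id').send_keys('your_id')
driver.find_element(By.ID, 'userpasswd').send_keys('your_password')
driver.find_element(By.CLASS_NAME, 'btnSubmit').click()
# driver.switch_to.alert.accept()  # modern replacement for switch_to_alert().accept()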
Example #2
 def setUpClass(cls):
     chrome_options = webdriver.ChromeOptions()
     chrome_options.add_argument('headless')
     chrome_options.add_argument('window-size=1920x1080')
     cls.driver = webdriver.Chrome(options=chrome_options)
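Note: Chrome documents the window-size flag as comma-separated, and Chrome 109+ supports the new headless mode. A minimal sketch of the equivalent setup under those flags (the Chrome version is an assumption, not part of the original test):

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless=new')           # new headless mode, Chrome 109+
chrome_options.add_argument('--window-size=1920,1080')  # comma-separated per Chrome's docs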
Example #3
def execute(args):
  """Run Javascript unit tests. Here are the steps:

     1. Execute the HTML with chromedriver.
     2. Read the test result from the HTML."""
  test_filepath = os.path.join('src', 'appengine', 'private', 'test.html')
  print('Running chromedriver on %s' % test_filepath)

  chrome_options = webdriver.ChromeOptions()
  chrome_options.add_argument('--allow-file-access-from-files')

  is_ci = os.getenv('TEST_BOT_ENVIRONMENT')
  if is_ci:
    # Turn off sandbox since running under root, with trusted tests.
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')

  driver = webdriver.Chrome(
      executable_path=common.get_chromedriver_path(),
      chrome_options=chrome_options)

  try:
    driver.get('file://%s' % os.path.abspath(test_filepath))

    # Wait for tests to be completed.
    while True:
      success_count = driver.execute_script(
          'return WCT._reporter.stats.passes;')
      failure_count = driver.execute_script(
          'return WCT._reporter.stats.failures;')
      sys.stdout.write(
          '\rSuccess: %d, Failure: %d' % (success_count, failure_count))
      sys.stdout.flush()

      is_complete = driver.execute_script('return WCT._reporter.complete;')
      if is_complete:
        break

      time.sleep(0.1)

    sys.stdout.write('\r' + (' ' * 70))
    sys.stdout.flush()

    success_count = int(
        driver.find_element_by_css_selector('#mocha-stats .passes em').text)
    failure_count = int(
        driver.find_element_by_css_selector('#mocha-stats .failures em').text)
    error_report = _parse_error_report(driver)

    if error_report:
      print(error_report)

    print()
    print(_SUITE_SEPARATOR)
    print('Test results:')
    print('| Success: %d' % success_count)
    print('| Failure: %d' % failure_count)
    print(_SUITE_SEPARATOR)
    print()

    if args.persist:
      # Keep the browser open until the user presses ENTER.
      input('--persist is used. Leave the browser open.'
            ' Press ENTER to close it:')
  finally:
    driver.quit()

  if failure_count > 0:
    sys.exit(1)
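The manual polling loop above can also be expressed with WebDriverWait; a minimal sketch under the same WCT reporter globals (it drops the live success/failure counter, so it suits quiet CI runs):

from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 60 s, polling every 0.1 s, until the WCT reporter marks the run
# complete; equivalent to the while/sleep loop in execute().
WebDriverWait(driver, 60, poll_frequency=0.1).until(
    lambda d: d.execute_script('return WCT._reporter.complete;'))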
Example #4
def test_reactbank(timeout_sec=2.0):
    """Run the test end to end: login, balance check, help form, logout."""

    driver = None

    try:

        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-extensions")

        driver = webdriver.Chrome(options=options)

        # get to the landing page
        driver.get(landingPageUrl)

        #
        # login
        #
        logger.info("[RUNNER] doing login")
        login_to_reactbank(driver, timeout_sec)
        logger.info("[RUNNER] login done")

        #
        # other tests go here
        #

        # balance check
        logger.info("[RUNNER] doing balance check")
        find_balance_on_userpage(driver, timeout_sec)
        logger.info("[RUNNER] balance check done")

        # help form fill-in
        logger.info("[RUNNER] doing help form")
        fill_out_help_form(driver, timeout_sec)
        logger.info("[RUNNER] help form done")

        #
        # logout
        #
        logger.info("[RUNNER] doing logout")
        logout_from_reactbank(driver, timeout_sec)
        logger.info("[RUNNER] logout done")

    except Exception:

        logger.exception("Ran into exception when running test.")
        raise

    finally:

        if driver is not None:

            logger.info("[RUNNER] waiting to close the window")
            time.sleep(2.5)
            driver.close()
            logger.info("[RUNNER] window close done, waiting for driver quit")
            driver.quit()
            logger.info("[RUNNER] driver quit done")

        logger.info("[RUNNER] test run complete")
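The try/finally cleanup pattern used here can be packaged once as a context manager; a minimal sketch (the helper name is hypothetical):

from contextlib import contextmanager
from selenium import webdriver

@contextmanager
def headless_chrome():
    # Guarantees driver.quit() runs even when a test step raises.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        yield driver
    finally:
        driver.quit()

# Usage sketch:
#     with headless_chrome() as driver:
#         driver.get(landingPageUrl)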
Example #5
def daily_task():
    global DATE
    DATE = str(datetime.date.today())
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images":2}
    # chromeOptions.add_argument("--disable-javascript")
    chromeOptions.add_experimental_option("prefs",prefs)
    chromeOptions.add_argument("--headless")
    chromeOptions.add_argument("start-maximized")
    chromeOptions.add_argument("disable-infobars")
    chromeOptions.add_argument("--disable-extensions")
    chromeOptions.add_argument("--no-sandbox")
    chromeOptions.add_argument("--disable-dev-shm-usage")
    browser2 = webdriver.Chrome(chrome_options=chromeOptions,executable_path=CHROME_DRIVER_PATH)
    # browser2 = webdriver.Chrome(chrome_options=chromeOptions)
    browser2.set_window_position(100, 40)
    browser2.set_window_size(1300, 1024)
    wait2 = ui.WebDriverWait(browser2,30)
    # browser = webdriver.Chrome(chrome_options=chromeOptions,executable_path=CHROME_DRIVER_PATH)
    browser = webdriver.Chrome(chrome_options=chromeOptions,executable_path=CHROME_DRIVER_PATH)
    browser.set_window_position(400, 40)
    browser.set_window_size(1300, 1024)
    wait = ui.WebDriverWait(browser,30)
    browser.get(BASE_URL)
    urls = []
    titles = []
    wait.until(lambda browser: browser.find_element_by_xpath('/html/body/div[2]/nav/div/div[3]'))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    category_list = soup.find('nav', class_='white').find('div', class_='top-cate').find_all('a')
    c = 0
    for item in category_list:
        if c == 0:
            c += 1
            continue
        href = BASE_URL + item.get('href')
        title = item.text.strip()
        if href not in urls:
            urls.append(href)
            titles.append(title)
        c += 1
    # print(len(category_list))
    # print(category_list)
    # print(len(urls))
    # print(urls)
    write_html(browser.page_source, "All_cat_")
    j = 0
    while j < len(urls):
        sys.stdout.write('Scraping ' + urls[j] + ' ...' + ' '*10)
        browser.get(urls[j])
        wait.until(lambda browser: browser.find_element_by_xpath('//*[@id="list-page"]/div[2]/div[33]'))
        soup = BeautifulSoup(browser.page_source, 'lxml')

        category = titles[j]

        i = 0
        pagination = True
        while pagination:
            soup = BeautifulSoup(browser.page_source, 'lxml')
            if i != 0:
                try:
                    wait.until(lambda browser: browser.find_element_by_xpath('//*[@id="list-page"]/div[2]/div[33]'))
                    element = browser.find_element_by_css_selector('#list-page > div.container-list-restaurant.clearfix.active-view-column > div.pagation.clearfix > a.ico-page.ico-page-next.ng-scope')
                    if element.is_displayed():
                        browser.execute_script("arguments[0].click();", element)
                        time.sleep(3)
                    else:
                        pagination = False
                    wait.until(lambda browser: browser.find_element_by_xpath('//*[@id="list-page"]/div[2]/div[33]'))
                    soup = BeautifulSoup(browser.page_source, 'lxml')
                    restaurant_items = soup.find('div', id='list-page').find_all('div', class_='view-column-list')
                except:
                    # NoSuchElementException, TimeoutException, or anything else
                    pagination = False
            if i == 0:
                soup = BeautifulSoup(browser.page_source, 'lxml')
                restaurant_items = soup.find('div', id='list-page').find_all('div', class_='view-column-list')
            if not pagination:
                break
            # print(len(list))
            # print(i+1)
            file_name = str(j+1) + "_" + str(i+1) + "_"
            write_html(browser.page_source, file_name)
            for item in restaurant_items:
                # if item.find('div', class_='ct_title') != None:
                #     title = item.find('div', class_='ct_title').text.strip()
                # else:
                #     title = None
                try:
                    href = BASE_URL + item.find('a').get('href')
                    browser2.get(href)
                    # wait.until(lambda browser: browser.find_element_by_xpath('//*[@id="right"]/div[1]'))
                    soup = BeautifulSoup(browser2.page_source, 'lxml')
                except TimeoutException:
                    continue
                except:
                    continue

                try:
                    if soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant') != None:
                        food_category = soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant').text.strip()
                        if soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant').find('a') != None:
                            txt = soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant').find('a').text.strip()
                            food_category = food_category.replace(txt, '')
                            food_category = food_category.strip()
                    else:
                        food_category = None
                except:
                    food_category = None

                try:
                    if soup.find('div', class_='info-basic-hot-restaurant').find('h1', class_='name-hot-restaurant') != None:
                        seller = soup.find('div', class_='info-basic-hot-restaurant').find('h1', class_='name-hot-restaurant').text.strip()
                    else:
                        seller = None
                except:
                    seller = None

                try:
                    if soup.find('div', class_='info-basic-hot-restaurant').find('p', itemprop='description') != None:
                        location = soup.find('div', class_='info-basic-hot-restaurant').find('p', itemprop='description').text.strip()
                    else:
                        location = None
                except:
                    location = None


                try:
                    if soup.find('div', class_='slick-list').find('span', class_='font14') != None:
                        delivery_fee = soup.find('div', class_='slick-list').find('span', class_='font14').text.strip()
                        delivery_fee = delivery_fee.replace('[?]','')
                    else:
                        delivery_fee = None
                except:
                    delivery_fee = None


                # Fields collected: location, seller, delivery fee, food name,
                # food price, food old_price (previous price if it exists),
                # food orders, food type, food category, category name, current date
                try:
                    products_types = soup.find('div', class_='detail-menu-kind').find_all('div', class_='scrollspy')
                except:
                    continue
                # print(products_types)
                for products_type in products_types:
                    # print(products_type)
                    food_type = products_type.find('h2', class_='title-kind-food').text.strip()
                    products = products_type.find_all('div', class_='box-menu-detail')
                    for product in products:
                        try:
                            food_name = product.find('h3').text.strip()
                        except:
                            continue
                        try:
                            food_orders = product.find('div', class_='name-food-detail').find('p', class_='light-grey').text.strip()
                        except:
                            continue
                        try:
                            food_price = product.find('div', class_='product-price').find('p', class_='current-price').text.strip()
                        except:
                            continue
                        try:
                            old_price = product.find('div', class_='product-price').find('p', class_='old-price').text.strip()
                        except:
                            old_price = None
                        data = {'category': category,
                                'food_category': food_category,
                                'location': location,
                                'seller': seller,
                                'delivery_fee': delivery_fee,
                                'food_type': food_type,
                                'food_name': food_name,
                                'food_orders': food_orders,
                                'food_price': food_price,
                                'old_price': old_price,
                                'date': DATE}
                        write_csv(data)
            i += 1
        j += 1
    # Close the browsers
    browser.close()
    browser.service.process.send_signal(signal.SIGTERM)
    browser.quit()
    browser2.quit()
    compress_data()
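Most of the try/except blocks above repeat one pattern: find a node, take its stripped text, fall back to None. A small helper collapses them; a sketch (the helper name is hypothetical):

def safe_text(parent, *find_args, **find_kwargs):
    # Return the stripped text of the first match, or None if the parent or
    # the match is missing.
    try:
        node = parent.find(*find_args, **find_kwargs)
        return node.text.strip() if node is not None else None
    except AttributeError:
        return None

# e.g. the seller lookup becomes:
#     seller = safe_text(soup.find('div', class_='info-basic-hot-restaurant'),
#                        'h1', class_='name-hot-restaurant')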
Example #6
def get_data(url, write_file='data_temp.json', write_type='a'):

    data_label = '城市等级划分'
    data_city_nums = 404
    data_city_per_page = 50
    data_sub_label1 = '设备类型分布'
    data_sub_label2 = '场景分布'

    cities_data = []

    options = webdriver.ChromeOptions()
    options.add_argument('headless')

    user_data_path = get_usr_data_dir()
    options.add_argument(user_data_path)
    driver = webdriver.Chrome(options=options)

    for index in range(400, 404):
        page_index = int(index / data_city_per_page) + 1
        index_in_pages = (index % data_city_per_page) + 1

        city = City()

        driver.get(url)
        time.sleep(1)

        if index_in_pages == 1:
            time.sleep(3)

        goto_basic_page(driver, label=data_label, page_number=page_index)
        time.sleep(1)

        city.get_basic_info(driver, current_line=index_in_pages)
        time.sleep(1)

        print('{}/{}'.format(index + 1,
                             data_city_nums), page_index, city.city_rank,
              city.city_name, city.city_point, city.city_screens)

        goto_device_types_page(driver,
                               current_line=index_in_pages,
                               label=data_sub_label1)
        time.sleep(1)

        city.get_device_types_distribution(driver)
        time.sleep(1)

        driver.get(url)
        goto_basic_page(driver, label=data_label, page_number=page_index)
        time.sleep(1)

        goto_device_scenes_page(driver,
                                current_line=index_in_pages,
                                label=data_sub_label2)

        city.get_device_scenes_distribution(driver)
        time.sleep(1)

        city_dict = {'城市': city.city_name,
                     '城市分类': city.city_rank,
                     '省份': city.city_province,
                     '所属地区': city.city_location,
                     '点位数量': city.city_point,
                     '屏幕数量': city.city_screens,
                     '类型分布': city.city_device_types_distribution,
                     '场景分布': city.city_device_scenes_distribution}

        cities_data.append(city_dict)
        write_data(write_file, city_dict, write_type)

    return cities_data
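write_data() is not shown in this example; a plausible minimal sketch, assuming it appends one JSON record per line (ensure_ascii=False keeps the Chinese keys readable):

import json

def write_data(write_file, record, write_type='a'):
    # Append one JSON object per line to the output file.
    with open(write_file, write_type, encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')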
Example #7
    def crawl_daum_comments(url):
        
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome('/home/apostcto/ITDA/chromedriver', chrome_options=chrome_options)
        driver.get(url)

        
        try:
            elements = driver.find_element_by_class_name('alex_more')

            # Click the 'more' button until the last comment is loaded.
            # Clicking more than 100 times means something is wrong, so break past that limit.
            count = 0

            while elements:
                try:
                    elements.click()
                    time.sleep(0.1)
                    count += 1

                    if count > 100:
                        break

                except:
                    break

        # If there is no 'more' button, collect comments from the current page
        except:
            pass

        comment_lists = []

        # Find the comment elements on the fully expanded page
        try:
            comment_elements = driver.find_element_by_css_selector('.cmt_news').text
            count_elements = driver.find_element_by_css_selector('.cmt_news .alex_single .cmt_count').text
            count_elements = int(count_elements[3:])
        
        
            while len(comment_elements) > 30:
                # Locate one comment between its marker strings
                comment_first_index = comment_elements.find('시간전')
                comment_last_index = comment_elements.find('답글')
                comment = comment_elements[comment_first_index+4:comment_last_index-1].replace('\n','')

                if '댓글로그인' in comment:
                    comment = ''

                comment_lists.append(comment)

                # Break if there are no more comments
                if '새로고침' in comment:
                    break

                # Slice the text forward to find the next comment
                comment_elements = comment_elements[comment_last_index+3:]

            return count_elements, comment_lists, url
        
        except:
            return 0, [], url
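Slicing one big text blob on marker strings ('시간전', '답글') is fragile; a sketch of a sturdier approach that reads each comment node directly (the '.alex_single p.desc_txt' selector is an assumption, not taken from the live page):

comments = [el.text.strip()
            for el in driver.find_elements_by_css_selector('.alex_single p.desc_txt')]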
Example #8
def main():
    if input('c or d: ') == 'd':
        target_dl_folder_path = r"D:\Temp\H"    # target download folder path
    else:
        target_dl_folder_path = r"C:\Temp\H"

    timeout = 10

    # chrome_options = Options()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-gpu")
    prefs = {"download.default_directory": target_dl_folder_path}
    chrome_options.add_experimental_option('prefs', prefs)
    # chrome_options.add_experimental_option("detach", True)  # keep the browser open
    chrome_options.add_argument("--headless")  # run without a visible browser window
    driver = webdriver.Chrome(options=chrome_options)
    driver.implicitly_wait(10)
    # driver.maximize_window()

    while True:
        urlList = get_urls()

        # START FOR each url in urlList
        for url in urlList:

            # if the url fails validation, continue to the next url
            if not url_validtion(driver, url):
                continue

            # check download btn if found
            if not find_dl_btn(driver, "dl-button"):
                continue

            print(Fore.CYAN + "Downloading", url + " ... ")
            time.sleep(1)

            if progressbar_timeout(driver, "progressbar", "aria-valuenow", timeout):
                continue

            # START while progressbar
            # TODO: if the progress bar stalls, stop the download and retry
            progressbar_value = '0.000'    # init progressbar
            while progressbar_value != "100":
                progressbar_value = driver.find_element_by_id(
                    "progressbar").get_attribute("aria-valuenow")
                sys.stdout.write("\r{0}".format(
                    str(progressbar_value)[:5] + " %"))
                sys.stdout.flush()
                time.sleep(1)
            print()
            time.sleep(1)
            # END while progressbar

            # old file name
            url_path = urlparse(url).path
            try:
                old_file_name = url_path.split("/")[3]
            except IndexError:
                print(Fore.RED + "GET OLD FILE NAME FROM URL ERROR: ", url)
                continue    # go to next url

            # new file name
            potential_filenames = driver.find_elements_by_class_name(
                "alert-success")
            driver.implicitly_wait(10)
            new_file_name = get_correct_file_name(
                potential_filenames, old_file_name)

            # check whether the file exists in the local folder
            # NOT exists, 3rd arg represent timeout second*2
            if file_timeout(target_dl_folder_path, old_file_name, timeout):
                print(Fore.RED + "DOWNLOAD FAIL", new_file_name)
                print(Fore.CYAN + "RE-DOWNLOAD ...")
                retry_task(url, urlList)
            else:   # file exists
                # replace file name
                try:
                    os.rename(target_dl_folder_path + "\\" + old_file_name + ".zip",
                              target_dl_folder_path + "\\" + new_file_name + ".zip")
                except os.error:
                    print(Fore.YELLOW + "CAN NOT RENAME ",
                          old_file_name + " -> " + new_file_name)
                finally:
                    print(Fore.GREEN + 'DOWNLOAD COMPLETE',
                          new_file_name + ".zip")

            # TODO unzip

        # END FOR LOOP urlList
    # END WHILE True

    driver.quit()
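file_timeout() is assumed to poll the download folder; a minimal sketch of such a helper (the names and the timeout*2 convention follow the comment above):

import os
import time

def file_timeout(folder, file_name, timeout):
    # Return True ("timed out") if <file_name>.zip has not appeared in the
    # folder after roughly timeout*2 seconds; False once the file shows up.
    for _ in range(timeout * 2):
        if os.path.exists(os.path.join(folder, file_name + '.zip')):
            return False
        time.sleep(1)
    return True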
Example #9
def lambda_handler(event, context):
    try:

        options = webdriver.ChromeOptions()
        options.binary_location = "./bin/headless-chromium"
        options.add_argument('--headless')
        options.add_argument("--no-sandbox")
        options.add_argument("--single-process")
        browser = webdriver.Chrome("./bin/chromedriver",
                                   chrome_options=options)

        # Get the prefecture list via the HeartRails Geo API (free geographic data
        # such as postal codes, addresses, and coordinates: http://geoapi.heartrails.com/)
        # Scraping every prefecture would be heavy, so limit it to Kanto, or just Tokyo
        areaParams = {}
        areaParams['area'] = '関東'
        json_areaParam = json.dumps(areaParams).encode('utf-8')
        prefectures_response = requests.get(
            'http://geoapi.heartrails.com/api/json?method=getPrefectures',
            data=json_areaParam,
            headers={'Content-Type': 'application/json'})
        prefectures = prefectures_response.json()['response']['prefecture']
        # Limit to Tokyo
        prefectures = [
            prefecture for prefecture in prefectures_response.json()
            ['response']['prefecture'] if prefecture == '東京都'
        ]
        towns = []
        for prefecture in prefectures:
            # Get the town data for each prefecture in the list
            params = {}
            params['prefecture'] = prefecture
            json_param = json.dumps(params).encode('utf-8')
            towns_response = requests.get(
                'http://geoapi.heartrails.com/api/json?method=getTowns',
                data=json_param,
                headers={'Content-Type': 'application/json'})
            cities = towns_response.json()['response']['location']
            cities = [
                testCity
                for testCity in towns_response.json()['response']['location']
                if testCity['city'] == '新宿区'
            ]
            # Append each prefecture's town dictionaries to the towns list (0-46)
            towns.append(cities)

        # townsWeathers = []
        townsWeathers = scriping_weather(browser, towns)

        # S3 bucket settings
        bucket = 'jdmc2019-weather'
        key = 'weather_' + datetime.now().strftime(
            '%Y-%m-%d-%H-%M-%S') + '.txt'

        # Serialize the collected weather data as JSON
        files = json.dumps(townsWeathers,
                           indent=4,
                           sort_keys=True,
                           separators=(',', ': '))

        # Create a DynamoDB table instance (the sequence table)
        seqtable = dynamodb.Table('sequence')

        # Batch-write the collected weather data to DynamoDB.
        tablename = "weather"
        table = dynamodb.Table(tablename)

        with table.batch_writer() as batch:
            for weather in townsWeathers:
                batch.put_item(
                    Item={
                        'id': next_seq(seqtable, 'weather'),
                        'prefuctureName': weather['prefuctureName'],
                        'cityName': weather['cityName'],
                        'townName': weather['townName'],
                        'longitude': weather['longitude'],
                        'latitude': weather['latitude'],
                        'postalCode': weather['postalCode'],
                        'date': weather['date'],
                        'hour': weather['hour'],
                        'weather': weather['weather'],
                        'temperature': weather['temperature'],
                        'probPrecip': weather['probPrecip'],
                        'precipitation': weather['precipitation'],
                        'humidity': weather['humidity'],
                        'windBlow': weather['windBlow'],
                        'windSpeed': weather['windSpeed']
                    })

        obj = s3.Object(bucket, key)
        obj.put(Body=files)

        # Clean up
        browser.close()
        browser.quit()

        return

    except Exception as error:
        LOGGER.error(error)
        raise error
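next_seq() is not shown; a plausible sketch, assuming the sequence table keys items by name and keeps an atomic counter (the 'name' and 'seq' attribute names are assumptions):

def next_seq(table, name):
    # Atomically increment and return the counter for the given name.
    response = table.update_item(
        Key={'name': name},
        UpdateExpression='ADD seq :one',
        ExpressionAttributeValues={':one': 1},
        ReturnValues='UPDATED_NEW')
    return int(response['Attributes']['seq'])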
Example #10
    def crawl_trend(self):
        TEST_URL1 = 'https://www.melon.com/chart/'
        TEST_URL2 = 'https://www.melon.com/chart/#params%5Bidx%5D=51'
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument('window-size=1920x1080')
        options.add_argument("disable-gpu")
        # Swap in a custom User-Agent value!
        options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
        )
        driver_1_to_50 = None
        driver_51_to_100 = None
        melon_chart_artist_title = {}
        try:
            driver_1_to_50 = webdriver.Chrome(
                chromedriver_binary.chromedriver_filename,
                chrome_options=options)
            driver_1_to_50.get(TEST_URL1)
            title_1_to_50 = WebDriverWait(driver_1_to_50, 3).until(
                EC.presence_of_all_elements_located((
                    By.CSS_SELECTOR,
                    "#lst50 > td:nth-child(6) > div > div > div.ellipsis.rank01"
                )))
            artist_1_to_50 = WebDriverWait(driver_1_to_50, 3).until(
                EC.presence_of_all_elements_located((
                    By.CSS_SELECTOR,
                    "#lst50 > td:nth-child(6) > div > div > div.ellipsis.rank02"
                )))
            link_1_to_50 = WebDriverWait(driver_1_to_50, 3).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR,
                     "#lst50 > td:nth-child(9) > div > button")))

            driver_51_to_100 = webdriver.Chrome(
                chromedriver_binary.chromedriver_filename,
                chrome_options=options)
            driver_51_to_100.get(TEST_URL2)
            title_51_to_100 = WebDriverWait(driver_51_to_100, 3).until(
                EC.presence_of_all_elements_located((
                    By.CSS_SELECTOR,
                    "#lst100 > td:nth-child(6) > div > div > div.ellipsis.rank01"
                )))
            artist_51_to_100 = WebDriverWait(driver_51_to_100, 3).until(
                EC.presence_of_all_elements_located((
                    By.CSS_SELECTOR,
                    "#lst100 > td:nth-child(6) > div > div > div.ellipsis.rank02"
                )))
            link_51_to_100 = WebDriverWait(driver_51_to_100, 3).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR,
                     "#lst100 > td:nth-child(9) > div > button")))

            title_webelement = title_1_to_50 + title_51_to_100
            artist_webelement = artist_1_to_50 + artist_51_to_100
            link_webelement = link_1_to_50 + link_51_to_100
            artist = {}
            title = {}
            link = {}
            for idx in range(0, len(artist_webelement)):
                artist[idx] = artist_webelement[idx].text
                title[idx] = title_webelement[idx].text
                link[idx] = link_webelement[idx].get_attribute("onclick")
            melon_chart_artist_title[0] = artist
            melon_chart_artist_title[1] = title
            melon_chart_artist_title[2] = link
        except (WebDriverException, TimeoutException, NoSuchElementException):
            logging.error(traceback.format_exc())
        finally:
            if driver_1_to_50 is not None:
                driver_1_to_50.quit()
            if driver_51_to_100 is not None:
                driver_51_to_100.quit()
        return melon_chart_artist_title
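The index loop that builds the three dicts can be written with zip(); a behavior-equivalent sketch:

rows = list(zip(artist_webelement, title_webelement, link_webelement))
melon_chart_artist_title = {
    0: {i: a.text for i, (a, t, l) in enumerate(rows)},
    1: {i: t.text for i, (a, t, l) in enumerate(rows)},
    2: {i: l.get_attribute("onclick") for i, (a, t, l) in enumerate(rows)},
}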
Example #11
 def setUp(self):
     chrome_options = webdriver.ChromeOptions()
     chrome_options.add_argument('headless')
     self.browser = webdriver.Chrome(chrome_options=chrome_options)
     self.browser.implicitly_wait(3)
Example #12
	def getPlayerStats():
	    chrome_options = webdriver.ChromeOptions()
	    prefs = {"profile.managed_default_content_settings.images": 2}
	    chrome_options.add_experimental_option("prefs", prefs)
	    driver = webdriver.Chrome(chrome_options=chrome_options)
	    # connect webdriver
	    url = "https://www.fifaindex.com/players/1/?league=13&order=desc"
	    driver.get(url)

	    # wait for the data to load
	    time.sleep(3)
	    count = 1
	    # click through each player to collect data
	    relevant_features = ['Name','Ball Skills','Defence','Mental','Passing','Physical','Shooting','Goalkeeper','Traits']
	    #print relevant_features[0]
	    while True:
		players = driver.find_elements_by_css_selector("a.link-player")
		#print len(players)
		flag = 0
		footballers = dict()
		for player in players:
			if not player.text:
			    continue
			match_url =  player.get_attribute("href")
			man = player.text
			driver.execute_script("window.open('"+match_url+"', 'new_window')")
			time.sleep(5)
			driver.switch_to.window(driver.window_handles[-1])
			elements = driver.find_elements_by_css_selector(".card.mb-5")
			#man = str(player.text)
			print(len(elements))
			footballers[man] = {}
			nm = 0
			footballers[man]['Name'] = list()
			footballers[man]['Name'].append(man)
			footballers[man]['Traits']=list()
			for element in elements:
			    card_name = element.find_elements_by_css_selector(".card-header")[0].text
			    print(card_name, nm)
			    nm = nm+1
			    if card_name not in relevant_features:
				continue
			    if card_name not in ['Traits']:
				footballers[man][card_name] = list()
			    card_values = element.find_elements_by_xpath(".//div[@class='card-body']/p")

			    for values in card_values:
				temp =  str(values.text).split('\n')
				if len(temp)==1:
				    footballers[man]['Traits'].append(temp[0])
				else:
				    footballers[man][card_name].append(temp[1])
			csv_file = "PL_Fifa_Data.csv"
			
			with open(csv_file, 'a') as f:
				w = csv.DictWriter(f, relevant_features)
				w.writerow(footballers[man])

			driver.close()
			driver.switch_to.window(driver.window_handles[0])
			#driver.execute_script("window.history.go(-1)")
		count = count+1
		url = "https://www.fifaindex.com/players/"+str(count)+"/?league=13&order=desc"
		driver.get(url)
		time.sleep(3)
	    driver.close()
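The CSV writing above appends rows but never writes a header, so the column order is only implied. A sketch that writes the header once when the file is new:

import csv
import os

def append_row(csv_file, fieldnames, row):
    # Write the header only on first creation, then append the row.
    is_new = not os.path.exists(csv_file)
    with open(csv_file, 'a', newline='') as f:
        w = csv.DictWriter(f, fieldnames)
        if is_new:
            w.writeheader()
        w.writerow(row)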
Example #13
 def __init__(self):
     self.mensagem = "Hello, I am testing the bot I made with Python!"
     self.grupos = ["Grupo de teste"]
     options = webdriver.ChromeOptions()
     options.add_argument('lang=pt-br')
     self.driver = webdriver.Chrome(executable_path=r'./chromedriver.exe', chrome_options=options)
Example #14
def index():
    if request.method == 'POST':

        #Input Constants
        orgName = request.form['content']
        urlName = orgName.replace(' ', '+')
        urlNameNews = orgName.replace(' ', '%20')

        #Functionality Constants
        options = webdriver.ChromeOptions()
        ua = UserAgent(verify_ssl=False)
        userAgent = ua.random
        options.add_argument(f'user-agent={userAgent}')
        options.add_argument("start-maximized")
        options.headless = True
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        
        #Setting up the IRS headless browsing page
        driver.get("https://apps.irs.gov/app/eos/")
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, """//*[@id="eos-search-by-select"]""")))
        driver.find_element_by_xpath("//select[@name='searchBy']/option[text()='Organization Name']").click()
        textBox = driver.find_element_by_id('names')
        textBox.send_keys(orgName)
        driver.find_element_by_xpath("""//*[@id="s"]""").click()
        time.sleep(1)
        irsHbp = driver.window_handles[0]
        
        #Setting up the other two headless browsing pages
        driver.execute_script("window.open('');")
        charNavHbp = driver.window_handles[1]
        driver.execute_script("window.open('');")
        newsHbp = driver.window_handles[2]

        #General Constants
        credsToI = "According to the IRS website..."
        credsToII = "According to charitynavigator.org..."
        symbolsAlert = "Search terms in the Name field can only include letters, numbers, @, &, %, (), *, hyphens, slashes spaces, apostrophes, periods, commas, and quotation marks."
        meaning = "What does this mean?"
        readFiveOh = "Read more about 501(c)3 Organizations"
        ccSummary = "Charity Checker's \"" + str(orgName) + "\" summary:"
        c1Explain = "\"" + str(orgName) + "\" is not a certified or valid nonprofit and it does not exist on charitynavigator.org, please make sure you are entering the correct name."
        c2Explain = "\"" + str(orgName) + "\" is a certified and valid nonprofit, but it does not exist on charitynavigator.org so its stats are unclear."
        c3Explain = "\"" + str(orgName) + "\" is not a certified or valid nonprofit, but it does exist on charitynavigator.org"
        c4Explain = "\"" + str(orgName) + "\" is not a certified and valid nonprofit, but it has great ratings." 
        c5Explain = "\"" + str(orgName) + "\" is a certified and valid nonprofit, but does not have any ratings yet."         
        c6Explain = "\"" + str(orgName) + "\" is a certified and valid nonprofit with great ratings."

        #Circumstance 1, 3, 4, does not exist on IRS site
        badOrgNotice = "\"" + str(orgName) + "\"" + " is not listed as a 501(c)3 by the IRS."
        badExplain = "Essentially, the organization you entered is not an actual established nonprofit organization and is not likely to be exempt from federal tax income."

        #Circumstances 2, 5, 6, does exist on IRS site
        goodOrgNotice = "\"" + str(orgName) + "\"" + " is listed as a 501(c)3 by the IRS."
        goodExplain = "Essentially, the organization you entered is an actual established nonprofit organization and can be exempt from federal tax income."
        goodReadData = "Read more about the Tax Return Copies, Pub 78 Data, Auto-Revocation Lists, Determination Letters, or e-Postcards of the organization you entered"

        #Circumstances 1 & 2, does not exist on charitynavigator.org
        noRatingsYetII = "\"" + str(orgName) + "\" did not share any stats yet."

        #Circumstances 3 & 5, no full info on charitynavigator.org
        notEnoughInfoII = "The organization you entered has not provided needed information for a complete rating."
        badCharNavUrlClickMe = "However, you can still see some of the organization's information here"

        #Circumstances 4 & 6, full info on charitynavigator.org
        goodCharNavUrlClickMe = "View more of your organization's stats"

        #Getting news links
        driver.switch_to_window(newsHbp)
        news01 = 'https://news.google.com/search?q=' + urlNameNews + '&hl=en-US&gl=US&ceid=US%3Aen'
        driver.get(news01)
        newsOffer = "Here is a recent news article about \"" + str(orgName) + "\":"
        newsMore = "View more"
        noNews = "No recent news article about \"" + str(orgName) + "\" was found"
        newsLook = "But you can search for older news articles about your organization here"
        if len(driver.find_elements_by_xpath('//*[@id="yDmH0d"]/c-wiz/div/div[2]/div[2]/div/main/c-wiz/div[1]/div[1]/div/article/h3/a')) >= 1:
            global newsTitleA, newsLinkA
            newsTitleA = driver.find_element_by_xpath('//*[@id="yDmH0d"]/c-wiz/div/div[2]/div[2]/div/main/c-wiz/div[1]/div[1]/div/article/h3/a').text
            newsLinkA = driver.find_element_by_xpath('//*[@id="yDmH0d"]/c-wiz/div/div[2]/div[2]/div/main/c-wiz/div[1]/div[1]/div/article/h3/a').get_attribute('href')

        #Checking for status on IRS site
        driver.switch_to_window(irsHbp)
        if len(driver.find_elements_by_xpath("//*[contains(text(), 'Your search did not return any results. Please try again.')]")) == 1: #Does not exist on IRS site
            #Checking for status on charitynavigator.org
            driver.switch_to_window(charNavHbp)
            driver.get('https://www.charitynavigator.org/index.cfm?keyword_list=' + urlName + '&bay=search.results')
            time.sleep(3)
            if len(driver.find_elements_by_xpath("//*[contains(text(), 'Advanced Search')]")) > 1: #Does not exist on charitynavigator.org
                #Circumstance 1: Doesn't exist on irs site or charitynavigator.org, now determining news links
                driver.switch_to_window(newsHbp)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                    return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, noRatingsYetII=noRatingsYetII, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c1Explain=c1Explain)
                else:
                    return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, noRatingsYetII=noRatingsYetII, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c1Explain=c1Explain)

            else: #Does exist on charitynavigator.org
                driver.find_element_by_xpath('//*[@id="searchresults"]/table[1]/tbody/tr[1]/td[1]/div/h3/a').click()
                time.sleep(1)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'our old design')]")) > 1: #Exists on charitynavigator.org, does not have full info
                    badCharNavUrl = driver.current_url
                    #Circumstance 3: Doesn't exist on irs site but has partial info on charitynavigator.org, now determining news links
                    driver.switch_to_window(newsHbp)
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c3Explain=c3Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c3Explain=c3Explain)

                else: #Exists on charitynavigator.org, does have full info
                    overallRating1 = "Overall Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[2]/td[2]").text
                    financialRating1 = "Financial Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[3]/td[2]").text
                    antRating1 = "Accountability & Transparency Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[4]/td[2]").text
                    programExpenses1 = "Percent of Charity's total expenses spent on the programs/services it delivers: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[1]/td[3]").text
                    adminExpenses1 = "Administrative Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[2]/td[3]").text
                    fundraisingExpenses1 = "Fundraising Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[3]/td[3]").text
                    goodCharNavUrl = driver.current_url
                    #Circumstance 4: Doesn't exist on irs site but has full info on charitynavigator.org, now determining news links
                    driver.switch_to_window(newsHbp)
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c4Explain=c4Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c4Explain=c4Explain)

        #Invalid characters in IRS name field
        elif len(driver.find_elements_by_xpath("//*[contains(text(), 'You have entered invalid characters in the Name field.')]")) == 1:
            return render_template('index.html', symbolsAlert=symbolsAlert)

        #Checking for status on IRS site
        else: #Does show up on IRS site
            driver.find_element_by_xpath("""/html/body/div[2]/div[2]/div/div/div[1]/div/div[2]/div/ul/li/h3/a""").click()
            goodIrsInfoUrl = driver.current_url

            #Checking for status on charitynavigator.org
            driver.switch_to_window(charNavHbp)
            driver.get('https://www.charitynavigator.org/index.cfm?keyword_list=' + urlName + '&bay=search.results')
            time.sleep(3)
            if len(driver.find_elements_by_xpath("//*[contains(text(), 'Advanced Search')]")) > 1: #Does not exist on charitynavigator.org
                #Circumstance 2: Does exist on irs site but not charitynavigator.org, now determining news links
                driver.switch_to_window(newsHbp)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                    return render_template('index.html',  credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, noRatingsYetII=noRatingsYetII, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c2Explain=c2Explain)
                else:
                    return render_template('index.html',  credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, noRatingsYetII=noRatingsYetII, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c2Explain=c2Explain)

            else: #Does exist on charitynavigator.org
                driver.find_element_by_xpath('//*[@id="searchresults"]/table[1]/tbody/tr[1]/td[1]/div/h3/a').click()
                time.sleep(1)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'our old design')]")) > 1: #Exists on charitynavigator.org, does not have full info
                    badCharNavUrl = driver.current_url
                    #Circumstance 5: Does exist on irs site but has partial info on charitynavigator.org, now determining news links
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c5Explain=c5Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c5Explain=c5Explain)

                else: #Exists on charitynavigator.org, does have full info
                    overallRating1 = "Overall Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[2]/td[2]").text
                    financialRating1 = "Financial Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[3]/td[2]").text
                    antRating1 = "Accountability & Transparency Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[4]/td[2]").text
                    programExpenses1 = "Percent of Charity's total expenses spent on the programs/services it delivers: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[1]/td[3]").text
                    adminExpenses1 = "Administrative Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[2]/td[3]").text
                    fundraisingExpenses1 = "Fundraising Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[3]/td[3]").text
                    goodCharNavUrl = driver.current_url
                    #Circumstance 6: Does exist on irs site and has full info on charitynavigator.org, now determining news links
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c6Explain=c6Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c6Explain=c6Explain)

    else:
        return render_template('index.html')
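
A hedged refactoring sketch for the branches above: every template variable name is taken from the render_template() calls in the original code, and build_context() is a helper name of my own. The shared keys are collected once and each circumstance adds only what differs:

def build_context(base, **extra):
    # Merge the shared template variables with the branch-specific ones.
    ctx = dict(base)
    ctx.update(extra)
    return ctx

# keys passed by every "circumstance" branch above
base_ctx = {'credsToI': credsToI, 'meaning': meaning, 'readFiveOh': readFiveOh,
            'credsToII': credsToII, 'ccSummary': ccSummary}

# e.g. circumstance 6, with and without news results
if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
    ctx = build_context(base_ctx, noNews=noNews, newsLook=newsLook, c6Explain=c6Explain)
else:
    ctx = build_context(base_ctx, newsOffer=newsOffer, newsLinkA=newsLinkA,
                        newsTitleA=newsTitleA, news01=news01, newsMore=newsMore,
                        c6Explain=c6Explain)
return render_template('index.html', **ctx)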
Example #15
0
File: __init__.py Project: vihndsm/Python
def sendwhatmsg_with_selenium(phone_no,
                              message,
                              time_hour,
                              time_min,
                              print_messages=True):
    """Same as sendwhatmsg() function, but this will not open chrome
Most of the process will be hidden, only a console will open
***If this is the first time\nYou must call pywhatkit.load_QRcode() and pywhatkit.add_driver_path(path)\nbefore claing this function or you will get error
Make sure whatsapp web is not already opened or you might get your number banned"""
    global sleeptm, path, headless_mode, curpth
    if "+" not in phone_no:
        raise CountryCodeException("Country code missing from phone_no")
    timehr = time_hour

    with open("pywhatkit_dbs.txt") as file:
        for lines in file:
            if "selpath" in lines:
                path = lines.replace("selpath : ", "")
    path = path.strip()

    if time_hour not in range(0, 25) or time_min not in range(0, 60):
        raise ValueError("Invalid time format")

    if time_hour == 0:
        time_hour = 24  # treat midnight as the end of the current day
    callsec = (time_hour * 3600) + (time_min * 60)

    curr = time.localtime()
    currhr = curr.tm_hour
    currmin = curr.tm_min
    currsec = curr.tm_sec

    currtotsec = (currhr * 3600) + (currmin * 60) + (currsec)
    lefttm = callsec - currtotsec

    if lefttm <= 0:
        lefttm = 86400 + lefttm

    if lefttm < 60:
        raise CallTimeException(
            "Call time must be greater than one minute as web.whatsapp.com takes some time to load"
        )

    date = "%s:%s:%s" % (curr.tm_mday, curr.tm_mon, curr.tm_year)
    time_write = "%s:%s" % (timehr, time_min)
    file = open("pywhatkit_dbs.txt", "a")
    file.write("Date: %s\nTime: %s\nPhone number: %s\nMessage: %s" %
               (date, time_write, phone_no, message))
    file.write("\n--------------------\n")
    file.close()
    sleeptm = lefttm - 60
    if print_messages:
        print(f"Message will be delivered in {prnt_sleeptm() + 60} seconds")
    time.sleep(sleeptm)

    options = webdriver.ChromeOptions()

    options.add_argument("--window-size=1920x1080")
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    )
    options.add_argument('--user-data-dir=%s/pywhatkit_data' % curpth)
    if headless_mode:
        options.add_argument("--headless")
    driver = webdriver.Chrome(path, options=options)
    url = ('https://web.whatsapp.com/send?phone=' + phone_no)
    driver.get(url)
    time.sleep(45)
    msg_box = driver.find_element_by_xpath(
        '//div[@contenteditable="true"][@data-tab="1"]')
    time.sleep(14)
    msg_box.send_keys(message + "\n")
    if print_messages:
        print("Message sent\nYou may close the console window now")
Example #16
0
def crawl_image_urls(keywords, engine="Google", max_number=10000,
                     face_only=False, safe_mode=False, proxy=None, 
                     proxy_type="http", quiet=False, browser="phantomjs", image_type=None, color=None):
    """
    Scrape image urls of keywords from Google Image Search
    :param keywords: keywords you want to search
    :param engine: search engine used to search images
    :param max_number: limit the max number of image urls the function output, equal or less than 0 for unlimited
    :param face_only: image type set to face only, provided by Google
    :param safe_mode: switch for safe mode of Google Search
    :param proxy: proxy address, example: socks5 127.0.0.1:1080
    :param proxy_type: socks5, http
    :param browser: browser to use when crawl image urls from Google & Bing 
    :return: list of scraped image urls
    """

    my_print("\nScraping From {0} Image Search ...\n".format(engine), quiet)
    my_print("Keywords:  " + keywords, quiet)
    if max_number <= 0:
        my_print("Number:  No limit", quiet)
        max_number = 10000
    else:
        my_print("Number:  {}".format(max_number), quiet)
    my_print("Face Only:  {}".format(str(face_only)), quiet)
    my_print("Safe Mode:  {}".format(str(safe_mode)), quiet)

    if engine == "Google":
        query_url = google_gen_query_url(keywords, face_only, safe_mode, image_type, color)
    elif engine == "Bing":
        query_url = bing_gen_query_url(keywords, face_only, safe_mode, image_type, color)
    elif engine == "Baidu":
        query_url = baidu_gen_query_url(keywords, face_only, safe_mode, color)
    else:
        return []  # unknown engine: return an empty list instead of None

    my_print("Query URL:  " + query_url, quiet)

    if engine != "Baidu":
        browser = browser.lower()
        if "chrome" in browser:
            chrome_path = shutil.which("chromedriver")
            if platform.system() == 'Darwin':
                chrome_path = "./macos/bin/chromedriver-mac"
            chrome_options = webdriver.ChromeOptions()
            if "headless" in browser:
                chrome_options.add_argument("headless")
            if proxy is not None and proxy_type is not None:
                chrome_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy))
            driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options)
        else:
            phantomjs_path = shutil.which("phantomjs")
            phantomjs_path = "./bin/phantomjs" if phantomjs_path is None else phantomjs_path
            phantomjs_args = []
            if proxy is not None and proxy_type is not None:
                phantomjs_args += [
                    "--proxy=" + proxy,
                    "--proxy-type=" + proxy_type,
                ]
            driver = webdriver.PhantomJS(executable_path=phantomjs_path,
                                         service_args=phantomjs_args,
                                         desired_capabilities=dcap)  # `dcap` is assumed to be a module-level PhantomJS capabilities dict

    if engine == "Google":
        driver.set_window_size(1920, 1080)
        driver.get(query_url)
        image_urls = google_image_url_from_webpage(driver, max_number, quiet)
    elif engine == "Bing":
        driver.set_window_size(1920, 1080)
        driver.get(query_url)
        image_urls = bing_image_url_from_webpage(driver)
    else:   # Baidu
        # driver.set_window_size(10000, 7500)
        # driver.get(query_url)
        # image_urls = baidu_image_url_from_webpage(driver)
        image_urls = baidu_get_image_url_using_api(keywords, max_number=max_number, face_only=face_only,
                                                   proxy=proxy, proxy_type=proxy_type)
    if engine != "Baidu":
        driver.close()

    output_num = min(max_number, len(image_urls))

    my_print("\n== {0} out of {1} crawled images urls will be used.\n".format(
        output_num, len(image_urls)), quiet)

    return image_urls[0:output_num]
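
A hedged usage example based on the signature above (the *_gen_query_url and *_from_webpage helpers are assumed to be defined elsewhere in the same module):

# Fetch up to 50 image URLs from Bing through a headless Chrome,
# routed through a local SOCKS5 proxy.
urls = crawl_image_urls(
    "cats", engine="Bing", max_number=50,
    proxy="127.0.0.1:1080", proxy_type="socks5",
    browser="chrome_headless",  # "chrome" selects chromedriver; "headless" adds the flag
)
print(len(urls), "URLs collected")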
Example #17
0
File: __init__.py Project: vihndsm/Python
def send_file(phone_no,
              path_to_file,
              time_hour,
              time_min,
              print_messages=True):
    """Send file of any format (png, mp3, txt etc)"""
    global sleeptm, path, headless_mode, curpth
    if "+" not in phone_no:
        raise CountryCodeException("Country code missing from phone_no")
    timehr = time_hour

    if not os.path.exists(path_to_file):
        raise FilePathException("No file found at %s" % path_to_file)

    with open("pywhatkit_dbs.txt") as file:
        for lines in file:
            if "selpath" in lines:
                chrpath = lines.replace("selpath : ", "")
    chrpath = chrpath.strip()

    if time_hour not in range(0, 25) or time_min not in range(0, 60):
        raise ValueError("Invalid time format")

    if time_hour == 0:
        time_hour = 24  # treat midnight as the end of the current day
    callsec = (time_hour * 3600) + (time_min * 60)

    curr = time.localtime()
    currhr = curr.tm_hour
    currmin = curr.tm_min
    currsec = curr.tm_sec

    currtotsec = (currhr * 3600) + (currmin * 60) + (currsec)
    lefttm = callsec - currtotsec

    if lefttm <= 0:
        lefttm = 86400 + lefttm

    if lefttm < 60:
        raise CallTimeException(
            "Call time must be greater than one minute as web.whatsapp.com takes some time to load"
        )

    date = "%s:%s:%s" % (curr.tm_mday, curr.tm_mon, curr.tm_year)
    time_write = "%s:%s" % (timehr, time_min)
    file = open("pywhatkit_dbs.txt", "a")
    file.write("Date: %s\nTime: %s\nPhone number: %s\nAttachment: %s" %
               (date, time_write, phone_no, path_to_file))
    file.write("\n--------------------\n")
    file.close()
    sleeptm = lefttm - 60
    if print_messages:
        print(f"Message will be delivered in {prnt_sleeptm() + 60} seconds")
    time.sleep(sleeptm)
    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=1920x1080")
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    )
    options.add_argument('--user-data-dir=%s/pywhatkit_data' % curpth)
    if headless_mode:
        options.add_argument("--headless")
    url = ('https://web.whatsapp.com/send?phone=' + phone_no)
    driver = webdriver.Chrome(chrpath, options=options)
    driver.get(url)
    time.sleep(40)
    driver.find_element_by_xpath('//span[@data-icon="clip"]').click()
    time.sleep(1)
    attch = driver.find_element_by_xpath(
        '//input[@accept="image/*,video/mp4,video/3gpp,video/quicktime"]')
    attch.send_keys(path_to_file)
    time.sleep(10)
    snd = driver.find_element_by_xpath('//span[@data-icon="send"]')
    time.sleep(4)
    snd.click()
    if print_messages:
        print(
            "Message sent\nIf it is a big file, it might take longer to deliver\nClose the console only after the message has been delivered."
        )
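
A hedged call example, assuming pywhatkit's setup steps (load_QRcode() and add_driver_path()) have already stored the driver path:

# Schedule a file to be sent at 22:30; the "+" country-code prefix is required.
send_file("+911234567890", "/home/user/photo.png", 22, 30)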
Example #18
0
    def upload(self, file_list, link):

        filename = 'engine/bilibili.cookie'
        # title_ = self.r_title
        videopath = self.assemble_videopath(file_list)

        # service_log_path = "{}/chromedriver.log".format('/home')
        options = webdriver.ChromeOptions()

        options.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=engine.chromedrive_path,
                                       chrome_options=options)
        # service_log_path=service_log_path)
        try:
            self.driver.get("https://www.bilibili.com")
            # driver.delete_all_cookies()
            if os.path.isfile(filename):
                with open(filename) as f:
                    new_cookie = json.load(f)

                for cookie in new_cookie:
                    # print(cookie)
                    if isinstance(cookie.get("expiry"), float):
                        cookie["expiry"] = int(cookie["expiry"])
                    self.driver.add_cookie(cookie)

            self.driver.get("https://member.bilibili.com/video/upload.html")

            # print(driver.title)
            self.add_videos(videopath)

            # js = "var q=document.getElementsByClassName('content-header-right')[0].scrollIntoView();"
            # driver.execute_script(js)

            cookie = self.driver.get_cookies()
            with open(filename, "w") as f:
                json.dump(cookie, f)

            self.add_information(link)

            self.driver.find_element_by_xpath(
                '//*[@class="upload-v2-container"]/div[2]/div[3]/div[5]/span[1]'
            ).click()
            # screen_shot = driver.save_screenshot('bin/1.png')
            # print('screenshot')
            time.sleep(3)
            upload_success = self.driver.find_element_by_xpath(
                r'//*[@id="app"]/div/div[3]/h3').text
            if upload_success == '':
                self.driver.save_screenshot('err.png')
                logger.info('Submission failed; screenshot saved')
                return
            else:
                logger.info(upload_success)
            # print('Submission complete!')
            # logger.info('%s submitted!' % title_)
            self.remove_filelist(file_list)
        except selenium.common.exceptions.NoSuchElementException:
            logger.exception('An error occurred')
        # except selenium.common.exceptions.TimeoutException:
        #     logger.exception('Timed out')
        except selenium.common.exceptions.TimeoutException:
            self.login(filename)

        finally:
            self.driver.quit()
            logger.info('Browser driver exited')
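
The upload method above keeps the Bilibili session alive by round-tripping cookies through a JSON file, casting float "expiry" values to int because add_cookie() rejects floats. The same pattern isolated as a pair of helpers (function names are my own):

import json
import os

def save_cookies(driver, filename):
    with open(filename, "w") as f:
        json.dump(driver.get_cookies(), f)

def load_cookies(driver, filename):
    if not os.path.isfile(filename):
        return
    with open(filename) as f:
        for cookie in json.load(f):
            if isinstance(cookie.get("expiry"), float):
                cookie["expiry"] = int(cookie["expiry"])  # Chrome rejects float expiry values
            driver.add_cookie(cookie)

# Usage: navigate to the domain first, then inject the stored cookies.
# driver.get("https://www.bilibili.com"); load_cookies(driver, "engine/bilibili.cookie")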
Example #19
0
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--incognito')
        # options.add_argument('--headless')

        self.driver = webdriver.Chrome(chrome_options=options)
Example #20
0
 def open_chrome(self, executable_path="../driver/chromedriver"):
     chrome_options = webdriver.ChromeOptions()
     chrome_options.add_argument('--no-sandbox') # refer: https://stackoverflow.com/questions/43008622/python-linux-selenium-chrome-not-reachable
     chrome_options.add_argument('--disable-gpu')
     self.browser = webdriver.Chrome(executable_path=executable_path, chrome_options=chrome_options)
     self.browser.get(self.init_url)
def get_npc_all(newtai_url_list):
    driver_path = "./chromedriver"

    option = webdriver.ChromeOptions()
    option.add_argument('--window-size=1280,1024')
    browser = webdriver.Chrome(executable_path=driver_path, chrome_options=option)

    browser.get('https://rent.591.com.tw/?kind=0&region=1')
    browser.implicitly_wait(1)

    browser.find_element_by_xpath("//dd[@data-id=3]").click()

    while True:
        html_source = browser.page_source

        soup = bs4.BeautifulSoup(html_source, 'html.parser')

        next_page = soup.find_all("a", class_="last")

        print(len(next_page))

        if len(next_page) == 0:

            h3_list = soup.find_all("h3")
            title_list = []
            for i in h3_list:
                tmp = i.find_all("a")
                title_list.append(tmp[0])

            for j in title_list:
                tmp_str = re.findall("rent.591\\S+html", str(j))[0]
                # print(tmp_str)
                newtai_url_list.append("https://%s" % tmp_str)
            # retry the "next page" click up to three times (see the explicit-wait sketch below)
            for attempt in range(3):
                try:
                    browser.find_element_by_class_name("pageNext").click()
                    break
                except Exception:
                    if attempt == 2:
                        raise

        elif len(next_page) == 1:

            h3_list = soup.find_all("h3")
            title_list = []
            for i in h3_list:
                tmp = i.find_all("a")
                title_list.append(tmp[0])

            for j in title_list:
                tmp_str = re.findall("rent.591\\S+html", str(j))[0]
                # print(tmp_str)
                newtai_url_list.append("https://%s" % tmp_str)

            break

    # newtai_url_list = list(set(newtai_url_list))
    # print(newtai_url_list)
    # print(len(newtai_url_list))

    browser.quit()  # quit() closes every window and ends the driver session
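
The loop above re-parses page_source with BeautifulSoup on every iteration to detect the last page. A hedged alternative sketch using Selenium's explicit waits for the same "pageNext" element (class name taken from the code above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_next_page(browser, timeout=10):
    # Click the "next page" link once it is clickable; return False on the last page.
    try:
        next_btn = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "pageNext")))
        next_btn.click()
        return True
    except TimeoutException:
        return False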
Example #22
0
class TumblrReg():
    def tf_get_proxy():
        # proxies = set()
        proxy_list = list()
        try:
            url = "https://free-proxy-list.net/"
            adapter = HTTPAdapter(max_retries=2)

            request_session = requests.Session()
            request_session.mount(url, adapter)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
            }
            r = requests.get(url, headers=headers, verify=False, timeout=5)
            soup = BeautifulSoup(r.content, 'html.parser')
            proxy_data = soup.select('td:nth-child(2) , td:nth-child(1)')
            for i in range(0, 20, 2):
                proxy_list.append(
                    str(proxy_data[i].text) + ':' +
                    str(proxy_data[i + 1].text))
            # proxy = str(proxy_data[i].text) + ':' + str(proxy_data[i + 1].text)
            # proxies.add(proxy)
            return proxy_list
        except Exception as e:
            print(" --->", e)
            return proxy_list  # return whatever was collected instead of None

    ALL_PROXIES = tf_get_proxy()

    options = webdriver.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--window-position=0,0')
    options.add_argument('--disable-infobars')
    options.add_argument('--window-size=1920,1080')

    def tf_proxy_driver(PROXIES, options=options):
        pxy = ''
        # if PROXIES:
        #     pxy = PROXIES[-1]
        # else:
        #     print("--- Proxies used up (%s)" % len(PROXIES))
        #
        # options.add_argument('--proxy-server=%s' % pxy)

        opts = ChromeOptions()
        opts.add_experimental_option("detach", True)
        # driver = Chrome(chrome_options=opts)

        Chromedriver = webdriver.Chrome(Chrome_path, chrome_options=opts)
        Chromedriver.implicitly_wait(Wait_3)
        print("proxy is - ", pxy)
        return Chromedriver

    def tf_use_same_session(ChromeDriver):
        executor_url = ChromeDriver.command_executor._url  # "http://127.0.0.1:60622/hub"
        session_id = ChromeDriver.session_id  # '4e167f26-dc1d-4f51-a207-f761eaf73c31'
        print(session_id)
        driver_temp = webdriver.Remote(command_executor=executor_url,
                                       desired_capabilities={})
        driver_temp.close()
        driver_temp.session_id = session_id
        return driver_temp

    def tf_check_folder_path(new_folder_create=""):
        path = os.path.dirname(os.getcwd())
        path = path + "/" + "source_page_screen_shot_media"
        new_folder = date.today()
        directory = path + "/" + str(new_folder)
        if not os.path.exists(directory):
            os.makedirs(directory)
        if new_folder_create != "":
            directory = directory + "/" + str(new_folder_create)
            if not os.path.exists(directory):
                os.makedirs(directory)
        return directory  # also return the path when new_folder_create is empty

    def tf_check_and_rename(file, add=0):
        original_file = file
        if add != 0:
            split = file.split(".")
            part_1 = split[0] + "_" + str(add)
            file = ".".join([part_1, split[1]])
        if not os.path.isfile(file):
            os.rename(original_file, file)
        else:
            add += 1
            TumblrReg.tf_check_and_rename(original_file, add)

    def tf_screen_shots(driver, scroll_delay=0.3):
        path = TumblrReg.tf_check_folder_path("screenshot")
        title = driver.title
        if title != "":
            title_length = len(str(title))
            if title_length > 26:
                title = title.replace("@", "").replace("/", "").replace(
                    "$", "").replace(".", "").replace(":",
                                                      "").replace("|", "")
                title = str(title)[0:25]
        else:
            title = driver.current_url
            title = title.replace("@", "").replace("/", "").replace(
                "$", "").replace(".", "").replace(":", "").replace("|", "")

        file_name = path + "/" + title + ".png"
        if os.path.exists(file_name):
            TumblrReg.tf_check_and_rename(file_name)
        device_pixel_ratio = driver.execute_script(
            'return window.devicePixelRatio')
        total_height = driver.execute_script(
            'return document.body.parentNode.scrollHeight')
        viewport_height = driver.execute_script('return window.innerHeight')
        total_width = driver.execute_script('return document.body.offsetWidth')
        viewport_width = driver.execute_script(
            "return document.body.clientWidth")

        # this implementation assumes viewport_width == total_width
        assert (viewport_width == total_width)

        # scroll the page, take screenshots and save screenshots to slices
        offset = 0  # height
        slices = {}

        while offset < total_height:
            if offset + viewport_height > total_height:
                offset = total_height - viewport_height

            driver.execute_script('window.scrollTo({0}, {1})'.format(
                0, offset))
            time.sleep(scroll_delay)

            img = Image.open(BytesIO(driver.get_screenshot_as_png()))
            slices[offset] = img

            offset = offset + viewport_height
            if total_height < 10000:
                update_total_height = driver.execute_script(
                    'return document.body.parentNode.scrollHeight')
                if total_height != update_total_height:
                    total_height = update_total_height
        # combine image slices
        stitched_image = Image.new('RGB', (total_width * device_pixel_ratio,
                                           total_height * device_pixel_ratio))
        for offset, image in slices.items():
            stitched_image.paste(image, (0, offset * device_pixel_ratio))
        stitched_image.save(file_name)
        driver.execute_script('window.scrollTo({0}, {1})'.format(0, 0))

    def tf_source_code(driver):
        path = TumblrReg.tf_check_folder_path("sourcepage")
        title = driver.title
        if title != "":
            title_length = len(str(title))
            if title_length > 26:
                title = title.replace("@", "").replace("/", "").replace(
                    "$", "").replace(".", "").replace(":",
                                                      "").replace("|", "")
                title = str(title)[0:25]
            else:
                title = title.replace("@", "").replace("/", "").replace(
                    "$", "").replace(".", "").replace(":",
                                                      "").replace("|", "")
        else:
            title = driver.current_url
            title = title.replace("@", "").replace("/", "").replace(
                "$", "").replace(".", "").replace(":", "").replace("|", "")

        TumblrReg.tf_File_name = path + "/" + title + ".html"
        if os.path.exists(TumblrReg.tf_File_name):
            TumblrReg.tf_check_and_rename(TumblrReg.tf_File_name)
        pagesource = driver.page_source.encode('ascii', 'ignore')
        soup = BeautifulSoup(pagesource, 'html.parser')
        # Create text file, then write page source to the file
        fh = open(TumblrReg.tf_File_name, 'w')
        fh.write(str(soup.prettify()))
        fh.close()

    def tf_Type_driver_scroller(Chromedriver):
        total_height = Chromedriver.execute_script(
            'return document.body.parentNode.scrollHeight')
        viewport_height = Chromedriver.execute_script(
            'return window.innerHeight')
        total_width = Chromedriver.execute_script(
            'return document.body.offsetWidth')
        viewport_width = Chromedriver.execute_script(
            "return document.body.clientWidth")

        # this implementation assumes viewport_width == total_width
        assert (viewport_width == total_width)

        # scroll the page viewport by viewport (no screenshots taken here)
        offset = 0  # height
        while offset < total_height:
            if offset + viewport_height > total_height:
                offset = total_height - viewport_height

            Chromedriver.execute_script('window.scrollTo({0}, {1})'.format(
                0, offset))
            time.sleep(Wait_1)

            offset = offset + viewport_height
            if total_height < 10000:
                update_total_height = Chromedriver.execute_script(
                    'return document.body.parentNode.scrollHeight')
                if total_height != update_total_height:
                    total_height = update_total_height
        Chromedriver.execute_script('window.scrollTo({0}, {1})'.format(0, 0))

    #Target URL
    def tf_To_url():
        url = 'https://www.tumblr.com/'
        return url

    #Find UI item - finding a UI element and clicking it must happen in the same function;
    #storing the element from a separate function may not work in some situations (the reference can go stale).
    #Click my account
    def tf_Click_Sign_up(Chromedriver):
        try:
            Sign_up = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[1]')
            Sign_up.click()
            time.sleep(Wait_2)
        except:
            pass

    def tf_Type_email(Chromedriver):
        try:
            email = Chromedriver.find_element_by_id('signup_email')
            email.clear()
            email.send_keys("*****@*****.**")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_password(Chromedriver):
        try:
            password = Chromedriver.find_element_by_id('signup_password')
            password.clear()
            password.send_keys("Xysbsg@1238#76Bd")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_user_name(Chromedriver):
        try:
            username = Chromedriver.find_element_by_id('signup_username')
            username.clear()
            username.send_keys("johnbradman2019")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_user_name_suggest(Chromedriver):
        try:
            username_suggest = Select(
                Chromedriver.find_element_by_id('suggested_usernames'))
            username_suggest.select_by_index(0)
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_do_signup(Chromedriver):
        try:
            do_signup = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[3]')
            do_signup.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_Age_Confirm(Chromedriver):
        try:
            age_input = Chromedriver.find_element_by_id("signup_age")
            age_input.clear()
            age_input.send_keys("29")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_User_TandC(Chromedriver):
        try:
            t_and_c = Chromedriver.find_element_by_id("signup_tos")
            t_and_c.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_do_signup_done(Chromedriver):
        try:
            do_signup = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[4]')
            do_signup.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_recaptch_click(Chromedriver):
        try:
            recaptch = Chromedriver.find_element_by_id("recaptcha-anchor")
            recaptch.click()
            time.sleep(Wait_2)
        except:
            pass

    def tf_do_almost_done(Chromedriver):
        try:
            almost_done = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[5]/span')
            almost_done.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_skip_1(Chromedriver):
        try:
            skip_1 = Chromedriver.find_element_by_xpath(
                '//*[@id="onboarding_actions_index"]/div[2]/div[3]/div[2]/button[1]'
            )
            skip_1.click()
            time.sleep(Wait_1)
        except:
            pass
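
tf_use_same_session() above relies on a known trick: a Remote driver pointed at the original command_executor URL can adopt an existing session_id, so a later script can drive a browser that is already open. A hedged usage sketch built from the class's own methods:

# First run: keep the browser open and note its coordinates.
driver = TumblrReg.tf_proxy_driver(TumblrReg.ALL_PROXIES)
print(driver.command_executor._url, driver.session_id)

# Later (same process or another script): re-attach without launching Chrome again.
reattached = TumblrReg.tf_use_same_session(driver)
reattached.get(TumblrReg.tf_To_url())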
Example #23
0
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver_option = webdriver.ChromeOptions()
driver_option.add_argument("--incognito")
chromedriver_path = '/home/larri/Downloads/chromedriver'
def create_webdriver():
    return webdriver.Chrome(executable_path=chromedriver_path, chrome_options=driver_option)
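
Most snippets on this page pass executable_path/chrome_options, both of which Selenium 4 removed. A hedged sketch of the modern equivalent (the driver path is a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--incognito")
service = Service("/path/to/chromedriver")  # placeholder path
driver = webdriver.Chrome(service=service, options=options)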
# -*- coding: utf-8 -*-

from selenium import webdriver
import time

chrome_opt = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}  # 不加载图片
chrome_opt.add_experimental_option('prefs', prefs)

browser = webdriver.Chrome(
    executable_path="/Users/pengtuo/code/Python/ArticleSpider/chromedriver",
    chrome_options=chrome_opt)

# simulate Zhihu sign-in with Selenium
browser.get('https://www.zhihu.com/#signin')

browser.find_element_by_css_selector(
    '.view-signin input[name="account"]').send_keys('xxx')
browser.find_element_by_css_selector(
    '.view-signin input[name="password"]').send_keys('xxx')
browser.find_element_by_css_selector('.view-signin button.sign-button').click()

# simulate Weibo sign-in with Selenium
browser.get('http://weibo.com/')
time.sleep(5)  # wait for the page to load before locating elements

browser.find_element_by_css_selector('input[id="loginname"]').send_keys('xxx')
browser.find_element_by_css_selector(
    '.info_list.password input[node-type="password"]').send_keys('xxx')
browser.find_element_by_css_selector(
    '.info_list.login_btn a[node-type="submitBtn"]').click()
Example #25
0
 def __init__(self, username, password):
     self.browserProfile = webdriver.ChromeOptions()
     self.browserProfile.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
     self.browser = webdriver.Chrome('chromedriver.exe', chrome_options=self.browserProfile)
     self.username = username
     self.password = password
 def setUp(self):
     self.options = webdriver.ChromeOptions()
     self.options.add_argument('--incognito')
     self.options.add_argument('--start-maximized')
     self.drivers = []
Example #27
0
import pandas as pd
from selenium import webdriver
import time
import os
import chromedriver_binary  # no need to add chromedriver to PATH
import shutil

opciones = webdriver.ChromeOptions()
prefs = {
    'download.default_directory':
    'C:\\Users\\luisb\\Code-Font\\prueba\\dataset\\'
}

opciones.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(options=opciones)

browser.get(
    "https://www.worldbank.org/en/projects-operations/procurement/debarred-firms#"
)
time.sleep(20)  # wait before looking for the button

# check whether the file already exists
if os.path.isfile(
        'C:/Users/luisb/Code-Font/prueba/dataset/Sanctioned individuals and firms.xlsx'
):
    print("Removing previous file...")
    os.remove(
        'C:/Users/luisb/Code-Font/prueba/dataset/Sanctioned individuals and firms.xlsx'
    )
    # remove the old copy so the next download is not renamed or corrupted
browser.find_element_by_class_name("k-grid-excel").click()
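
The fixed time.sleep(20) above only guesses when the page is ready, and nothing waits for the export itself. A hedged sketch that polls for the expected file until Chrome's .crdownload partials disappear (the path reuses the prefs value above; the helper name is my own):

import glob
import os
import time

def wait_for_download(target_path, timeout=60):
    # Return True once target_path exists and no .crdownload partial remains beside it.
    deadline = time.time() + timeout
    directory = os.path.dirname(target_path)
    while time.time() < deadline:
        if os.path.exists(target_path) and not glob.glob(
                os.path.join(directory, "*.crdownload")):
            return True
        time.sleep(1)
    return False

# browser.find_element_by_class_name("k-grid-excel").click()
# wait_for_download('C:/Users/luisb/Code-Font/prueba/dataset/Sanctioned individuals and firms.xlsx')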
Example #28
0
    def Process(self, filepath, name):

        options = webdriver.ChromeOptions()

        profile = {
            "plugins.plugins_list": [{
                "enabled": False,
                "name": "Chrome PDF Viewer"
            }],  # Disable Chrome's PDF Viewer
            "download.default_directory":
            filepath,
            "download.extensions_to_open":
            "applications/pdf"
        }
        options.add_experimental_option("prefs", profile)

        # the chromedriver path is an optional argument; if not specified, Chrome() searches PATH
        self._driver = webdriver.Chrome(chrome_options=options)
        wait = WebDriverWait(self._driver, 60)
        self._driver.get('https://www.au10tixportalusa.com/VanillaRest/')
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '/html/body/div/article/div[2]/form/button')))
        ##log in
        self._driver.find_element_by_name('j_username').send_keys(
            self._username)
        self._driver.find_element_by_name('j_password').send_keys(
            self._password)
        self._driver.find_element_by_xpath(
            '/html/body/div/article/div[2]/form/button').click()

        ##upload
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH,
                 '/html/body/div/article/ng-view/div/section/div/section/a')))
        self._driver.find_element_by_xpath(
            '/html/body/div/article/ng-view/div/section/div/section/a').click(
            )

        wait.until(
            EC.presence_of_element_located((
                By.XPATH,
                '//*[@id="addFilesOneSide"]/div/div/div[2]/div/div[1]/div/div/div[2]/div/input'
            )))

        self._driver.find_element_by_xpath(
            '//*[@id="addFilesOneSide"]/div/div/div[2]/div/div[1]/div/div/div[2]/div/input'
        ).send_keys(filepath + name)

        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH,
                 '//*[@id="addFilesOneSide"]/div/div/div[3]/button[2]')))
        self._driver.find_element_by_xpath(
            '//*[@id="addFilesOneSide"]/div/div/div[3]/button[2]').click()
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH,
                 '//*[@id="mainTable"]/table/tbody/tr[1]/td[1]/div/label')))
        self._driver.find_element_by_xpath(
            '//*[@id="mainTable"]/table/tbody/tr/td[3]').click()
        handles = self._driver.window_handles
        self._driver.switch_to.window(handles[1])
        wait.until(
            EC.presence_of_element_located(
                (By.XPATH,
                 '/html/body/div/article/ng-view/div/section[1]/div/span')))
        self._this_page = self._driver.page_source
        time.sleep(1)
        #wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div/article/ng-view/div/section[2]/div[2]')))

        try:
            self._driver.find_element_by_xpath(
                '/html/body/div/article/ng-view/div/section[2]/div[2]').click(
                )
            #WebDriverWait(self._driver, 60).until(lambda x: x.find_element_by_xpath('/html/body/div/article/ng-view/div/section[2]/div[2]')).click()

            self._driver.find_element_by_xpath(
                '/html/body/div/article/ng-view/div/section[2]/div[2]/div/ul/li/a'
            ).click()

            self._result = BeautifulSoup(self._this_page,
                                         'lxml').find('span',
                                                      class_='ng-binding').text

            wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, "//*[text()='PDF']")))
            #print(str(BeautifulSoup(self._this_page,'lxml').find(text='PDF').parent))
            self._downloadname = re.findall(
                r"\d/(.+)\.pdf",
                str(
                    BeautifulSoup(self._this_page,
                                  'lxml').find(text='PDF').parent))[0]
            #self._downloadname = re.findall(r"\d/(.+).pdf", pdf_url)[0]
            #old = max([f for f in os.listdir(filepath)], key=os.path.getctime)

            old = filepath + self._downloadname + '.pdf'
            #print(old)
            while not os.path.exists(old):
                time.sleep(1)

            newfilepath = filepath + os.path.basename(
                os.path.dirname(filepath)) + '-Au10tix.pdf'
            if not os.path.exists(newfilepath):
                os.rename(old, newfilepath)
            else:
                print('File ' + os.path.basename(os.path.dirname(filepath)) +
                      '-Au10tix.pdf' + ' exists.')
        except:
            self._result = 'aborted'
            print('Processing Request Rejected')

        self._driver.quit()
# import web driver
from selenium import webdriver
from parsel import Selector
import urllib
import os
import sched
import time
from selenium.webdriver.common.keys import Keys


OUTPUT_FOLDER = 'real_captcha_dataset'

# specifies the options to the chromedriver.exe
options = webdriver.ChromeOptions()
#options.add_argument('--headless')


url = 'https://www.tis.bizfile.gov.sg'
driver = webdriver.Chrome('/Users/merlinegalite/Desktop/octobot/Scraping/LinkedInScraping/chromedriver', options=options)


driver.get('https://www.bizfile.gov.sg/ngbbizfileinternet/faces/oracle/webcenter/portalapp/pages/BizfileHomepage.jspx?_afrWindowId=null&_afrLoop=11499874782621942&_afrWindowMode=0&_adf.ctrl-state=10irrn140w_4#%40%3F_afrWindowId%3Dnull%26_afrLoop%3D11499874782621942%26_afrWindowMode%3D0%26_adf.ctrl-state%3D2w324sfb3_4')

query_button = driver.find_element_by_xpath('//*[@class="search_Icon2 af_commandImageLink p_AFTextOnly"]')
query_button.click()

time.sleep(4)

sel = Selector(text=driver.page_source)

Example #30
0
 def __init__(self):
     scrapy.Spider.__init__(self)
     options = webdriver.ChromeOptions()
     options.add_argument('headless')
     self.browser = webdriver.Chrome('chromedriver', chrome_options=options)