Пример #1
0
def deleteEmail(email, selection, system):
    """Log in through the site's sign-in form and delete every listed email.

    :param email: account email typed into the login form
    :param selection: id of the element clicked to open the login form
    :param system: optional id of a system option to click; skipped when None
    NOTE(review): relies on module-level ``index_url`` and ``password``.
    """
    driver = Chrome("chromedriver")
    # try/finally so the browser is closed even if a step fails mid-way
    # (the original leaked the driver on any exception).
    try:
        driver.get(index_url)
        driver.find_element_by_id(selection).click()
        time.sleep(1)
        driver.find_element_by_id("email").send_keys(email)
        driver.find_element_by_id("password").send_keys(password)
        if system is not None:
            driver.find_element_by_id(system).click()
        driver.find_element_by_id("signInButton").click()
        time.sleep(4)
        driver.switch_to.alert.accept()
        time.sleep(1)
        driver.switch_to.alert.accept()
        # Each deletion reloads the list, so always click the first button.
        for _ in range(len(driver.find_elements_by_class_name("deleteKey"))):
            driver.find_element_by_id("deleteKey0").click()
            driver.switch_to.alert.accept()
            time.sleep(4)
            driver.switch_to.alert.accept()
        remaining = driver.find_elements_by_class_name("deleteKey")
        # Bug fix: the original asserted `emailNum != 0` (a list compared to
        # an int), which is always true and could never fail.
        assert len(remaining) == 0, "Not all emails deleted"
        time.sleep(1)
    finally:
        driver.close()
Пример #2
0
    def __init__(self):
        """Collect rightmove.co.uk rental listing URLs for every area in the
        module-level ``neighbourhoods`` list into ``self.list_urls``.

        For each area a fresh Chrome session searches rentals (0-2 bedrooms,
        added within the last day) and follows the pagination until the
        "next" button is disabled.  A TimeoutException for one area is
        reported and the remaining areas are still processed.
        """
        self.list_urls = []

        for area in neighbourhoods:
            try:
                # NOTE(review): hard-coded local chromedriver path.
                webdriver = '/Users/RitaFigueiredo/Documents/chromedriver'
                driver = Chrome(webdriver)
                url = 'https://rightmove.co.uk'
                driver.get(url)
                search = WebDriverWait(driver, DELAY).until(
                    EC.presence_of_element_located((By.ID, 'searchLocation')))
                search.send_keys(area)
                rent = driver.find_element_by_id('rent')
                rent.click()
                time.sleep(1)

                # Some searches show a disambiguation dropdown; pick the
                # first suggested location when it is visible.
                if driver.find_element_by_id(
                        'locationIdentifier').is_displayed():
                    select_location = Select(
                        driver.find_element_by_id('locationIdentifier'))
                    select_location.select_by_index(0)

                # Filters: 0-2 bedrooms, listed within the last day.
                select_min_bedrooms = Select(
                    driver.find_element_by_id('minBedrooms'))
                select_min_bedrooms.select_by_value('0')
                select_max_bedrooms = Select(
                    driver.find_element_by_id('maxBedrooms'))
                select_max_bedrooms.select_by_value('2')
                select_added_to_site = Select(
                    driver.find_element_by_id('maxDaysSinceAdded'))
                select_added_to_site.select_by_value('1')
                rent = driver.find_element_by_id('submit')
                rent.click()

                time.sleep(2)

                # Collect the links from the first page of results.
                home_links = driver.find_elements_by_class_name(
                    'propertyCard-link')

                for link in home_links:
                    self.list_urls.append(link.get_attribute('href'))

                time.sleep(2)
                # Follow pagination until the "next" button is disabled.
                while True:
                    next_button = driver.find_element_by_class_name(
                        'pagination-button.pagination-direction.pagination-direction--next'
                    )

                    if not next_button.is_enabled():
                        break
                    next_button.click()
                    time.sleep(3)
                    home_links = driver.find_elements_by_class_name(
                        'propertyCard-link')
                    for link in home_links:
                        self.list_urls.append(link.get_attribute('href'))
                    time.sleep(3)
            except TimeoutException:
                print("Loading took too much time!")
            time.sleep(2)
Пример #3
0
class RecipeScrape:
    """Scrape a single recipe page, choosing the right page-version parser.

    Relies on module-level config: IS_HEADLESS_BROWSER, IS_LOCAL_CHROMEDRIVER,
    WEBDRIVER_FILE and MAX_REVIEW_SCRAPE_PER_RECIPE.
    """

    def __init__(self, recipe_link):
        """Build a configured Chrome driver for *recipe_link*."""
        self.recipe_link = recipe_link

        options = webdriver.ChromeOptions()
        if IS_HEADLESS_BROWSER:
            options.add_argument('--headless')

        # Skip image downloads to speed up page loads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)

        options.add_argument("--disable-gpu")
        options.add_argument("--disable-infobars")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-popup-blocking")

        if IS_LOCAL_CHROMEDRIVER:
            self.driver = Chrome(WEBDRIVER_FILE, options=options)
        else:
            self.driver = Chrome(options=options)

    def scrape(self):
        """Open the recipe URL and delegate to the detected scrape version.

        Raises InstanceIPBlacklistedException when the page body is empty
        (a sign the instance's IP is blocked).  Always quits the driver.
        """
        # The original wrapped this in `except Exception: raise`, which is a
        # no-op; `try/finally` alone gives the same cleanup guarantee.
        try:
            self.driver.get(self.recipe_link)
            print("URL opened: {}".format(self.recipe_link))

            if "<body></body>" in self.driver.page_source:
                raise InstanceIPBlacklistedException()

            self.determine_scrape_version()
            self.scrape_version.scrape(self.driver)
        finally:
            self.driver.quit()

    def determine_scrape_version(self):
        """Pick ScrapeRecipeV1/V2 based on markers found on the page.

        We use some random indicator on the page to determine the version,
        which may not be entirely robust; may want a better indicator later.
        """
        if len(self.driver.find_elements_by_class_name("author-block")) > 0:
            self.scrape_version = ScrapeRecipeV1(MAX_REVIEW_SCRAPE_PER_RECIPE)
        elif len(self.driver.find_elements_by_class_name("recipe-container-outer")) > 0:
            self.scrape_version = ScrapeRecipeV2(MAX_REVIEW_SCRAPE_PER_RECIPE)
        else:
            raise Exception("Invalid or unexpected version of recipe page")
Пример #4
0
def _get_product_image(chrome_driver: webdriver.Chrome) -> str:
    """
    Retrieve product image URL.

    Hovers over the product photo thumbnails until the detail-photo element
    appears, then extracts the image URL from its inline style.

    :param chrome_driver: chrome web driver instance
    :return: product image url, or "" when no image could be found
    """
    product_photos = chrome_driver.find_elements_by_class_name(
        config["PRODUCT_PHOTOS"])
    item = None

    # With several thumbnails, start from the last one.
    if len(product_photos) != 1:
        product_photos.reverse()

    while product_photos and not item:
        for product_photo in product_photos:
            hover_to_photos(chrome_driver, product_photo)

        soup = BeautifulSoup(chrome_driver.page_source, "html.parser")
        item = soup.find(class_=config["PRODUCT_DETAIL_PHOTO"])

        if not item:
            # Drop the last thumbnail and try again with the rest.
            product_photos.pop()

    # Bug fix: the original dereferenced `item` unconditionally and crashed
    # with AttributeError when no detail photo was ever found (e.g. when
    # product_photos started out empty).
    if item is None:
        return ""

    item_image = item.get("style")
    if item_image:
        # style looks like: background-image: url("...") — take the URL.
        return item_image.split(" ")[1].split('"')[1]
    return ""
Пример #5
0
def get_ad_info_from_link(url: str, browser: webdriver.Chrome):
    """
    With the link to an ads page on the Google Transparency Report (from the
    csv), get the assets, visible URL and description for the ad.

    :param url: link to the ad page
    :param browser: Chrome driver used for navigation
    :return: (assets, url, desc) tuple, or None when the page has no ad
        container or an unexpected layout
    """
    browser.get(url)
    time.sleep(1.5)

    containers = browser.find_elements_by_class_name('ad-container')
    if not containers:
        return None

    parsed = BeautifulSoup(containers[0].get_attribute("innerHTML"),
                           'html.parser')
    divs = parsed.find_all("div")

    # A well-formed ad container has exactly four divs.
    if len(divs) != 4:
        print("Malformed Webpage: {}".format(url))
        return None

    assets = divs[0].get_text()
    vis_url = divs[1].get_text().split(" ")[1]
    desc = divs[3].get_text()

    return assets, vis_url, desc
Пример #6
0
def push_replay_button(driver: webdriver.Chrome):
    """Dismiss the prize popup, tick the consent checkbox and press the
    last custom (replay) button, silently skipping any step whose element
    never becomes visible."""
    # Step 1: close the prize overlay, if one is shown.
    try:
        close_btn = WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((
                By.CSS_SELECTOR,
                "div.prize-overlay div.popup-box div.popup-header button.close-button",
            )))
        button_click(driver, close_btn)
    except TimeoutException:
        pass

    # Step 2: accept the checkbox, then press the main button.
    try:
        consent = WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "span.mc-checkmark")))
        button_click(driver, consent)
        button_click(driver,
                     driver.find_elements_by_class_name("main-button")[0])
    except TimeoutException:
        pass

    # Step 3: press the last of the custom buttons.
    try:
        candidates = WebDriverWait(driver, 5).until(
            EC.visibility_of_all_elements_located(
                (By.CSS_SELECTOR, "button.custom-button")))
        button_click(driver, candidates[-1])
    except TimeoutException:
        pass
Пример #7
0
def _sanity_checks(driver: webdriver.Chrome):
    """Verify the first graph still carries the two expected legend labels.

    :param driver: Chrome driver with the dashboard page loaded
    :raises Exception: when the legend labels are missing or changed
    """
    elems = driver.find_elements_by_class_name("igc-legend-label")
    labels = [e.text for e in elems]
    # Robustness fix: fewer than two labels also means the page changed;
    # the original raised a bare IndexError instead of the clear message.
    if (len(labels) < 2
            or labels[0] != "Fået 1. vaccine"
            or labels[1] != "Modtaget 2. vaccine*"):
        raise Exception(
            "First graph structure has changed. Consider manually checking the axis labels in the browser."
        )
Пример #8
0
def lookup_game_price(driver: webdriver.Chrome, game_name: str):
    """Search g2a.com for *game_name* and return the price text of the first
    offer whose seller has more than `min_transactions` transactions.

    :param driver: Chrome driver to browse with
    :param game_name: title to search for
    :return: the offer price text, or None when no seller qualifies
    """
    # Visit the landing page first so the cookie banner can be dismissed.
    # Idiom fix: this was an f-string with no placeholders.
    driver.get("https://www.g2a.com/")
    wait_set_time_plus_random_time(2)
    cookie_clicker(driver)
    driver.get(
        f"https://www.g2a.com/search?query={game_name.replace(' ', '%2B')}")

    WebDriverWait(driver, 800).until(
        expected_conditions.visibility_of_element_located(
            (By.CLASS_NAME, "offers-list")))
    print("found offer site")

    offer_list_elements = driver.find_elements_by_class_name(
        "offers-list__element")
    for list_element in offer_list_elements:
        transactions = int(
            list_element.find_element_by_class_name(
                "seller-info__transactions").text)
        print(transactions)
        if transactions > min_transactions:
            offer_price = list_element.find_element_by_class_name(
                "offer__price").text
            print(offer_price)
            return offer_price
    # No qualifying seller; pause before the caller's next lookup.
    wait_set_time_plus_random_time(10)
Пример #9
0
def on_button_clicked():
    """For each word of the (module-level) sentence, search megalobiz for
    LRC lyrics, expand the result entries, download the first lyric file and
    print the newest file that landed in `lyricPath`.

    Relies on module-level names: opts, sentence, data, lyricPath.
    """
    browser = Chrome(options=opts)
    for word in sentence.split():
        data['word'] = []
        browser.get('https://www.megalobiz.com/lrc/maker/download-music-lyrics-lrc-generated-files')

        search_form = browser.find_element_by_id('lrc_search_phrase')
        search_form.send_keys(word)
        time.sleep(2)

        # Bug fix: the original used find_element_by_class_name (singular),
        # which returns one WebElement — iterating it raises TypeError.
        # Use the plural form to get the list of expandable entries.
        for lyrics in browser.find_elements_by_class_name("more_content"):
            lyrics.click()

        download_button = browser.find_elements_by_class_name("lyrics_button")
        download_button[0].click()
        time.sleep(1)

        # The newest file in the download folder is the one just fetched.
        list_of_files = glob.glob(os.path.join(lyricPath, "*"))
        latest_file = max(list_of_files, key=os.path.getctime)
        latest_file = latest_file.rsplit('\\', 1)[-1]
        print(latest_file)
Пример #10
0
def get_games_id(comp):
    """Collect ESPN game ids for competition *comp* for every day between
    START_DATE and END_DATE.

    :param comp: ESPN league/competition slug used in the results URL
    :return: list of (game_id, day) tuples
    """
    dates = list(date_range(START_DATE, END_DATE))
    games_id = []

    chrome_options = Options()
    chrome_options.add_argument('--dns-prefetch-disable')
    driver = Chrome(chrome_options=chrome_options)

    try:
        for day in dates:
            driver.get(
                'http://www.espn.com.ar/futbol/resultados/_/liga/{}/fecha/{}'.
                format(comp, day)
            )

            for game_driver in driver.find_elements_by_class_name(
                    'mobileScoreboardLink  '):
                # The 7-char game id sits at a fixed offset inside the href.
                games_id.append((game_driver.get_attribute('href')[46:53], day))
    finally:
        # Bug fix: the original evaluated `driver.quit` without calling it
        # (and did so inside the loop), so the browser was never closed.
        driver.quit()

    return games_id
def insert_jd_target_sku(browser: Chrome):
    """Read every product card on the current JD page and register its SKU."""
    items = browser.find_elements_by_class_name('gl-item')
    print(f'当前页面共有{len(items)}个商品')
    for item in items:
        # The card's SKU id is carried in its data-sku attribute.
        sku_code: str = item.get_attribute('data-sku')
        TargetSku.get_or_create(source='京东', sku=sku_code)
def scroll(driver: Chrome, wait: WebDriverWait):
    """Keep scrolling the review box until no new reviews appear within
    SCROLL_TIMEOUT seconds."""
    box = wait.until(
        EC.visibility_of_element_located((By.CLASS_NAME, SCROLLBOX)))
    seen = len(driver.find_elements_by_class_name(REVIEW))
    deadline = time.time() + SCROLL_TIMEOUT

    while time.time() < deadline:
        driver.execute_script(SCROLLING_SCRIPT, box)
        time.sleep(1)

        current = len(driver.find_elements_by_class_name(REVIEW))

        # New reviews loaded: remember the count and extend the deadline.
        if seen < current:
            seen = current
            deadline = time.time() + SCROLL_TIMEOUT
Пример #13
0
def get_games_id(comp):
    """Collect ESPN game ids for competition *comp* for every day between
    START_DATE and END_DATE.

    :param comp: ESPN league/competition slug used in the results URL
    :return: list of (game_id, day) tuples
    """
    dates = list(date_range(START_DATE, END_DATE))
    games_id = []

    chrome_options = Options()
    chrome_options.add_argument('--dns-prefetch-disable')
    driver = Chrome(chrome_options=chrome_options)

    try:
        for day in dates:
            driver.get(
                'http://www.espn.com.ar/futbol/resultados/_/liga/{}/fecha/{}'.
                format(comp, day))

            for game_driver in driver.find_elements_by_class_name(
                    'mobileScoreboardLink  '):
                # The 7-char game id sits at a fixed offset inside the href.
                games_id.append((game_driver.get_attribute('href')[46:53], day))
    finally:
        # Bug fix: `driver.quit` was referenced without calling it (and from
        # inside the loop), so the browser was never actually closed.
        driver.quit()

    return games_id
Пример #14
0
def start_quiz(driver: webdriver.Chrome, status: int):
    """Accept the consent checkbox, press the main button, type a random
    name, and — unless the status is too low — press the custom button."""
    try:
        consent = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "span.mc-checkmark")))
        button_click(driver, consent)

        button_click(driver,
                     driver.find_elements_by_class_name("main-button")[0])

        name_field = WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "input.name-input")))
        button_click(driver, name_field)
        name_field.send_keys(get_random_word())
    except (NoSuchElementException, TimeoutException):
        pass

    # A low status means the quiz should not be (re)started.
    if status <= -3:
        return

    try:
        begin_btn = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "button.custom-button")))
        button_click(driver, begin_btn)
    except TimeoutException:
        pass
Пример #15
0
def test_item_has_add_to_cart_button(browser: webdriver.Chrome):
    """The product page must expose exactly one add-to-basket button.

    :param browser: Chrome driver fixture
    """
    browser.get(
        "http://selenium1py.pythonanywhere.com/catalogue/coders-at-work_207/")
    time.sleep(30)
    # Idiom fix: call len() instead of invoking __len__() directly.
    count_basket_btn = len(
        browser.find_elements_by_class_name("btn-add-to-basket"))
    assert count_basket_btn == 1, f"Expected 1 basket button, got {count_basket_btn}"
Пример #16
0
def set_bundle_data(driver: webdriver.Chrome, bundle: Bundle) -> List[str]:
    """Fill *bundle* in place with its name, games and prices scraped from
    the bundle page.

    Opens the bundle URL, reads every game name, then looks up the Steam
    review count and G2A price of each game in a second driver, and finally
    aggregates per-game prices into the bundle totals.

    NOTE(review): declared to return List[str] but no value is returned —
    callers appear to rely on the in-place mutation of *bundle*; confirm.
    """
    driver.get(bundle.url)
    try:
        bundle.name = driver.find_element_by_class_name('hero-title').text
        bundle.site = 'HumbleBundle'
        game_name_containers = driver.find_elements_by_class_name('dd-image-box-text')
        opts = Main.get_chromedriver_options()
        # A second driver is used so lookups don't disturb the bundle page.
        with managed_chromedriver(opts) as nested_driver:
            nested_driver.set_window_size(1800, 1070)
            nested_driver.implicitly_wait(2)
            for container in game_name_containers:
                game = Game()
                game.site = 'HumbleBundle'
                game.name = container.text
                game.review_count = SteamHandler.get_game_review_number(game.name, nested_driver)
                game.g2a_price = G2AHandler.get_price_of(game, nested_driver)
                bundle.games.append(game)
                print(f'G2A price of {game.name}: {str(game.g2a_price)}')
        bundle.sale_price = _get_total_sale_price(driver)
        # Aggregate per-game G2A prices into the bundle totals.
        for game in bundle.games:
            bundle.g2a_price = bundle.g2a_price + game.g2a_price
            bundle.after_commission_price = bundle.after_commission_price + Utility.calculate_net_price(
                game.g2a_price)
        Product.set_bundle_meta_data(bundle)
    except NoSuchElementException:
        logging.getLogger().info(f"Can't find games in bundle with url {bundle.url}, it might be a Humble Monthly")
Пример #17
0
class LoginTest(TestCase):
    """Smoke tests for the Django admin login form, run against a local
    dev server at http://localhost:8000."""

    def setUp(self):
        self.driver = Chrome()

    def tearDown(self):
        self.driver.close()

    def test_log_in_as_admin(self):
        """Valid admin credentials land back on the admin index page."""
        self.driver.get('http://localhost:8000/admin/')
        self.driver.find_element_by_name('username').send_keys('Администратор')
        # Renamed from `input`, which shadowed the builtin of the same name.
        password_field = self.driver.find_element_by_name('password')
        password_field.send_keys('Админ_123')
        password_field.send_keys(Keys.ENTER)
        assert self.driver.current_url == 'http://localhost:8000/admin/'

    def test_login_error(self):
        """Bogus credentials redirect to the login page and show the
        standard Django error note."""
        self.driver.get('http://localhost:8000/admin/')
        self.driver.find_element_by_name('username').send_keys('А')
        password_field = self.driver.find_element_by_name('password')
        password_field.send_keys('А')
        password_field.send_keys(Keys.ENTER)
        assert self.driver.current_url == 'http://localhost:8000/admin/login/?next=/admin/'
        error = self.driver.find_elements_by_class_name('errornote')[0]
        assert error.text == "Пожалуйста, введите корректные имя пользователя и" \
                             " пароль учётной записи. Оба поля могут быть чувствительны к регистру."
Пример #18
0
def search_1(a, b, c):
    '''
    Search DuckDuckGo for the movie plus 'кинопоиск', open the first
    result's kinopoisk link and collect the grey-styled text snippets.

    :param a: movie name
    :param b: empty list for future use (filled with unique snippets)
    :param c: empty dict for future use (maps a -> b)
    :return: the dict c
    '''
    browser = Chrome(options=opts)
    browser.get('https://duckduckgo.com')

    query_box = browser.find_element_by_id('search_form_input_homepage')
    query_box.send_keys(a + ' кинопоиск')
    query_box.submit()

    browser.find_element_by_id('r1-0').click()
    browser.find_element_by_class_name('styles_link__21QPj').click()

    for element in browser.find_elements_by_class_name('gray'):
        snippet = element.text.strip().lower()
        # Keep only non-empty, not-yet-seen snippets.
        if snippet and snippet not in b:
            b.append(snippet)

    browser.close()
    c[a] = b
    return c
Пример #19
0
 def _parse_boosters(self, driver: webdriver.Chrome) -> tuple:
     """Return the total booster-dose count scraped from the page.

     NOTE(review): the annotation says ``tuple`` but a single value (the
     result of ``clean_count``) is returned — confirm intent.
     """
     elems = driver.find_elements_by_class_name("number_revac_info")
     # Pick the entry whose parent block mentions "Всего" ("Total").
     elem = [
         e for e in elems if "Всего" in e.find_element_by_xpath("..").text
     ][0]
     total_boosters = clean_count(elem.text)
     return total_boosters
Пример #20
0
def crawl(driver: webdriver.Chrome) -> List[Game]:
    """Walk HumbleBundle's on-sale search pages and collect games until a
    page with no entity links comes up."""
    driver.get('https://www.humblebundle.com/store/search?sort=bestselling&filter=onsale')
    game_list = []
    game_list.append(
        _get_filled_game(driver.find_elements_by_class_name('entity-link')))

    counter = 0
    while True:
        counter += 1
        try:
            driver.get(f'https://www.humblebundle.com/store/search?sort=bestselling&filter=onsale&page={counter}')
            time.sleep(2)
            # Raises NoSuchElementException on the first empty page,
            # which terminates the crawl below.
            driver.find_element_by_class_name('entity-link')
            game_list.append(_get_filled_game(
                driver.find_elements_by_class_name('entity-link')))
        except NoSuchElementException:
            logging.getLogger().info(f'Found {len(game_list)} games on {counter} pages on HumbleBundle')
            return game_list
Пример #21
0
def _get_total_sale_price(driver: webdriver.Chrome) -> Decimal:
    """Return the highest euro price found among the bundle headlines.

    :param driver: Chrome driver with the bundle page loaded
    :return: the largest price, or Decimal(0) when no headline carries one
        (robustness fix: the original crashed with ValueError on `max([])`)
    """
    prices = []
    for headline in driver.find_elements_by_class_name('dd-header-headline'):
        # Match prices like "€9.99" embedded in the headline text.
        matches = re.findall('€\\d.?\\d?\\d?', headline.text)
        if matches:
            prices.append(Decimal(matches[0].replace('€', '')))
    return max(prices, default=Decimal(0))
def scrape_sites():
    """Run every site listed in lsa.csv through accessi.org's accessibility
    checker and write per-site impact counts plus a weighted score to
    output.csv.

    Scoring: each high impact counts 3, medium 2, low 1; the final score is
    medium_score + high_score (low impacts are tracked but excluded).
    """
    websites = pd.read_csv("lsa.csv")
    lsa_websites = []
    lsa_dict = {
        'website': [],
        'low': [],
        "medium": [],
        "high": [],
        "score": []
    }
    for website in websites["LSA Sites"]:
        lsa_websites.append(website)
        URL = f"https://www.accessi.org/?{website}"
        options = Options()
        options.headless = True
        driver = Chrome(chrome_options=options)
        driver.get(URL)
        # Give the checker time to finish analysing the target site.
        sleep(15)
        high_score = 0
        high = 0
        medium_score = 0
        medium = 0
        low_score = 0
        low = 0
        cats = driver.find_elements_by_class_name("test-result-block")
        for i in cats:
            # Blocks with a non-empty style attribute are hidden/inactive.
            if i.get_attribute("style") == "":
                header = i.find_element_by_class_name("test-result-header")
                num_impacts = header.find_element_by_class_name(
                    "test-result-count")
                # Text looks like "<count> <severity> ..."; take the count.
                num_impacts = str(num_impacts.text)
                if "high" in num_impacts:
                    number = num_impacts.split(" ")[0]
                    high += int(number)
                    high_score += 3 * int(number)
                elif "medium" in num_impacts:
                    number = num_impacts.split(" ")[0]
                    medium += int(number)
                    medium_score += 2 * int(number)
                elif "low" in num_impacts:
                    number = num_impacts.split(" ")[0]
                    low += int(number)
                    low_score += int(number)
        lsa_dict["website"].append(website)
        lsa_dict["low"].append(low)
        lsa_dict["medium"].append(medium)
        lsa_dict["high"].append(high)
        lsa_dict["score"].append(medium_score + high_score)
        print(website)
        print("low:", low)
        print("medium:", medium)
        print("high:", high)
        print("final score:", medium_score + high_score)
        print()
        driver.close()

    df = pd.DataFrame(lsa_dict)
    df.to_csv("output.csv")
Пример #23
0
def insert_sn_target_sku(browser: Chrome):
    """Register shop-code/SKU pairs for every product card on the page."""
    items = browser.find_elements_by_class_name('item-wrap')
    print(f'当前页面共有{len(items)}个商品')
    for item in items:
        # The element id encodes "<shop_code>-<sku>".
        parts = item.get_attribute('id').split('-')
        SNTargetSku.get_or_create(shop_code=parts[0], sku=parts[1])
Пример #24
0
    def __init__(self, driver: webdriver.Chrome):
        """Print the src attribute of every pizza image on the page."""

        print("")

        # All pizza pictures share the "pizzaPictures" class.
        for picture in driver.find_elements_by_class_name("pizzaPictures"):
            print(picture.get_attribute("src"))
Пример #25
0
def main():
    """Scrape per-person prices and concise details for every listing tile
    on waytostay's Paris apartments page and print them."""
    webdriver = os.path.join(r"drive", "chromedriver")
    driver = Chrome(webdriver)
    try:
        url = "https://www.waytostay.com/paris-apartments/"
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        tiles = driver.find_elements_by_class_name("tile")
        # Perf fix: the original re-queried the whole element list for every
        # tile (O(n^2) driver round-trips); fetch each list once instead.
        prices = [
            e.text
            for e in driver.find_elements_by_class_name('price-person')[:len(tiles)]
        ]
        details = [
            e.text
            for e in driver.find_elements_by_class_name('concise-details')[:len(tiles)]
        ]
        print(prices, details)
    finally:
        # quit() ends the whole session; the extra close() was redundant.
        driver.quit()
Пример #26
0
def _find_right_game(game: Game, driver: webdriver.Chrome, search_query):
    """Scan the product grid for the card matching *game* and return the
    price of the first acceptable non-official offer, or Decimal(0)."""
    time.sleep(2)
    for grid_item in driver.find_elements_by_class_name('products-grid__item'):
        if not _find_proper_card(grid_item, game.name, search_query):
            continue
        time.sleep(1)
        for offer in driver.find_elements_by_class_name('offer'):
            # Skip the developer's own listing.
            if 'Official developer' in offer.get_attribute('innerHTML'):
                continue
            try:
                rating_count = _get_g2a_rating_count(offer)
            except ValueError:
                continue
            # Little-reviewed games take any offer; popular ones require
            # a well-rated seller.
            if game.review_count < 500:
                return _get_price(offer)
            if rating_count > 1000:
                return _get_price(offer)
    return Decimal(0)
Пример #27
0
def parse_friends(driver: Chrome, user_link: str) -> List[User]:
    """Open the user's friends page, scroll to the bottom and wrap every
    friend entry in a User object."""
    sleep(delay())
    driver.get(f"{user_link}/friends")
    # ESC dismisses the pop-up overlaying the page.
    driver.find_element_by_tag_name("body").send_keys(Keys.ESCAPE)
    page_height = driver.execute_script("return document.body.scrollHeight")
    scroll_page(driver, page_height)

    return [
        User(item)
        for item in driver.find_elements_by_class_name("_698")
    ]
Пример #28
0
def fetch():
    """Scroll hackerone's hacktivity feed and prepend to data.csv any
    reports newer than its current first entry, rewriting the file.

    Stops when either the page stops growing (two consecutive scrolls with
    no height change) or the newest previously-saved report is reached.
    """
    options = ChromeOptions()
    options.add_argument('headless')
    driver = Chrome(
        executable_path=
        "/Users/tuanthanhtran/Desktop/training/bug-bounty/hackerone-reports/chromedriver",
        options=options)

    # Load the already-saved reports; the first row is the newest one.
    reports = []
    with open('data.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            reports.append(dict(row))
    first_report_link = reports[0]['link']

    driver.get(hacktivity_url)
    driver.implicitly_wait(page_loading_timeout)

    counter = 0
    page = 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll to the bottom to trigger lazy loading of more reports.
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(page_loading_timeout)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged; allow one retry before giving up.
            counter += 1
            if counter > 1:
                break
        else:
            counter = 0
        last_height = new_height

        raw_reports = driver.find_elements_by_class_name('fade')
        new_reports = extract_reports(raw_reports)
        found = False
        # Once the newest saved report shows up, keep only the fresher ones.
        for i in range(len(new_reports)):
            if new_reports[i]['link'] == first_report_link:
                reports = new_reports[:i] + reports
                found = True
                break
        if found:
            break

        page += 1
        print('Page:', page)

    driver.close()

    # Rewrite data.csv with the merged report list.
    with open('data.csv', 'w', newline='', encoding='utf-8') as file:
        keys = reports[0].keys()
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(reports)
Пример #29
0
    def test_one(self):
        """A DuckDuckGo search for 'real python' ranks realpython.com
        first."""
        browser = Chrome('/usr/local/bin/chromedriver')
        browser.get('https://duckduckgo.com')

        query_box = browser.find_element_by_id('search_form_input_homepage')
        query_box.send_keys('real python')
        query_box.submit()

        hits = browser.find_elements_by_class_name('result')
        assert 'https://realpython.com' in hits[0].text

        browser.close()
Пример #30
0
def parse_jobs(driver: Chrome, user_link: str) -> List[str]:
    """Navigate to the user's 'about' -> education/work tab and parse every
    experience entry into a job string."""
    sleep(delay())
    driver.get(user_link)
    # ESC closes the pop-up offering notifications.
    driver.find_element_by_tag_name("body").send_keys(Keys.ESCAPE)
    driver.find_element(By.XPATH, "//a[@data-tab-key='about']").click()
    sleep(delay())
    driver.find_element(By.XPATH, "//a[@data-testid='nav_edu_work']").click()
    sleep(delay())

    return [
        parse_job(entry)
        for entry in driver.find_elements_by_class_name("experience")
    ]
Пример #31
0
def test_search():
    """End-to-end: search DuckDuckGo for 'real python' and pin the exact
    text of the first result."""
    browser = Chrome()
    browser.get('https://duckduckgo.com')

    query_box = browser.find_element_by_id('search_form_input_homepage')
    query_box.send_keys('real python')
    query_box.submit()

    hits = browser.find_elements_by_class_name('result')
    top_hit_text = hits[0].text
    print(top_hit_text)
    browser.close()
    assert top_hit_text == 'Python Tutorials - Real Python\nPython "while" Loops (Indefinite Iteration) In this tutorial, you\'ll learn about indefinite iteration using the Python while loop. You\'ll be able to construct basic and complex while loops, interrupt loop execution with break and continue, use the else clause with a while loop, and deal with infinite loops.\nhttps://realpython.com'
Пример #32
0
def extract_publication(profile_url, verbose=verbose_citation_list):
    """Crawl the publication list from a Google Scholar profile.

    @param[in]  profile_url  the link of the Google Scholar profile to crawl
    @param[in]  verbose      the level of information to scrape; by default
                             the detailed citation list for each publication
    @return     dict mapping publication title to a metadata dict with keys
                'link', 'author', 'vendor', 'citation', 'cited by', 'year'
    """
    browser = Browser()
    browser.get(profile_url)
    publication = {}
    while True:
        publication_list = browser.find_elements_by_class_name('gsc_a_tr')
        for publication_item in publication_list:
            title = publication_item.find_element_by_class_name('gsc_a_at').text
            # Bug fix: Python 2 `print title` statements converted to
            # Python 3 function calls (the rest of the file is Python 3).
            print(title)
            author = publication_item.find_elements_by_class_name('gs_gray')[0].text.split(', ')
            vendor = publication_item.find_elements_by_class_name('gs_gray')[1].text
            try:
                citation = int(publication_item.find_element_by_class_name('gsc_a_ac').text)
                link = publication_item.find_element_by_class_name('gsc_a_ac').get_attribute('href')
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            except Exception:
                citation = 0
                link = None
            try:
                year = int(publication_item.find_element_by_class_name('gsc_a_h').text)
            except Exception:
                year = None
            # Fetching the per-paper citation list via
            # extract_citation_for_publication(link) triggered robot
            # detection, so it stays disabled and 'cited by' mirrors the
            # citation count instead.
            publication[title] = {'link': link, 'author': author,
                                  'vendor': vendor, 'citation': citation,
                                  'cited by': citation, 'year': year}
        if not next_page_new(browser):
            break
    browser.close()
    return publication
Пример #33
0
def extract_movies(max_page_num=5):
    """Scrape up to *max_page_num* additional pages of movie items and map
    each title to its rating string."""
    browser = Browser()
    browser.get(URL)
    movies = {}
    while True:
        for card in browser.find_elements_by_class_name("item"):
            name = card.find_element_by_tag_name("p").text.strip()
            score = card.find_element_by_tag_name("strong").text.strip()
            movies[name] = score
        # Stop when the page budget is spent or no further page exists.
        if max_page_num <= 0:
            break
        max_page_num -= 1
        if not have_more(browser):
            break
    browser.close()
    return movies
Пример #34
0
def extract_hongren(max_page_num=5):
    """Scrape the 'hongren' wall: map each item's picture link to its
    description, following up to *max_page_num* scroll loads."""
    suffix = "hongren"
    # The chromedriver binary sits next to this script — normally adding it
    # to PATH would suffice, but that did not work in this environment.
    browser = Browser('chromedriver.exe')
    browser.get(BASE_URL + suffix)
    items = {}
    while True:
        for card in browser.find_elements_by_class_name('wall_item'):
            link = card.find_element(By.CSS_SELECTOR, ".pic_box.pic").get_attribute("href")
            caption = card.find_elements_by_class_name("desc")[0].text.strip()
            items[link] = caption
        # Stop when the scroll budget is spent or no next page loads.
        if max_page_num <= 0:
            break
        max_page_num -= 1
        if not scroll_to_next(browser):
            break
    browser.close()
    return items