Example #1
0
def chinahpo(hpo):
    """Fetch the chinahpo.org search page for one HPO term and save its HTML.

    hpo: term id of the form "HP:NNNNNNN".  Writes the rendered page to
    html2/hp_<id>.html and appends the processed term to finish.txt so a
    restarted run can skip it.
    """
    # When routing through an IP pool there is no need for a random wait.
    # s = random.randint(5, 10)
    # print("waiting " + str(s) + " seconds")
    # time.sleep(s)
    ip = randomIP()
    # ip = "socks5://127.0.0.1:1080"
    print("使用IP " + ip)
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument("headless")
    # options.add_argument("disable-gpu")
    options.add_argument("--proxy-server={ip}".format(ip=ip))
    # Hide the automation fingerprints Blink normally exposes.
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"

    driver = Edge(options=options, executable_path=msedge)
    # Mask navigator.webdriver so the site cannot trivially detect Selenium.
    script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    driver.execute_script(script)
    UA = randomUA()
    # UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36"
    driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": UA})
    print(driver.execute_script("return navigator.userAgent;"))

    hpid = hpo.split(":")[1]
    url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
        hpid=hpid)

    try:
        driver.get(url)
        strtemp = url
        print("网址:", strtemp)
    except Exception:
        # Best effort: even on a load error the page source below is saved.
        print("get page error", hpo)

    time.sleep(2)
    with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
        f.write(str(driver.page_source))

    # BUGFIX: quit() (not close()) releases the msedgedriver process.
    driver.quit()
    # BUGFIX: use a context manager instead of manual open/close.
    with open("finish.txt", "a", encoding="utf-8") as fin:
        fin.write(hpo + "\n")
def img_download(url, path, count):
    """Download every image found on ``url`` into directory ``path``.

    count: running counter of downloaded images; the updated value is
    returned so callers can chain invocations.
    """
    # get all images
    chrome_driver_path = "D:\\programming\\Machine learning\\ml_projects\\google image scraper\\msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"
    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path=chrome_driver_path, options=option)
    try:
        driver.get(url)
        #time.sleep(10)
        # Scroll hard ten times so lazy-loaded images are forced to render.
        for __ in range(10):
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(.2)
        imgs = get_all_images(url, driver)
        for img in imgs:
            # for each img, download it
            count = download(img, path, count)
    except WebDriverException:
        print("page down")
    finally:
        # BUGFIX: the browser/driver process was previously leaked each call.
        driver.quit()
    return count
Example #3
0
File: PyWaMG.py — Project: megz15/PyWaMG
def wa_login(isHeadless=True):
    '''
    Use to login to Whatsapp Web

    Can omit usage if already logged in once by scanning QR

    Parameters
    ----------
    isHeadless : bool, optional
        Run the Edge browser in headless mode (default True).

    Returns
    -------
    None
    '''
    options = EdgeOptions()
    options.use_chromium = True     #Uses chromium-based edgium, remove to use legacy edge
    # Persist the browser profile in ./Cache so the QR scan is needed once.
    options.add_argument("user-data-dir="+os.getcwd()+"\\Cache")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.49")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # options.add_experimental_option("excludeSwitches", ["enable-automation"])
    # options.add_experimental_option("useAutomationExtension", False)
    options.headless = isHeadless   #Headless mode
    global driver
    driver = Edge(EdgeChromiumDriverManager().install(), options=options)
    driver.get('https://web.whatsapp.com/')
    # Marker file present: a QR scan already succeeded, nothing more to do.
    if os.path.isfile('./Cache/wa.exists'):
        return
    wait_for_load('_1PTz1')
    # Remove overlay elements that would obscure the QR code screenshot.
    driver.execute_script("""
    var element1 = document.querySelector("._3DgtU");
    var element2 = document.querySelector("._1iKcN");
    if (element1)
        element1.parentNode.removeChild(element1);
    if (element2)
        element2.parentNode.removeChild(element2);
    """)
    Image.open(BytesIO(driver.find_element_by_class_name('landing-main').screenshot_as_png)).show()
    # Create the marker file whose presence short-circuits future logins.
    with open('Cache/wa.exists', 'w'):
        pass
Example #4
0
class Web_scraping:
    """Scrape World Surf League (WSL) championship-tour pages with Edge.

    ``games_link`` collects every completed event's URL per season into
    events.txt; ``event_stats`` walks one event's heats round by round and
    accumulates per-heat rows in ``self.all_heats_lists``.
    """

    def __init__(self):
        '''Initialize the application'''
        #As using the standard webdriver was giving warnings and messing up the terminal, I used the code below to show just what I want.
        self.opt = EdgeOptions()
        self.opt.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.opt.add_argument("--start-maximized")
        self.opt.use_chromium = True
        self.driver = Edge(
            executable_path=
            r"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe",
            options=self.opt)

    def games_link(self):
        '''Create a list with all season event's link and then create another list with all event's link'''
        #Creating list with the all season's link
        # Seasons 2008..2021 inclusive (range end is exclusive).
        self.season_pages_list = []
        for y in range(2008, 2022):
            #Creating the seasons links as str and adding it to a list
            self.season_link = 'https://www.worldsurfleague.com/events/' + str(
                y) + '/mct?all=1'
            self.season_pages_list.append(self.season_link)

        #Creating a list with the all event's link from each season
        self.events_link_list = []
        for link in self.season_pages_list:
            self.driver.get(link)
            #Getting all the events links as selenium format
            self.event_links = self.driver.find_elements_by_xpath(
                '//a[@class="event-schedule-details__event-name"]')
            #Finding the class status completed is needed once it's possible to stop the process on it.
            self.event_status = self.driver.find_elements_by_xpath(
                '//span[@class="event-status event-status--completed"]')

            #Creating event's link list
            # NOTE(review): assumes completed-status spans and event links
            # appear in the same DOM order — confirm against the page.
            for i in range(0, len(self.event_status)):
                #Getting the links for each event as a str format
                self.link_attribute = self.event_links[i].get_attribute('href')
                self.events_link_list.append(self.link_attribute)

        with open('events.txt', 'w') as f:
            for item in self.events_link_list:
                f.write("%s\n" % item)

        print('FINISHED')

    #Getting data inside which event
    def event_stats(self):
        """Scrape heat-by-heat stats for the first event in events.txt.

        Iterates rounds by clicking the "previous round" carousel button;
        rows are appended to ``self.all_heats_lists``.  NOTE(review): the
        ``while True`` loop has no explicit exit — it appears to rely on an
        exception or manual interruption once the first round is reached.
        """
        #TXT file with all events link to list
        self.events_link = [
            line[0]
            for line in pd.read_fwf('events.txt', header=None).values.tolist()
        ]

        #for link in self.events_link:
        # Only the first event is processed (the loop above is commented out).
        self.driver.get(self.events_link[0])

        #list of all heats
        self.all_heats_lists = []

        while True:
            #Gets all the waves scores, athletes, nationalities and heats on the page as list.
            self.waves = self.driver.find_elements_by_xpath(
                '//*[@class="score"]')
            self.athletes = self.driver.find_elements_by_xpath(
                '//*[@class="athlete-name"]')
            self.nationalities = self.driver.find_elements_by_xpath(
                '//*[@class="athlete-country-flag"]')
            self.heat = self.driver.find_elements_by_xpath(
                '//*[@class="new-heat-hd-name"]')

            #Gets the round name
            self.round = self.driver.find_elements_by_xpath(
                '//*[@class="carousel-item is-selected"]')
            if len(self.round) == 0:
                self.round = self.driver.find_elements_by_xpath(
                    '//*[@class="carousel-item last is-selected"]')

            #Gets the number of surfers and heats on the round, such as the avg surfers per heat (must be 2 or 3)
            # 18 score cells per surfer: 2 best waves + total + 15 waves.
            self.number_of_surfers = int(len(self.waves) / 18)

            #As the final round only has 1 heat, the find_element_by_class_name gets a 'WebDriver' element and not a list
            self.number_of_heats = len(self.heat)

            self.surfers_per_heat = int(self.number_of_surfers /
                                        self.number_of_heats)

            #there's a count to deduct 1 stage and gets the round name for each round.
            self.count = 0
            #Gets the stats for each heat
            # NOTE(review): the indexing below (g * 2, g * 18) hard-codes two
            # surfers per heat even though surfers_per_heat is computed —
            # 3-surfer heats would be misaligned; confirm the data.
            self.heat_data = []
            for g in range(0, self.number_of_heats):
                #Page stats
                #Event stats
                self.event_turn = self.driver.find_element_by_class_name(
                    'event-meta-tour-info').text.split()[2][1:]
                self.event_period = self.driver.find_element_by_class_name(
                    'event-schedule__date-range').text
                self.event_name = self.driver.find_element_by_class_name(
                    'event-title').text.split('\n')[0]
                self.event_local = re.split(
                    r'(\d+)',
                    self.driver.find_element_by_class_name(
                        'event-meta-tour-info').text)[2]
                self.avg_wave_score = re.split(
                    r'(\d+\.\d+)',
                    self.driver.find_element_by_class_name(
                        'new-heat-hd-status').text)[1]

                #Heat's id for the database
                # e.g. "heat3round of 16mct2021" — heat no. + round + tour + year.
                self.heat_id = (f'heat{g + 1}' + self.round[0].text +
                                self.event_turn +
                                self.event_period[-4:]).lower()

                #Surfer stats

                self.surfer1 = self.athletes[g * 2].text
                self.surfer1_nat = self.nationalities[g *
                                                      2].get_attribute('title')

                # Surfer 1 occupies cells 0..17 of this heat's 36-cell span.
                self.surfer1_best_w1 = self.waves[g * 18 + (1 - 1)].text
                self.surfer1_best_w2 = self.waves[g * 18 + (2 - 1)].text
                self.surfer1_total = self.waves[g * 18 + (3 - 1)].text
                self.surfer1_w01 = self.waves[g * 18 + (4 - 1)].text
                self.surfer1_w02 = self.waves[g * 18 + (5 - 1)].text
                self.surfer1_w03 = self.waves[g * 18 + (6 - 1)].text
                self.surfer1_w04 = self.waves[g * 18 + (7 - 1)].text
                self.surfer1_w05 = self.waves[g * 18 + (8 - 1)].text
                self.surfer1_w06 = self.waves[g * 18 + (9 - 1)].text
                self.surfer1_w07 = self.waves[g * 18 + (10 - 1)].text
                self.surfer1_w08 = self.waves[g * 18 + (11 - 1)].text
                self.surfer1_w09 = self.waves[g * 18 + (12 - 1)].text
                self.surfer1_w10 = self.waves[g * 18 + (13 - 1)].text
                self.surfer1_w11 = self.waves[g * 18 + (14 - 1)].text
                self.surfer1_w12 = self.waves[g * 18 + (15 - 1)].text
                self.surfer1_w13 = self.waves[g * 18 + (16 - 1)].text
                self.surfer1_w14 = self.waves[g * 18 + (17 - 1)].text
                self.surfer1_w15 = self.waves[g * 18 + (18 - 1)].text

                #Surfer 2 stats
                self.surfer2 = self.athletes[g * 2 + 1].text
                self.surfer2_nat = self.nationalities[g * 2 +
                                                      1].get_attribute('title')

                # Surfer 2 occupies cells 18..35 of the same span.
                self.surfer2_best_w1 = self.waves[g * 18 + (19 - 1)].text
                self.surfer2_best_w2 = self.waves[g * 18 + (20 - 1)].text
                self.surfer2_total = self.waves[g * 18 + (21 - 1)].text
                self.surfer2_w01 = self.waves[g * 18 + (22 - 1)].text
                self.surfer2_w02 = self.waves[g * 18 + (23 - 1)].text
                self.surfer2_w03 = self.waves[g * 18 + (24 - 1)].text
                self.surfer2_w04 = self.waves[g * 18 + (25 - 1)].text
                self.surfer2_w05 = self.waves[g * 18 + (26 - 1)].text
                self.surfer2_w06 = self.waves[g * 18 + (27 - 1)].text
                self.surfer2_w07 = self.waves[g * 18 + (28 - 1)].text
                self.surfer2_w08 = self.waves[g * 18 + (29 - 1)].text
                self.surfer2_w09 = self.waves[g * 18 + (30 - 1)].text
                self.surfer2_w10 = self.waves[g * 18 + (31 - 1)].text
                self.surfer2_w11 = self.waves[g * 18 + (32 - 1)].text
                self.surfer2_w12 = self.waves[g * 18 + (33 - 1)].text
                self.surfer2_w13 = self.waves[g * 18 + (34 - 1)].text
                self.surfer2_w14 = self.waves[g * 18 + (35 - 1)].text
                self.surfer2_w15 = self.waves[g * 18 + (36 - 1)].text

                #Inputing all variables into the heat_data list
                self.heat_data.append(self.heat_id)
                self.heat_data.append(self.event_name)
                self.heat_data.append(self.event_local)
                self.heat_data.append(self.event_turn)
                self.heat_data.append(self.event_period)
                self.heat_data.append(self.avg_wave_score)
                self.heat_data.append(self.surfer1)
                self.heat_data.append(self.surfer1_nat)
                self.heat_data.append(self.surfer1_best_w1)
                self.heat_data.append(self.surfer1_best_w2)
                self.heat_data.append(self.surfer1_total)
                self.heat_data.append(self.surfer1_w01)
                self.heat_data.append(self.surfer1_w02)
                self.heat_data.append(self.surfer1_w03)
                self.heat_data.append(self.surfer1_w04)
                self.heat_data.append(self.surfer1_w05)
                self.heat_data.append(self.surfer1_w06)
                self.heat_data.append(self.surfer1_w07)
                self.heat_data.append(self.surfer1_w08)
                self.heat_data.append(self.surfer1_w09)
                self.heat_data.append(self.surfer1_w10)
                self.heat_data.append(self.surfer1_w11)
                self.heat_data.append(self.surfer1_w12)
                self.heat_data.append(self.surfer1_w13)
                self.heat_data.append(self.surfer1_w14)
                self.heat_data.append(self.surfer1_w15)
                self.heat_data.append(self.surfer2)
                self.heat_data.append(self.surfer2_nat)
                self.heat_data.append(self.surfer2_best_w1)
                self.heat_data.append(self.surfer2_best_w2)
                self.heat_data.append(self.surfer2_total)
                self.heat_data.append(self.surfer2_w01)
                self.heat_data.append(self.surfer2_w02)
                self.heat_data.append(self.surfer2_w03)
                self.heat_data.append(self.surfer2_w04)
                self.heat_data.append(self.surfer2_w05)
                self.heat_data.append(self.surfer2_w06)
                self.heat_data.append(self.surfer2_w07)
                self.heat_data.append(self.surfer2_w08)
                self.heat_data.append(self.surfer2_w09)
                self.heat_data.append(self.surfer2_w10)
                self.heat_data.append(self.surfer2_w11)
                self.heat_data.append(self.surfer2_w12)
                self.heat_data.append(self.surfer2_w13)
                self.heat_data.append(self.surfer2_w14)
                self.heat_data.append(self.surfer2_w15)
                # Copy before clearing — the same list object is reused.
                self.all_heats_lists.append(self.heat_data.copy())
                self.heat_data.clear()

            #Click on the previous round botton
            print(self.all_heats_lists)
            # NOTE(review): bare except — if the click is intercepted, the
            # fallback scrolls the button into view and retries.  Any other
            # error (e.g. button absent on the first round) is also caught
            # here; confirm the intended loop-exit condition.
            try:
                self.prev_round_bt = self.driver.find_element_by_xpath(
                    '//*[@class="flickity-button-icon"]').click()
            except:
                self.prev_round_bt = self.driver.find_element_by_xpath(
                    '//*[@class="flickity-button-icon"]')
                self.driver.execute_script("arguments[0].scrollIntoView();",
                                           self.prev_round_bt)
                time.sleep(.5)
                self.prev_round_bt.click()
            time.sleep(2.5)
Example #5
0
class Session:
    """Minimal Twitter-scraping session driven by Edge/Selenium."""

    def __init__(self, username, password, sleep_time=2):
        # Credentials are held on the instance; sleep_time throttles UI waits.
        self.username = username
        self.password = password
        self.sleep_time = sleep_time
        options = EdgeOptions()
        options.use_chromium = True
        self.driver = Edge(options=options)

    def login(self):
        """Log in to twitter.com with the stored credentials."""
        self.driver.get("https://www.twitter.com/login")
        sleep(self.sleep_time)
        u_name = self.driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        u_name.send_keys(self.username)
        p_word = self.driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        p_word.send_keys(self.password)
        p_word.send_keys(Keys.RETURN)
        sleep(self.sleep_time)

    def tweet_selection(self, search_str, csv_tit, max_tweets=300):
        """Search for ``search_str``, scroll-collect tweets, write ``csv_tit``.

        NOTE(review): ``max_tweets`` is currently unused — collection stops
        only once three consecutive scrolls fail to advance the page.
        """
        sleep(self.sleep_time)
        search_input = self.driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.clear()
        search_input.send_keys(search_str)
        search_input.send_keys(Keys.RETURN)
        sleep(self.sleep_time)
        data = []
        tweet_ids = set()
        last_pos = self.driver.execute_script("return window.pageYOffset;")
        scrolling = True
        while scrolling:
            cards = self.driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # Only the trailing cards can be new; dedupe via a joined-field key.
            for card in cards[-15:]:
                tweet = self.get_tweet_data(card)
                if tweet:
                    tweet_id = ''.join(tweet)
                    if tweet_id not in tweet_ids:
                        tweet_ids.add(tweet_id)
                        data.append(tweet)
            scroll_attempt = 0
            while True:
                self.driver.execute_script(
                    'window.scrollTo(0, document.body.scrollHeight);')
                sleep(self.sleep_time)
                curr_pos = self.driver.execute_script(
                    "return window.pageYOffset;")
                if last_pos == curr_pos:
                    # Page did not move: retry up to 3 times, then stop.
                    scroll_attempt += 1
                    if scroll_attempt >= 3:
                        scrolling = False
                        break
                    else:
                        sleep(2 * self.sleep_time)
                else:
                    last_pos = curr_pos
                    break
        # BUGFIX: newline='' stops csv.writer emitting blank rows on Windows.
        with open(csv_tit, 'w', encoding="utf-8", newline='') as out:
            csv_out = csv.writer(out)
            csv_out.writerow([
                'user', 'date', 'text', 'quoting', 'reply count',
                'retweet count', 'like count'
            ])
            for row in data:
                csv_out.writerow(row)

    def get_tweet_data(self, card):
        """Extract one tweet tuple from a card element.

        Returns None when the card has no <time> element (e.g. promoted
        content), which callers treat as "skip".
        """
        user = card.find_element_by_xpath('.//span[contains(text(),"@")]').text
        try:
            date = card.find_element_by_xpath('.//time').get_attribute(
                'datetime')
        except NoSuchElementException:
            return
        text = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
        responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text
        reply_count = card.find_element_by_xpath(
            './/div[@data-testid="reply"]').text
        retweet_count = card.find_element_by_xpath(
            './/div[@data-testid="retweet"]').text
        like_count = card.find_element_by_xpath(
            './/div[@data-testid="like"]').text
        tweet = (user, date, text, responding, reply_count, retweet_count,
                 like_count)
        return tweet

    def tweet(self, tuit):  # requires interacting with Edge
        """Type ``tuit`` into the compose box (does not press send)."""
        sleep(self.sleep_time)
        tuit_input = self.driver.find_element_by_xpath(
            '//div[@data-testid="tweetTextarea_0"]')
        tuit_input.clear()
        tuit_input.send_keys(tuit)
Example #6
0
File: nvidia.py — Project: justTran/3080bot
class Nvidia():
    """Poll NVIDIA's store API for RTX 3080 stock and auto-checkout via Edge."""

    def __init__(self):
        self.api = 'https://api-prod.nvidia.com/direct-sales-shop/DR/products/en_us/USD/5438481700'  #nvidia-api
        self.debug = 'https://jsonplaceholder.typicode.com/todos/2'
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument(
            "--user-data-dir=C:\\Users\\Justin\\AppData\\Local\\Microsoft\\Edge\\User Data\\Profile 1"
        )  #Path to your chrome profile
        # Adds the product to the cart, then clicks straight through to checkout.
        self.script = '''javascript:store.dispatch({type: actionTypes.ADD_ITEM_TO_CART,id: 5438481700,quantity: 1});
                        document.getElementsByClassName('nv-button js-checkout cart__checkout-button')[0].click();'''
        # BUGFIX: self.options was configured but never passed to the driver,
        # so the profile/user-data-dir settings were silently ignored.
        self.driver = Edge(executable_path=os.getcwd() + '/msedgedriver.exe',
                           options=self.options)
        print(
            f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[94m[Browser]\033[00m Browser is open!'
        )
        self.wait = WebDriverWait(self.driver, 10)
        self.test()

    def test(self):
        """Open the RTX 3080 product page, then start polling the API."""
        print(
            f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[94m[Browser]\033[00m Going to NVIDIA'
        )
        self.driver.get(
            "https://www.nvidia.com/en-us/geforce/graphics-cards/30-series/rtx-3080/"
        )
        self.makeRequest()

    def purchaseScript(self):
        """Run the add-to-cart script and jump to the quick-buy cart page."""
        try:
            self.driver.execute_script(self.script)
            time.sleep(2)
            self.driver.get(
                'https://store.nvidia.com/store?Action=DisplayPage&Locale=en_US&SiteID=nvidia&id=QuickBuyCartPage'
            )
            print(
                f'{datetime.now().strftime("%H:%M:%S")} \033[92m[Status]\033[00m Bought'
            )
        except Exception:  # narrowed from a bare except; still best-effort
            print(
                f'{datetime.now().strftime("%H:%M:%S")} \033[92m[Status]\033[00m JavaScript Error'
            )

    def makeRequest(self):
        """Poll the inventory API until in stock, then attempt the purchase."""
        while (1):
            r = requests.get(self.api)  #CHANGE TO API
            if r.status_code != 200:
                # Non-200 usually means rate limiting — back off 30 s.
                print(
                    f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[92m[Status]\033[00m \033[96mAPI Cooldown.... waiting 30 seconds\033[00m'
                )
                time.sleep(30)
            else:

                if r.json()["InventoryStatus"][
                        "status"] == 'PRODUCT_INVENTORY_OUT_OF_STOCK':
                    print(
                        f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[92m[Status]\033[00m \033[91mOUT OF STOCK\033[00m'
                    )
                    time.sleep(1.5)

                else:
                    self.purchaseScript()
                    break
class ChannelScrape:
    """
    Constructors:
    __init__()


    Methods:

    toFile(), getUpcomingId(), getLiveId()
    """
    options_edge = EdgeOptions()
    options_edge.use_chromium = True
    options_edge.add_argument('--ignore-certificate-errors')
    options_edge.add_argument('--ignore-ssl-errors')
    options_edge.add_argument('--mute-audio')

    def __init__(self, channelId: str, headless=True, executable_path=None):
        """Fetch a channel page once and keep its ytInitialData JSON.

        channelId: YouTube channel id appended to /channel/.
        headless: run Edge without a window.
        executable_path: explicit msedgedriver path; when None, PATH is
        searched (os.pathsep makes the search portable beyond Windows).
        """
        if executable_path is None:
            # Search each PATH entry for the driver binary.
            for p in os.environ['PATH'].split(os.pathsep):
                candidate = os.path.join(p, "msedgedriver.exe")
                if os.path.isfile(candidate):
                    self.path_dir = candidate
        else:
            # BUGFIX: an explicitly supplied executable_path was previously
            # ignored, leaving self.path_dir unset (AttributeError below).
            self.path_dir = executable_path
        if not hasattr(self, 'path_dir'):
            raise FileNotFoundError(
                'msedgedriver.exe was not found on PATH and no '
                'executable_path was provided')

        # Setup driver
        self.options_edge.headless = headless
        self.driver = Edge(options=self.options_edge,
                           executable_path=self.path_dir)

        # JSON collecting process: grab ytInitialData and shut the browser.
        url = 'https://www.youtube.com/channel/' + channelId
        self.driver.get(url)
        self.jsonData = self.driver.execute_script('return ytInitialData')
        self.driver.quit()

    def toFile(self, output_file: str):
        """
        Output the collected json data to a file
        output_file: Output file name. File extension will be added automatically
        """
        with codecs.open(output_file + '.json', 'w',
                         encoding='utf-8') as jsonFile:
            json.dump(self.jsonData, jsonFile, ensure_ascii=False, indent=1)

    def getUpcomingId(self, dayDelta=14):
        """
        Returns a list of upcoming livestream(s) video ID
        dayDelta: If the upcoming livestream delta is more than the provided argument,
                the livestream Id will not be added to the return list
        """

        # Personal note:
        # The base for calculating dates is 31-12-1969 (UNIX epoch time)
        # Which is then counted to the used date by seconds
        dateFilter = timedelta(days=dayDelta)
        dateThreshold = datetime.now() + dateFilter
        collectedContents = []
        try:
            content = self.jsonData['contents'][
                'twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer'][
                    'content']['sectionListRenderer']['contents'][1][
                        'itemSectionRenderer']['contents'][0]['shelfRenderer'][
                            'content']
        # Narrowed from a bare except: only lookup failures mean "no shelf".
        except (KeyError, IndexError, TypeError):
            print(
                'Index out of range (Most likely channel only have horizontal grid renderer)'
            )
            return collectedContents
        # Only one upcoming livestream
        # This shouldn't need to use for loop assuming that there is always one item in items key
        # But items is still an array, so just in case
        if "expandedShelfContentsRenderer" in content:
            for item in content['expandedShelfContentsRenderer']['items']:
                liveDateEpoch = int(
                    item['videoRenderer']['upcomingEventData']['startTime'])
                liveDate = datetime.fromtimestamp(mktime(
                    gmtime(liveDateEpoch)))
                if item['videoRenderer']['thumbnailOverlays'][0][
                        'thumbnailOverlayTimeStatusRenderer'][
                            'style'] == "UPCOMING" and liveDate < dateThreshold:
                    collectedContents.append(item['videoRenderer']['videoId'])

        # Multiple upcoming livestreams
        elif "horizontalListRenderer" in content:
            for item in content['horizontalListRenderer']['items']:
                if 'upcomingEventData' in item['gridVideoRenderer']:
                    liveDateEpoch = int(item['gridVideoRenderer']
                                        ['upcomingEventData']['startTime'])
                    liveDate = datetime.fromtimestamp(
                        mktime(gmtime(liveDateEpoch)))
                    if item['gridVideoRenderer']['thumbnailOverlays'][0][
                            'thumbnailOverlayTimeStatusRenderer'][
                                'style'] == "UPCOMING" and liveDate < dateThreshold:
                        collectedContents.append(
                            item['gridVideoRenderer']['videoId'])

        return collectedContents

    def getLiveId(self):
        # Returns a list of the current livestreams video Id, if any
        # It is unlikely that there are multiple livestreams in the same channel,
        # but the possibility is there, therefore it returns a list instead of a single item

        content = self.jsonData['contents']['twoColumnBrowseResultsRenderer'][
            'tabs'][0]['tabRenderer']['content']['sectionListRenderer'][
                'contents'][0]['itemSectionRenderer']['contents'][0]
        collectedContents = []
        if "channelFeaturedContentRenderer" in content:
            for videoItem in content['channelFeaturedContentRenderer'][
                    'items']:
                if videoItem['videoRenderer']['thumbnailOverlays'][0][
                        'thumbnailOverlayTimeStatusRenderer'][
                            'style'] == "LIVE":
                    collectedContents.append(
                        videoItem['videoRenderer']['videoId'])

        return collectedContents
Example #8
0
def chooseAccount():
    """Log in to Instagram with credentials from data.txt, then download the
    photos of a user-supplied post link into ./image."""
    with open('data.txt') as json_file:
        data = json.load(json_file)

    userInfo = 'account: ' + data['username']
    print(userInfo)

    userName = data['username']
    passWord = data['password']
    print("link:")
    link = input()
    print("number of photos: ")
    amount = input()

    # format text and amount
    amount = int(amount)

    # auto login
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('headless')
    driver = Edge('msedgedriver', options=options)
    driver.get(link)
    time.sleep(2)
    userForm = driver.find_element_by_css_selector("input[name='username']")
    passForm = driver.find_element_by_css_selector("input[name='password']")
    userForm.send_keys(userName)
    passForm.send_keys(passWord)
    driver.find_element_by_css_selector("button[type='submit']").click()
    time.sleep(3)
    # Dismiss the "save login info" prompt.
    driver.execute_script("document.querySelector('.sqdOP.yWX7d.y3zKF').click()")

    # get link image to list
    time.sleep(2)
    # BUGFIX: spriteBtn must exist even when amount <= 1, otherwise the
    # 'elif spriteBtn' check below raised NameError.
    spriteBtn = None
    if amount > 1:
        spriteBtn = driver.find_element_by_css_selector(".coreSpriteRightChevron")
    list_link = []

    def get_url1():
        # First slide: take the first visible cover image.
        list_element = driver.find_elements_by_css_selector("img[style='object-fit: cover;']")
        for image in list_element[:1]:
            src = image.get_attribute("src")
            list_link.append(src)

    def get_url2():
        # Later slides: skip the already-collected leading image.
        list_element = driver.find_elements_by_css_selector("img[style='object-fit: cover;']")
        list_element.pop(0)
        for image in list_element[:1]:
            src = image.get_attribute("src")
            list_link.append(src)

    for x in range(0, amount + 1):
        if (len(list_link) > 0):
            get_url2()
        else:
            get_url1()
        if len(list_link) == amount:
            break
        elif spriteBtn:
            spriteBtn.click()
        else:
            break
        time.sleep(0.5)

    # check old image folder exist
    if (os.path.isdir("./image")):
        rmtree("./image")

    # create new image folder
    folderPath = os.path.join(os.getcwd(), 'image')
    os.mkdir(folderPath)

    # clear screen
    clear = lambda: os.system('cls')
    clear()

    for i in tqdm(range(100)):
        pass

    print("\nnumber of photos:", len(list_link))

    pos = 0
    for href in list_link:
        print(pos + 1, "DONE")
        imagePathResult = "./image/image_" + str(pos) + ".png"
        try:
            downloadFile(href)
            copy("./image/image.png", imagePathResult)
        except Exception:  # narrowed from a bare except
            # BUGFIX: '"error at %s" % pos + 1' raised TypeError (str + int).
            print("error at %s" % (pos + 1))
        pos += 1
    os.remove("./image/image.png")

    resultPath = os.path.join(os.getcwd(), 'image')
    os.startfile(resultPath)

    driver.close()
    chooseMenu()
    # NOTE(review): 'path', 'menu' (and the resulting 'key') appear to come
    # from module scope — confirm they are defined before this point runs.
    if (os.path.isfile(path)):
        key = 2
    else:
        key = 1
    menu(key)
Example #9
0
class QCourse:
    """Tencent Classroom (ke.qq.com) video downloader driven by Edge WebDriver.

    Cookies are cached in ``cookies.json`` after an interactive QQ login and
    replayed on later runs; the media (.ts) and decryption-key URLs are mined
    from the browser performance log and handed to ``download_single``.
    """

    def __init__(self):
        # Edge (Chromium) options: default download dir, quiet logging, muted audio.
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches',
                                             ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")

        self.login_url = 'https://ke.qq.com/'

        self.driver = Edge(executable_path='msedgedriver.exe',
                           options=self.options)

    def login(self):
        """Open the login page, wait (≤300 s) for the user to finish QQ login, save cookies."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)

        # The login mask disappearing signals that the login completed.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))

        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        """Close the browser window."""
        self.driver.close()

    def get_video(self, video_url=None, path=None):
        """Play *video_url* with stored cookies and download its stream.

        Logs in first when no cookie cache exists. Does nothing (after a
        warning) when *video_url* is falsy.
        """
        if not video_url:
            print('请输入视频url!')
            return  # bug fix: previously fell through and called driver.get(None)
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        # First navigation establishes the domain so cookies can be attached.
        self.driver.get(video_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        self.driver.get(video_url)
        # Wait for the player clock to appear and start ticking.
        WebDriverWait(self.driver, 300).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
        WebDriverWait(
            self.driver,
            300).until_not(lambda driver: driver.find_element_by_class_name(
                'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')

        # Mine the performance log for the media segment (.ts) and key URLs.
        networks = self.driver.execute_script(
            'return window.performance.getEntries()')
        ts_url = key_url = ''
        for network in networks:
            if '.ts?start' in network.get('name'):
                ts_url = network.get('name')
            elif 'get_dk' in network.get('name'):
                key_url = network.get('name')
        title = self.driver.title
        download_single(ts_url, key_url, title, path)
예제 #10
0
     # NOTE(review): fragment — the opening `try:` and the ping measurement that
     # sets `pingms` are above this chunk; `browser`, `nb`, `qp`, `passed`,
     # `total`, `firstTime` are defined elsewhere.
     pingms=round(pingms,2)+1
except:
     # NOTE(review): bare except — narrow to the specific ping failure once known.
     pingms=2

# Per-action pacing delay derived from the measured ping.
print("-Calculated action delay: "+str(pingms))
# Maximize window for no real reason
browser.maximize_window()

# Open one Kahoot tab per requested bot and join the game with code `qp`.
for i in range(int(nb)):
     passed=passed+1
     try:
          # Open browser.
          if firstTime == True:
               browser.get("https://kahoot.it/")
          else:
               # Subsequent bots run in fresh tabs of the same browser.
               browser.execute_script("window.open('');")
               browser.switch_to.window(browser.window_handles[total])
               browser.get("https://kahoot.it/")
          #if firstTime == False:
          #     wait=WebDriverWait(browser, 3)
          #     alert=wait.until(EC.alert_is_present())
          #     alert.accept()
          time.sleep((pingms/2))
          # Find game id element and enter game code.
          search=browser.find_element_by_name("gameId")
          search.click()
          search.send_keys(qp)
          search.send_keys(Keys.RETURN)
          print("-Joined Game")
          print("-Entering name option")
          # Wait for browser to catch up. Edit equation later.
예제 #11
0
def chinahpo(hpo_queue):
    """Worker loop: scrape chinahpo.org search pages for each HPO term in *hpo_queue*.

    For every term (e.g. "HP:0000118") the rendered page source is appended
    to html2/hp_<id>.html, the term is logged to finish.txt, and proxies
    that produced a plausibly-sized page are recorded in ip_check_better.txt.
    """
    while not hpo_queue.empty():  # idiom fix: was `empty() is not True`
        hpo = hpo_queue.get()

        # Random delay between requests (skip this when rotating an IP pool).
        s = random.randint(5, 10)
        print(hpo, "等待 " + str(s) + "秒")
        time.sleep(s)
        ip = randomIP()
        hpo_ip = hpo + "\t" + ip
        print(hpo_ip)
        # Headless Edge disguised as a regular, non-automated browser.
        options = EdgeOptions()
        options.use_chromium = True
        options.add_argument("headless")
        options.add_argument("--proxy-server=http://{ip}".format(ip=ip))
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("start-maximized")
        options.add_experimental_option("excludeSwitches",
                                        ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)

        # Spoof geolocation/timezone to match the proxy's IP.
        geo = get_timezone_geolocation(ip)
        print(geo)
        geo_json = {"latitude": geo[1], "longitude": geo[2], "accuracy": 1}
        timezone = {"timezoneId": geo[0]}

        # Disable WebRTC routes that could leak the real IP around the proxy.
        preferences = {
            "webrtc.ip_handling_policy": "disable_non_proxied_udp",
            "webrtc.multiple_routes_enabled": False,
            "webrtc.nonproxied_udp_enabled": False
        }
        options.add_experimental_option("prefs", preferences)

        msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"

        driver = Edge(options=options, executable_path=msedge)
        # Hide navigator.webdriver and randomize the user agent via CDP.
        script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        driver.execute_script(script)
        UA = UserAgent().random
        driver.execute_cdp_cmd("Network.setUserAgentOverride",
                               {"userAgent": UA})
        driver.execute_cdp_cmd("Emulation.setGeolocationOverride", geo_json)
        driver.execute_cdp_cmd("Emulation.setTimezoneOverride", timezone)

        print(driver.execute_script("return navigator.userAgent;"))

        hpid = hpo.split(":")[1]
        url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
            hpid=hpid)

        try:
            driver.get(url)
            strtemp = url
            print("网址:", strtemp)
        except Exception:
            print("get page error", hpo)

        time.sleep(2)
        with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
            f.write(str(driver.page_source))

        driver.close()
        # Context managers replace the original manual open/close pairs.
        with open("finish.txt", "a") as fin:
            fin.write(hpo + "\n")

        # A page of 9–15 kB suggests the proxy returned real content.
        size = getDocSize("html2/hp_" + hpid + ".html")
        if 9000 <= size <= 15000:
            with open("ip_check_better.txt", "a") as checkIP:
                checkIP.write(hpo_ip + "\n")
예제 #12
0
def main():
    """Scrape verified-account tweets for each name in a CSV and run analysis models.

    ``argv[4]`` is a CSV of name,profession,nationality,job rows. For each
    name, Twitter is searched (verified accounts only), tweets are harvested
    via ``get_tweet_data``, written to ``<name>.csv`` and fed to the
    sentiment / toxicity / Big-5 models, ending in ``create_report``.
    """
    args = sys.argv
    # Context manager fixes the original's leaked file handle.
    with open(args[4], "r") as f:
        Lines = f.readlines()
    names, profession, nationality, job = [], [], [], []
    for line in Lines:
        array = line.split(",")
        names.append(array[0])
        profession.append(array[1])
        nationality.append(array[2])
        job.append(array[3].replace("\n", ""))
    for name in names:
        print("Query:", name, ".\nProcessing...")
        user = '******'
        search_term = f'{name} filter:verified'
        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        driver.get('https://www.twitter.com/login')
        driver.maximize_window()
        sleep(2)
        username = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        username.send_keys(user)
        password = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        password.send_keys('donkey123')
        password.send_keys(Keys.RETURN)
        sleep(1)
        search_input = driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.send_keys(search_term)
        search_input.send_keys(Keys.RETURN)
        sleep(1)
        driver.find_element_by_link_text('People').click()
        sleep(3)
        driver.find_element_by_xpath(
            '//div[@class="css-1dbjc4n r-j7yic r-qklmqi r-1adg3ll r-1ny4l3l"]'
        ).click()
        sleep(3)
        tweet_data = []
        start = 0
        end = 500
        # Scroll five screens, harvesting all visible tweet cards each pass.
        for i in range(0, 5):
            sleep(1)
            cards = driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # (dead `cards[i]` lookup removed: its result was unused and it
            # raised IndexError when fewer than five cards were present)
            for card in cards:
                data = get_tweet_data(card)
                if data:
                    tweet_data.append(data)
            driver.execute_script(f'window.scrollTo({start},{end});')
            start += 500
            end += 500
        driver.close()
        tweets = set(tweet_data)
        write_to_csv(name, tweets)
        df = pd.read_csv(f'{name}.csv')
        Twitter_sentiment = Twitter_sentiment_model(df)
        Twitter_toxic = Twitter_toxic_model(df)
        Big5 = Big5_model(df)

        create_report(name, tweets, Twitter_sentiment, Twitter_toxic, Big5)
예제 #13
0
def SeleniumPapago(text=None):
    """Translate *text* with Papago (papago.naver.com) driven through headless Edge.

    Relies on module-level ``srclang`` / ``tarlang`` for the language pair.
    Returns the translated string once Papago has rendered it; exits the
    process if the page does not become ready within 15 seconds.
    """
    # Locator pairs (by, value) for the three page elements we touch.
    webElements = {
        "input-textbox": ["id", "txtSource"],
        "output-textbox": ["id", "txtTarget"],
        "translate-btn": ["id", "btnTranslate"],
    }

    def find_webelement(element):
        webElement = driver.find_element(element[0], element[1])
        return webElement

    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument("headless")
    options.add_argument("disable-gpu")
    options.add_argument("lang=ko_KR")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    )

    driver = Edge(options=options)
    driver.get(f"https://papago.naver.com/?sk={srclang}&tk={tarlang}")

    ptime = time()

    try:
        wait = WebDriverWait(driver, timeout=15)
        wait.until(
            ec.visibility_of_element_located(webElements["translate-btn"]))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        # Waits for 15 seconds before Timeout
        exit("Webpage Timed out!")

    logging.info(f"time {time()-ptime}")

    input_textbox = find_webelement(webElements["input-textbox"])
    output_textbox = find_webelement(webElements["output-textbox"])
    translate_button = find_webelement(webElements["translate-btn"])

    source = text

    # Creates a JavaScript to input the batch to the WebElement input_textbox
    script = "var ele = " + repr(
        source) + ";" + "\n document.getElementById('txtSource').value=ele;"
    driver.execute_script(script)
    input_textbox.send_keys(" ")

    sleep(0.1)
    translate_button.click()

    # Poll until the output box contains a non-empty translation.
    translated = ""
    wait = 0
    while True:
        if wait == 4:
            wait = 0  # cycle the "waiting..." dot animation
        sleep(0.05)

        translated = output_textbox.text
        logging.info(f"waiting{wait*'.'} ,{source}")
        wait += 1
        # Fixed `== None` comparison; membership covers "", None and " ".
        if translated not in ("", None, " "):
            logging.info(translated)
            break

    return translated
예제 #14
0
    # NOTE(review): fragment — the enclosing loop, `driver`, `url_names`,
    # `images_women`, `names_men` and the Select/ActionChains imports are
    # defined above this chunk.
    #img_str = base64.b64encode(buffered.getvalue())
    images_women.append(src)

driver.get(url_names)

# Request 95 generated names via the quantity field.
txt_box = driver.find_element_by_xpath('//*[@id="main"]/div/form/input[3]')
txt_box.clear()
txt_box.send_keys("95")

# men
select = Select(driver.find_element_by_xpath('//*[@id="gender"]'))
select.select_by_visible_text('male')
time.sleep(2)
# Dismiss what appears to be the Quantcast (qc-cmp2) consent dialog first.
driver.find_element_by_xpath('//*[@id="qc-cmp2-ui"]/div[2]/div/button[2]').click()
time.sleep(3)
driver.execute_script("window.scrollTo(0, 1080)") 
driver.find_element_by_xpath('//*[@id="main"]/div/form/input[4]').click()
time.sleep(5)
names = driver.find_elements_by_class_name('name_heading')
for name in names:
    names_men.append(name.text)

# women
select = Select(driver.find_element_by_xpath('//*[@id="gender"]'))
select.select_by_visible_text('female')
time.sleep(2)
# Submit via ActionChains this time — presumably the button can be off-screen.
actions = ActionChains(driver)
element = driver.find_element_by_xpath('//*[@id="main"]/div/form/input[4]')
actions.move_to_element(element).perform()
time.sleep(5)
names = driver.find_elements_by_class_name('name_heading')
    # NOTE(review): fragment — the enclosing function (providing `driver` and
    # `tract_id`) is not visible in this chunk.
    print("Getting atlas")
    # Load the Opportunity Atlas; the map app is heavy, so wait generously.
    driver.get("https://opportunityatlas.org/")

    time.sleep(10)

    # Click "Get Started" on the intro screen (absolute XPath — brittle).
    get_started = driver.find_element_by_xpath(
        "/html/body/div[3]/div[2]/div[1]/p[5]/button")
    get_started.click()

    # Remove the introduction dialog from the DOM so it cannot block clicks.
    element = driver.find_element_by_id('introductionDialog')
    driver.execute_script(
        """
		var element = arguments[0];
		element.parentNode.removeChild(element);
		""", element)

    # Search for the census tract and give the map time to render.
    search_box = driver.find_element_by_xpath(
        '/html/body/div[2]/div[1]/div[2]/div[3]/div[2]/input')
    search_box.clear()
    search_box.send_keys(tract_id)
    time.sleep(2)
    search_box.send_keys(Keys.ENTER)
    time.sleep(10)

    driver.save_screenshot('test.png')

    #with io.BytesIO(driver.get_screenshot_as_png()) as f:
    #    f.write('./test.png')
예제 #16
0
# Flat script: search Google for a random phrase, open the first result and
# scroll to the bottom. Left byte-identical — every statement is a live
# browser side effect whose order matters.
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
import time
import random

options = EdgeOptions()
options.use_chromium = True



# Candidate search phrases; one is picked at random below.
word_list=["stack overflow how to capture traffic",    "java display thread",  "p2p network"]
driver = Edge(options=options)
driver.get("https://www.google.com/")
driver.maximize_window()
time.sleep(1)
# NOTE(review): "zV9nZe" looks like a generated element id (consent button?) — verify it still exists.
inputElement = driver.find_element_by_id("zV9nZe")
inputElement.click()
# Absolute XPath to the search box; brittle against Google layout changes.
inputElement = driver.find_element_by_xpath("/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input")
inputElement.send_keys(random.choice(word_list), Keys.ENTER)

# Open the first result link, then scroll the article to the bottom.
element = driver.find_element_by_xpath("/html/body/div[7]/div/div[9]/div[1]/div/div[2]/div[2]/div/div/div/div[1]/div/div[1]/a")
time.sleep(1)
driver.get(element.get_attribute('href'))
time.sleep(1)
lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")

time.sleep(4)
driver.close()

예제 #17
0
import random

# Headless Edge configured to suppress automation fingerprints
# (AutomationControlled blink flag, excludeSwitches, navigator.webdriver).
options = EdgeOptions()
options.use_chromium = True
options.add_argument("headless")
# options.add_argument("disable-gpu")
options.add_argument("--disable-blink-features")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"

driver = Edge(options=options, executable_path=msedge)
# Hide navigator.webdriver from the page's JavaScript.
script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
driver.execute_script(script)

url = "https://www.deciphergenomics.org/genes"

driver.get(url)
print("网址:", url)
# Wait for the (slow) gene table to load.
time.sleep(40)

# Locate the page-size dropdown and select 100 rows per page.
driver.find_element_by_xpath(
    '//*[@id="content"]/div/div/div[2]/div/div/div[2]/div/div[1]/div/label/select/option[@value="100"]'
).click()
time.sleep(10)

# Save the first page (script fragment ends here).
예제 #18
0
def download(url):
    """Download a doc88.com (道客巴巴) document as page PNGs and convert to PDF.

    Renders every page canvas in a Chromium Edge session, saves each page
    under ./temp/<title>/, then calls ``conpdf`` to assemble
    output/<title>.pdf.
    """
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('log-level=3')
    driver = Edge(options=options)

    title = "output"
    try:
        driver.set_page_load_timeout(15)
        driver.get(url)
        title = driver.title
    except Exception:
        # Narrowed from a bare `except:`; a load timeout is tolerable — the
        # page keeps rendering while we scrape the canvases below.
        print("Timeout - start download anyway.")

    print(f'道客巴巴: 《{title}》')
    time.sleep(5)

    try:
        # Expand the full document ("continue reading" button), if present.
        elem_cont_button = driver.find_element_by_id("continueButton")
        driver.execute_script(
            "arguments[0].scrollIntoView(true);", elem_cont_button)
        actions = ActionChains(driver)
        actions.move_to_element(elem_cont_button).perform()
        time.sleep(0.5)
        elem_cont_button.click()
    except NoSuchElementException:
        pass

    # The page count is the last token of a status text like "... / N".
    num_of_pages = driver.find_element_by_id('readshop').find_element_by_class_name(
        'mainpart').find_element_by_class_name('shop3').find_element_by_class_name('text').get_attribute('innerHTML')
    num_of_pages = int(num_of_pages.split(' ')[-1])

    # Zoom in five times so the rendered canvases are high-resolution.
    for i in range(5):
        driver.find_element_by_id('zoomInButton').click()
        time.sleep(0.5)

    # Start from a clean per-title temp directory.
    if os.path.exists(f'./temp/{title}'):
        shutil.rmtree(f'./temp/{title}')
    os.makedirs(f'./temp/{title}')

    for pages in trange(num_of_pages):
        time.sleep(0.5)

        canvas_id = "page_" + str(pages + 1)
        pagepb_id = "pagepb_" + str(pages + 1)

        # Scroll the page's canvas into view so the site renders it.
        element = driver.find_element_by_id(canvas_id)
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        actions = ActionChains(driver)
        actions.move_to_element(element).perform()
        time.sleep(0.5)

        # The per-page progress element empties once the page has loaded.
        while len(driver.find_element_by_id(pagepb_id).get_attribute('innerHTML')) != 0:
            time.sleep(1)

        # Export the canvas as a base64 data URL and strip the
        # "data:image/png;base64," prefix (22 characters).
        js_cmd = "var canvas = document.getElementById('{}');".format(canvas_id) + \
            "return canvas.toDataURL();"
        img_data = driver.execute_script(js_cmd)

        img_data = (img_data[22:]).encode()

        with open(f"./temp/{title}/{pages}.png", "wb") as fh:
            fh.write(base64.decodebytes(img_data))
    driver.quit()
    print('下载完毕,正在转码')
    conpdf(f'output/{title}.pdf', f'temp/{title}', '.png')
예제 #19
0
def scrape(secure=False):
    """Scrape 'Latest' tweets matching a hard-coded Urdu query.

    Logs into Twitter with hard-coded (redacted) credentials — when
    ``secure`` is True a second verification login is performed — then
    scrolls the result feed until the page stops growing, deduplicating
    tweets extracted by ``scrap_tweets``. Returns the list of tweet tuples.
    """

    options = EdgeOptions()
    options.use_chromium = True
    driver = Edge(options=options)

    # NOTE(review): `query` is read but never used — the search text below is hard-coded.
    query = input("▁ ▂ ▄ ▅ ▆ ▇ █ 𝐄𝐧𝐭𝐞𝐫 𝐭𝐡𝐞 𝐓𝐞𝐱𝐭 𝐭𝐨 𝐬𝐞𝐚𝐫𝐜𝐡 █ ▇ ▆ ▅ ▄ ▂ ▁\n\n ")

    print("\n𝘚𝘵𝘢𝘳𝘵𝘦𝘥 𝘚𝘤𝘳𝘢𝘱𝘪𝘯𝘨 ↦↦↦↦↦↦↦↦↦↦")
    print("\nPlease Wait ............\n")

    driver.get("https://www.twitter.com/login")
    driver.maximize_window()

    username = driver.find_element_by_xpath(
        '//input[@name="session[username_or_email]"]')
    username.send_keys("*****@*****.**")
    #password=getpass()

    userpas = driver.find_element_by_xpath(
        '//input[@name="session[password]"]')
    userpas.send_keys('-----')
    userpas.send_keys(Keys.RETURN)
    sleep(2)

    if secure:
        # Second challenge screen for some accounts: re-enter credentials.
        username = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        username.send_keys("031-----")

        userpas = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        userpas.send_keys('----')
        userpas.send_keys(Keys.RETURN)
        sleep(2)

    search = driver.find_element_by_xpath(
        '//input[@aria-label="Search query"]')
    search.send_keys('"پاک فوج" lang:ur -filter:links filter:replies')
    search.send_keys(Keys.RETURN)
    sleep(1.5)
    driver.find_element_by_link_text("Latest").click()
    data = []
    tweet_ids = set()
    last_position = driver.execute_script("return window.pageYOffset;")
    scrolling = True

    while scrolling:
        # Only the newest ~15 cards can be fresh; older ones were already seen.
        posts = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
        for post in posts[-15:]:
            tweet = scrap_tweets(post)
            if tweet:
                # Concatenated tweet fields act as the dedup key.
                tweet_id = "".join(tweet)
                if tweet_id not in tweet_ids:
                    tweet_ids.add(tweet_id)
                    data.append(tweet)

        scroll_attempt = 0
        while True:
            driver.execute_script(
                "window.scrollTo(0,document.body.scrollHeight);")
            sleep(1)

            # Three consecutive no-movement scrolls means the feed is exhausted.
            curr_position = driver.execute_script("return window.pageYOffset;")
            if last_position == curr_position:
                scroll_attempt += 1

                if scroll_attempt >= 3:
                    scrolling = False
                    break

                else:
                    sleep(2)
            else:
                last_position = curr_position
                break
    return data
예제 #20
0
from selenium.webdriver.common.keys import Keys
import time
import random

# Visit the Portuguese Wikipedia front page, jump to a random networking
# article, then read it by scrolling down in 300-pixel steps.
options = EdgeOptions()
options.use_chromium = True
article_urls = [
    "https://pt.wikipedia.org/wiki/Border_Gateway_Protocol",
    "https://pt.wikipedia.org/wiki/Multi_Protocol_Label_Switching",
    "https://pt.wikipedia.org/wiki/Open_Shortest_Path_First"
]
driver = Edge(options=options)
driver.get("https://pt.wikipedia.org/")
driver.maximize_window()
time.sleep(5)
driver.get(random.choice(article_urls))

# Five scroll steps — (0,300), (300,600), ... (1200,1500) — 4 s apart,
# replacing the original's five copy-pasted sleep/scroll pairs.
for top in range(0, 1500, 300):
    time.sleep(4)
    driver.execute_script(f"window.scrollTo({top}, {top + 300})")

time.sleep(4)
driver.close()
def main():
    """Bulk-download Google Images results for a search term read from stdin.

    Reads the search text and a requested image count via ``input()``,
    scrolls the image-search page until enough thumbnails are loaded,
    extracts candidate image URLs from the raw page source, downloads them
    into ``download_path/<search_text>/``, and finally hands the remaining
    links to ``img_scp.img_download``.
    """
    searchtext = input()
    num_requested = int(input())
    # Each scroll batch exposes roughly 400 images in the browser.
    number_of_scrolls = num_requested / 400 + 1

    if not os.path.exists(download_path + searchtext.replace(" ", "_")):
        os.makedirs(download_path + searchtext.replace(" ", "_"))

    url = "https://www.google.co.in/search?q="+searchtext+"&source=lnms&tbm=isch"
    chrome_driver_path = "msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"
    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path = chrome_driver_path, options = option)
    driver.get(url)

    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    extensions = {"jpg", "jpeg", "png", "gif"}
    img_count = 0
    downloaded_img_count = 0

    # Scroll to the bottom repeatedly, clicking "Show more results" when offered.
    for _ in range(int(number_of_scrolls)):
        for __ in range(15):
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        time.sleep(0.5)
        try:
            driver.find_element_by_xpath(
                "//input[@value='Show more results']").click()
        except Exception as e:
            print("Less images found: {}".format(e))
            break

    # Crude URL extraction: split the HTML on quotes and pattern-match pieces.
    html = driver.page_source.split('"')
    imges = []
    links = []
    for i in html:
        if i.startswith('https:') and ('gstatic' not in i) and ('google' not in i):
            links.append(i.split('"')[0])
    for i in html:
        if i.startswith('http') and 'usqp=CAU' in i.split('.')[-1]:
            imges.append(i.split('"')[0])
    for i in html:
        if i.startswith('http') and i.split('"')[0].split('.')[-1] in extensions:
            imges.append(i.split('"')[0])
    links = list(set(links))
    imges = list(set(imges))
    print(imges)
    links_left = Diff(links, imges)

    # Order-preserving dedup (was a side-effecting list comprehension).
    urls_new = []
    for x in links_left:
        if x not in urls_new:
            urls_new.append(x)

    # Context manager fixes the original's never-closed file handle.
    with open("page_source.txt", "w", encoding='utf8') as file1:
        file1.writelines(urls_new)
    img_type = []
    print("Total images: {}\n".format(len(imges)))
    for img in imges:
        img_count += 1
        print("Downloading image {}:{}".format(img_count, img))
        img_type = img.rsplit('.', 1)
        try:
            req = Request(img, headers=headers)
            raw_img = urlopen(req).read()
            # `with` replaces the original's `f.close` (missing parentheses —
            # the file was never actually closed).
            with open(download_path+searchtext.replace(" ", "_")+"/" +
                      str(downloaded_img_count)+"."+"jpeg", "wb") as f:
                f.write(raw_img)
            downloaded_img_count += 1
        except Exception as e:
            print("Download failed: {}".format(e))
        if downloaded_img_count >= num_requested:
            break

    print("Total downloaded: {}/{}".format(downloaded_img_count, img_count))
    print("Total images: {}\n".format(len(urls_new)))

    for url in urls_new:
        img_count = img_scp.img_download(url, download_path+searchtext.replace(" ", "_")+"/", img_count)
    driver.quit()
예제 #22
0
class QCourse:
    """Tencent Classroom (ke.qq.com) course downloader driven by Edge WebDriver.

    Logs in via QQ (cookies cached in ``cookies.json``), then per lesson
    either rips the video stream (``download_single``) or, when no video
    appears, fetches the downloadable attachment (``download_zip_doc``).
    """

    def __init__(self):
        # Edge (Chromium) options: default download dir, quiet logging, muted audio.
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches',
                                             ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")

        self.login_url = 'https://ke.qq.com/'

        # Passing `options` raised errors on macOS, so it is omitted here.
        # On Windows, use the msedgedriver.exe variant below instead
        # (comment out this statement and enable the commented line).
        self.driver = Edge(executable_path=os.path.join(
            BASE_DIR, 'msedgedriver'),
                           capabilities={})
        # self.driver = Edge(executable_path='msedgedriver.exe', options=self.options)

    def login(self):
        """Open ke.qq.com, wait (≤300 s) for the user to complete QQ login, save cookies."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)

        # The login mask disappearing signals that the login completed.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))

        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        """Close the browser window."""
        self.driver.close()

    def _get_video(self, video_url=None, path=None, index=None):
        """Download one lesson; fall back to attachment download on timeout."""
        if not video_url:
            print('请输入视频url!')
            return  # bug fix: previously continued and navigated to None
        # Navigate twice: a single get is swallowed (likely preventDefault).
        self.driver.get(video_url)
        self.driver.get(video_url)
        try:
            # Wait for the player clock to appear and start ticking.
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
            WebDriverWait(
                self.driver,
                60).until_not(lambda driver: driver.find_element_by_class_name(
                    'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')

            title = self.driver.title
            if index is not None:
                # Prefix with a zero-padded index for ordered filenames.
                title = "{:02}_{}".format(index, title)

            # Mine the performance log for the media (.ts) and key URLs.
            networks = self.driver.execute_script(
                'return window.performance.getEntries()')
            ts_url = key_url = ''
            for network in networks:
                if '.ts?start' in network.get('name'):
                    ts_url = network.get('name')
                elif 'get_dk' in network.get('name'):
                    key_url = network.get('name')
            download_single(ts_url, key_url, title, path)
        except TimeoutException:
            # No video appeared: the lesson may be a document — look for a
            # download button and fetch the attachment instead.
            title = self.driver.title
            try:
                down_btn = self.driver.find_element_by_class_name(
                    'download-btn')
                if down_btn.text == '下载资料':
                    url = down_btn.get_attribute('href')
                    download_zip_doc(url, title, path)
            except Exception:
                print('没有找到视频,也没有找到可下载的文件,可能是还未开课')

    def get_video(self, video_url=None, path=None, index=None):
        """Accept a single URL or a list of URLs and download each."""
        if isinstance(video_url, list):
            for url in video_url:
                if url:
                    self._get_video(url, path, index)
        else:
            self._get_video(video_url, path, index)

    def load_cookies(self):
        """Load cached cookies (logging in first if needed) into the driver session."""
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        self.driver.get(self.login_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        # Supplementary cookies provided by the project's utils module.
        for cookie in utils.get_cookies_dic_list():
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': False,
                'name': cookie[0],
                'path': '/',
                'secure': False,
                'value': cookie[1]
            })
class TwitterBot():
    """Minimal Twitter automation: login, search, and tweet scraping via Edge."""

    def __init__(self):
        # Open twitter.com up front so subsequent element lookups have a page.
        self.driver = Edge()
        self.driver.maximize_window()
        self.driver.get('https://twitter.com')
        self.driver.implicitly_wait(3)

    def goToTwitter(self):
        """Navigate (back) to the Twitter home page."""
        self.driver.get('https://twitter.com')

    def login(self):
        """Log in using the module-level ``username``/``password`` globals."""
        self.driver.find_element_by_xpath("//a[@href='/login']").click()

        # The form is re-rendered shortly after opening; waiting ensures we
        # interact with the final instance of the inputs, not a stale one.
        sleep(1)
        self.driver.find_element_by_xpath(
            "//input[@name='session[username_or_email]']").send_keys(username)
        self.driver.find_element_by_xpath(
            "//input[@name='session[password]']").send_keys(password)

        self.driver.find_element_by_xpath(
            "//div[@data-testid='LoginForm_Login_Button']").click()

    def basicSearch(self, topic):
        """Type *topic* into the search box and submit the search."""
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").send_keys(topic)
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").submit()

    def advancedSearch(self, exact, any, none, hashtags, dateFrom, dateTo):
        """Build and submit an advanced-search query.

        Each non-None argument contributes one operator to the query.
        NOTE(review): ``any``/``none`` shadow builtins, but the names are
        part of the caller-visible signature and are kept for compatibility.
        """
        finalSearch = ''
        if exact is not None:
            finalSearch += '"' + exact + '" '
        if any is not None:
            finalSearch += '(' + any + ') '
        if none is not None:
            finalSearch += '-' + none + ' '
        if hashtags is not None:
            finalSearch += '(#' + hashtags + ') '
        if dateTo is not None:
            finalSearch += 'until:' + dateTo + ' '
        if dateFrom is not None:
            finalSearch += 'since:' + dateFrom + ' '

        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").send_keys(
                finalSearch)
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").submit()

    def scrapeTweets(self, desiredNum):
        """Scrape up to *desiredNum* cleaned tweet lines into ``tweets.csv``.

        Returns the concatenation of all accepted (uncleaned) lines.
        Scrolling stops when the desired count is reached or when a pass
        produces no new content (``oldDataLines == dataLines``).
        """
        allLines = ''
        oldDataLines = []
        dataLines = ['init']

        # Exact strings that are UI chrome rather than tweet content.
        dirtyArray = [
            'Quote Tweet', 'Promoted', 'Show this thread', '', '\n', ' '
        ]
        numDataLines = 0
        # Hoisted out of the loop: strips punctuation before writing.
        punct_re = re.compile(r'[^\w\s]')

        # utf-8 so non-ASCII tweets don't raise under Windows' default
        # codec; `with` closes the file even if scraping raises mid-way.
        with open('tweets.csv', 'w', encoding='utf-8') as tweetsFile:
            while numDataLines < desiredNum and oldDataLines != dataLines:
                oldDataLines = dataLines
                sleep(1)

                # Elements whose text must be excluded from the output.
                dirtyData = self.driver.find_elements_by_xpath(
                    "//div[@class='css-1dbjc4n r-1d09ksm r-18u37iz r-1wbh5a2']")
                dirtyData.extend(self.driver.find_elements_by_xpath(
                    "//div[@class = 'css-1dbjc4n r-18u37iz r-1wtj0ep r-156q2ks r-1mdbhws']"
                ))
                dirtyData.extend(self.driver.find_elements_by_xpath(
                    "//div[contains(text(),'Replying to')]"))
                dirtyData.extend(self.driver.find_elements_by_xpath(
                    "//div[@role = 'blockquote']"))

                # Element text is multi-line; index the individual lines.
                dirtyLines = [
                    chunk for el in dirtyData for chunk in el.text.split('\n')
                ]

                # Candidate tweet text (still contains dirty lines).
                data = self.driver.find_elements_by_xpath(
                    "//div[@data-testid='tweet']")
                dataLines = [
                    chunk for el in data for chunk in el.text.split('\n')
                ]

                # oldDataLines filters lines already captured last pass.
                for line in dataLines:
                    if (line in dirtyLines or line in oldDataLines
                            or line in dirtyArray):
                        continue
                    if numDataLines >= desiredNum:
                        break
                    try:
                        tweetsFile.write(punct_re.sub('', line))
                        tweetsFile.write("\n")
                        allLines += line
                        numDataLines += 1
                    except Exception:
                        print('This data point not encodable.')

                # Scroll to the bottom to trigger loading of more tweets.
                height = self.driver.execute_script(
                    "return document.documentElement.scrollHeight")
                self.driver.execute_script(
                    "window.scrollTo(0, " + str(height) + ");")

        return allLines
예제 #24
0
 password.send_keys(my_password)
 password.send_keys(Keys.RETURN)
 sleep(1)
 
 # find search input and search for term
 search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
 search_input.send_keys(search_term)
 search_input.send_keys(Keys.RETURN)
 sleep(1)
 
 # navigate to historical 'latest' tab
 driver.find_element_by_link_text('Latest').click()
 
 # get all tweets on the page
 tweet_ids = set()
 last_position = driver.execute_script("return window.pageYOffset;")
 scrolling = True
 while scrolling:
     page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
     for card in page_cards[-15:]:
         if card is not None:
             tweet = get_tweet_data(card)
             if tweet is not None:
                 tweetL = list(tweet)
                 tweetL.append(x)
                 tweetL.append(j)
                 tweet = tuple(tweetL)
                 if tweet:
                     if tweet[2]<cutoff_date:
                         scrolling=False
                     else:
예제 #25
0
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
import time

# Launch a Chromium-based Edge session.
options = EdgeOptions()
options.use_chromium = True

driver = Edge(options=options)
driver.maximize_window()

# Open YouTube, scroll to the bottom, then send four TABs plus ENTER to the
# page body (presumably to dismiss the initial overlay — confirm on site).
driver.get("https://www.youtube.com/")
time.sleep(1)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
page_body = driver.find_element_by_tag_name('body')
page_body.send_keys(Keys.TAB * 4, Keys.ENTER)

time.sleep(1)

# Open the search results page and click the first video in the list.
driver.get("https://www.youtube.com/results?search_query=Drive+Drive+Drive+song+(Impractical+Jokers)+-+2+HOUR+VERSION")
time.sleep(5)
first_video = driver.find_element_by_xpath("/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-video-renderer[1]")
first_video.click()

# Let the video play, then shut the browser down.
time.sleep(400)
driver.close()