Example #1
    def before(self):
        self.driver = init_driver()
        self.page_factory = PageFactory(self.driver)

        yield
        time.sleep(2)
        self.driver.quit()
Example #2
    def before_func(self):
        """前置操作"""
        self.driver = init_driver()  # 获取驱动对象
        self.page_factory = PageFactory(self.driver)

        yield
        time.sleep(2)
        self.driver.quit()
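
Examples #1 and #2 show the same yield-based setup/teardown pattern: everything before the yield runs as setup, everything after as cleanup. A minimal self-contained sketch of how it is typically wired up (pytest assumed; the stub init_driver and PageFactory are hypothetical stand-ins for the project's own helpers):

import time
import pytest
from selenium import webdriver

def init_driver():
    # hypothetical stand-in for the project's init_driver helper
    return webdriver.Chrome()

class PageFactory:
    # hypothetical stand-in for the project's PageFactory
    def __init__(self, driver):
        self.driver = driver

class TestExample:

    @pytest.fixture(autouse=True)
    def before(self):
        # setup: runs before each test in the class
        self.driver = init_driver()
        self.page_factory = PageFactory(self.driver)
        yield  # the test body executes here
        # teardown: runs after each test, even on failure
        time.sleep(2)
        self.driver.quit()

    def test_smoke(self):
        assert self.driver is not None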
Example #3
def get_user_information(user, driver=None):
    """ get user information if the "from_account" argument is specified """

    # honor a caller-supplied driver instead of always creating a new one
    if driver is None:
        driver = utils.init_driver()

    log_user_page(user, driver)

    if user is not None:

        try:
            following = driver.find_element_by_xpath(
                '//a[contains(@href,"/' + user +
                '/following")]/span[1]/span[1]').text
            followers = driver.find_element_by_xpath(
                '//a[contains(@href,"/' + user +
                '/followers")]/span[1]/span[1]').text
        except Exception as e:
            print(e)
            return

        try:
            span1 = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//span[1]'
            ).text
            span2 = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//span[2]'
            ).text
            join_date = span2
            location = span1

        except Exception:
            # only one header item present: treat it as the join date
            join_date = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//span[1]'
            ).text
            location = ""

        try:
            element = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//a[1]'
            )
            website = element.get_attribute("href")
        except Exception as e:
            print(e)
            website = ""

        try:
            desc = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserDescription")]').text
        except Exception as e:
            print(e)
            desc = ""

        return following, followers, join_date, location, website, desc

    else:
        print("You should specify the user.")
        return
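
A hedged usage sketch for the function above (the handle "jack" is purely illustrative; the function returns a 6-tuple on success and None on failure):

info = get_user_information("jack")
if info is not None:
    following, followers, join_date, location, website, desc = info
    print(followers, "followers,", following, "following")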
Example #4
def scrap(start_date,
          max_date,
          words=None,
          to_account=None,
          from_account=None,
          interval=5,
          navig="chrome",
          lang=None,
          headless=True,
          limit=float("inf"),
          display_type="Top",
          resume=False,
          proxy=None,
          hashtag=None):
    """
    scrap data from twitter using requests, starting from start_date until max_date. The bot make a search between each start_date and end_date
    (days_between) until it reaches the max_date.

    return:
    data : df containing all tweets scraped with the associated features.
    save a csv file containing all tweets scraped with the associated features.
    """

    # initiate the driver
    driver = init_driver(navig, headless, proxy)

    data = []
    tweet_ids = set()
    save_dir = "outputs"
    write_mode = 'w'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # start scraping from start_date until max_date
    init_date = start_date  # used for saving file
    # add interval to start_date to get end_date for the first search
    if words:
        path = save_dir + "/" + words.split("//")[0] + '_' + str(init_date).split(' ')[0] + '_' + \
               str(max_date).split(' ')[0] + '.csv'
    elif from_account:
        path = save_dir + "/" + from_account + '_' + str(init_date).split(
            ' ')[0] + '_' + str(max_date).split(' ')[0] + '.csv'
    elif to_account:
        path = save_dir + "/" + to_account + '_' + str(init_date).split(
            ' ')[0] + '_' + str(max_date).split(' ')[0] + '.csv'
    elif hashtag:
        path = save_dir + "/" + hashtag + '_' + str(init_date).split(
            ' ')[0] + '_' + str(max_date).split(' ')[0] + '.csv'

    if resume:
        start_date = str(get_last_date_from_csv(path))[:10]
        write_mode = 'a'
    # start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.datetime.strptime(
        start_date, '%Y-%m-%d') + datetime.timedelta(days=interval)

    refresh = 0
    # save_every = interval  #save every "interval" days

    # keep searching until max_date

    with open(path, write_mode, newline='', encoding='utf-8') as f:
        header = [
            'UserScreenName', 'UserName', 'Timestamp', 'Text', 'Emojis',
            'Comments', 'Likes', 'Retweets', 'Image link', 'Tweet URL'
        ]
        writer = csv.writer(f)
        if write_mode == 'w':
            writer.writerow(header)
        while end_date <= datetime.datetime.strptime(max_date, '%Y-%m-%d'):
            # number of scrolls
            scroll = 0

            # log search page between start_date and end_date
            if type(start_date) != str:
                log_search_page(driver=driver,
                                words=words,
                                start_date=datetime.datetime.strftime(
                                    start_date, '%Y-%m-%d'),
                                end_date=datetime.datetime.strftime(
                                    end_date, '%Y-%m-%d'),
                                to_account=to_account,
                                from_account=from_account,
                                lang=lang,
                                display_type=display_type,
                                hashtag=hashtag)
            else:
                log_search_page(driver=driver,
                                words=words,
                                start_date=start_date,
                                end_date=datetime.datetime.strftime(
                                    end_date, '%Y-%m-%d'),
                                to_account=to_account,
                                from_account=from_account,
                                hashtag=hashtag,
                                lang=lang,
                                display_type=display_type)

            # number of logged pages (refresh each <between_days>)
            refresh += 1
            # number of days crossed
            days_passed = refresh * interval

            # last position of the page: the purpose is to know whether we reached the end of the page or not, so
            # that we refresh for another <start_date> and <end_date>
            last_position = driver.execute_script("return window.pageYOffset;")
            # should we keep scrolling ?
            scrolling = True

            print("looking for tweets between " + str(start_date) + " and " +
                  str(end_date) + " ...")

            # start scrolling and get tweets
            tweet_parsed = 0

            driver, data, writer, tweet_ids, scrolling, tweet_parsed, scroll, last_position = \
                keep_scroling(driver, data, writer, tweet_ids, scrolling, tweet_parsed, limit, scroll, last_position)

            # keep updating <start date> and <end date> for every search
            if type(start_date) == str:
                start_date = datetime.datetime.strptime(
                    start_date, '%Y-%m-%d') + datetime.timedelta(days=interval)
            else:
                start_date = start_date + datetime.timedelta(days=interval)
            end_date = end_date + datetime.timedelta(days=interval)

    # close the web driver
    driver.close()

    return data
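
A hedged usage sketch for scrap (all argument values are illustrative; the CSV lands under outputs/ per the path logic above):

tweets = scrap(start_date="2023-01-01",
               max_date="2023-01-31",
               words="python",
               interval=5,
               headless=True,
               limit=100,
               display_type="Top")
print(len(tweets), "tweets collected")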
Example #5
    def test_func(self):
        self.driver = init_driver()  # instantiate the driver object
        self.factor_page = FactoryPage(self.driver)  # instantiate the page factory class
        yield
        time.sleep(3)
        self.driver.quit()
Example #6
args = parser.parse_args()
conf = load_config(args.conf)
parameters = conf["parameters"]
credentials = conf["credentials"]
CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
QUERIES = parameters["JOB_QUERIES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
MONGOUSER = credentials["MONGOUSER"]
MONGOPWD = credentials["MONGOPWD"]
HOST = parameters["HOST"]
client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
db = client["linkedin"]
jobs = db["jobs"]
driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)
JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords="
for query in QUERIES:
    driver.get(JOB_SEARCH_URL + query)
    sleep(0.5)
    scroll_job_panel(driver)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    n_results_element = soup.find(class_="t-12 t-black--light t-normal")
    n_results_string = n_results_element.get_text()
    n_results = int(n_results_string.split()[0].replace(',', ''))
    job_urls = get_job_urls(soup)
    start = 25
    url = JOB_SEARCH_URL + query + "&start=" + str(start)
    while start < n_results:
Example #7
def test_driver():
    driver = utils.init_driver(browser='chrome', debug=True)
    assert True
Example #8
def get_user_information(users, driver=None, headless=True):
    """ get user information if the "from_account" argument is specified """

    # honor a caller-supplied driver instead of always creating a new one
    if driver is None:
        driver = utils.init_driver(headless=headless)

    users_info = {}

    for i, user in enumerate(users):

        log_user_page(user, driver)

        if user is not None:

            try:
                following = driver.find_element_by_xpath(
                    '//a[contains(@href,"/following")]/span[1]/span[1]').text
                followers = driver.find_element_by_xpath(
                    '//a[contains(@href,"/followers")]/span[1]/span[1]').text
            except Exception:
                # could not read the follower counts; give up on this run
                return

            try:
                element = driver.find_element_by_xpath('//div[contains(@data-testid,"UserProfileHeader_Items")]//a[1]')
                website = element.get_attribute("href")
            except Exception:
                website = ""

            try:
                desc = driver.find_element_by_xpath('//div[contains(@data-testid,"UserDescription")]').text
            except Exception:
                desc = ""
            try:
                join_date = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[3]').text
                birthday = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[2]').text
                location = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
            except Exception:
                # fewer than three header items: work out which ones are present
                try:
                    join_date = driver.find_element_by_xpath(
                        '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[2]').text
                    span1 = driver.find_element_by_xpath(
                        '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
                    if hasNumbers(span1):
                        birthday = span1
                        location = ""
                    else:
                        location = span1
                        birthday = ""
                except Exception:
                    try:
                        join_date = driver.find_element_by_xpath(
                            '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
                        birthday = ""
                        location = ""
                    except Exception:
                        join_date = ""
                        birthday = ""
                        location = ""
            print("--------------- " + user + " information : ---------------")
            print("Following : ", following)
            print("Followers : ", followers)
            print("Location : ", location)
            print("Join date : ", join_date)
            print("Birth date : ", birthday)
            print("Description : ", desc)
            print("Website : ", website)
            users_info[user] = [following, followers, join_date, birthday, location, website, desc]

            if i == len(users) - 1:
                driver.close()
                return users_info
        else:
            print("You must specify the user")
            continue
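
The nested try/except ladder above exists only because each profile-header span may or may not be present. A hedged refactoring sketch using a hypothetical safe_text helper (not part of the original code; note the original additionally reshuffles values via hasNumbers when spans are missing, which this sketch omits):

from selenium.common.exceptions import NoSuchElementException

def safe_text(driver, xpath, default=""):
    # hypothetical helper: return the element's text, or `default` when absent
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default

header = '//div[contains(@data-testid,"UserProfileHeader_Items")]'
location = safe_text(driver, header + '/span[1]')
birthday = safe_text(driver, header + '/span[2]')
join_date = safe_text(driver, header + '/span[3]')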
Example #9
    "Poland Visa Application Center-Mogilev",
    "applicatest": [{
        "name": "Bahdan",
        "last_name": "Kazlouski",
        "gender": "Male",
        "date_born": "***********",
        "nationality": "BELARUS",
        "passport_number": "*********",
        "expare_date": "*************",
        "code_phone": "44",
        "phone_number": "************",
        "email": "*******************"
    }]
}

driver = init_driver(logger, options)
centre, category, sub_category = get_centre_category_sub_category(driver)
count = 1
while True:
    try:
        logger.warning(f"try to booking {count=}")
        sleep(randint(3, 5) + random())
        centre.send_keys(options['center'])

        sleep(randint(3, 5) + random())
        category.send_keys(options['category'])

        sleep(randint(5, 8) + random())
        sub_category.send_keys(options['sub_category'])

        sleep(random())