from typing import Any, Dict, List, Optional, Set, Tuple

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def scrape_followers(
        driver: webdriver.Chrome,
        username: str,
        cookies: Optional[List[Dict[str, Any]]] = None
) -> Tuple[str, str, Set[str], Set[str]]:
    # CSS selector for the account links in the followers/following dialogs
    list_css: str = "div[role='dialog'] a.notranslate"

    if cookies:
        # Cookies can only be set for the currently loaded domain,
        # so load a lightweight page before adding them
        driver.get("https://www.instagram.com/data/manifest.json")
        for cookie in cookies:
            driver.add_cookie(cookie)

    # Load the account page and read the follower/following counters
    driver.get(f"https://www.instagram.com/{username}/")
    num_followers: str = driver.find_element(
        By.CSS_SELECTOR, "a[href*='followers'] span").text
    num_following: str = driver.find_element(
        By.CSS_SELECTOR, "a[href*='following'] span").text

    # Click the 'Followers' link and wait for the dialog to render
    driver.find_element(By.PARTIAL_LINK_TEXT, "followers").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling magic here (only the first batch is in the DOM so far)
    _followers = driver.find_elements(By.CSS_SELECTOR, list_css)
    followers: Set[str] = {i.text for i in _followers}
    driver.find_element(
        By.CSS_SELECTOR,
        "div[role='dialog'] button span[aria-label='Close']").click()

    # Click the 'Following' link and wait for the dialog to render
    driver.find_element(By.PARTIAL_LINK_TEXT, "following").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling magic here
    _following = driver.find_elements(By.CSS_SELECTOR, list_css)
    following: Set[str] = {i.text for i in _following}

    return num_followers, num_following, followers, following
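
# Usage sketch (assumptions: cookies were exported from a previous logged-in
# session via pickle; the file name "insta_cookies.pkl" is hypothetical).
if __name__ == "__main__":
    import pickle

    with open("insta_cookies.pkl", "rb") as f:
        saved_cookies = pickle.load(f)

    chrome = webdriver.Chrome()
    try:
        n_followers, n_following, followers, following = scrape_followers(
            chrome, "instagram", cookies=saved_cookies)
        print(f"{n_followers} followers / {n_following} following")
        # Accounts this user follows that do not follow back
        print(following - followers)
    finally:
        chrome.quit()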
# Excerpt: AuthenticatedStrategy.run (linkedin-jobs-scraper). The method is
# shown without its enclosing class; names such as Selectors, Events, Config,
# EventData and the debug/info/warn/error loggers come from the surrounding
# package.
def run(self, driver: webdriver, search_url: str, query: Query, location: str) -> None:
    """
    Run strategy
    :param driver: webdriver
    :param search_url: str
    :param query: Query
    :param location: str
    :return: None
    """

    tag = f'[{query.query}][{location}]'
    processed = 0
    pagination_index = 1

    # Open main page first to verify/set the session
    debug(tag, f'Opening {HOME_URL}')
    driver.get(HOME_URL)
    sleep(self.scraper.slow_mo)

    if not AuthenticatedStrategy.__is_authenticated_session(driver):
        info(tag, 'Setting authentication cookie')
        try:
            driver.add_cookie({
                'name': 'li_at',
                'value': Config.LI_AT_COOKIE,
                'domain': '.www.linkedin.com'
            })
        except BaseException as e:
            error(tag, e)
            error(tag, traceback.format_exc())
            return

    # Open search url
    info(tag, f'Opening {search_url}')
    driver.get(search_url)
    sleep(self.scraper.slow_mo)

    # Verify session
    if not AuthenticatedStrategy.__is_authenticated_session(driver):
        message = 'The provided session cookie is invalid. ' \
                  'Check the documentation on how to obtain a valid session cookie.'
        raise InvalidCookieException(message)

    # Wait for the jobs container
    try:
        WebDriverWait(driver, 5).until(
            ec.presence_of_element_located((By.CSS_SELECTOR, Selectors.container)))
    except BaseException:
        warn(tag, 'No jobs found, skip')
        return

    # Pagination loop
    while processed < query.options.limit:
        # Verify session in loop
        if not AuthenticatedStrategy.__is_authenticated_session(driver):
            warn(tag, 'Session is no longer valid, this may cause the scraper to fail')
            self.scraper.emit(Events.INVALID_SESSION)
        else:
            info(tag, 'Session is valid')

        AuthenticatedStrategy.__accept_cookies(driver, tag)
        AuthenticatedStrategy.__close_chat_panel(driver, tag)

        job_index = 0
        job_tot = driver.execute_script(
            'return document.querySelectorAll(arguments[0]).length;', Selectors.jobs)

        if job_tot == 0:
            info(tag, 'No jobs found, skip')
            break

        info(tag, f'Found {job_tot} jobs')

        # Jobs loop
        while job_index < job_tot and processed < query.options.limit:
            sleep(self.scraper.slow_mo)
            tag = f'[{query.query}][{location}][{processed + 1}]'

            # Extract job main fields
            debug(tag, 'Evaluating selectors', [
                Selectors.jobs,
                Selectors.links,
                Selectors.companies,
                Selectors.places,
                Selectors.dates])

            try:
                job_id, job_link, job_title, job_company, job_place, job_date = \
                    driver.execute_script(
                        '''
                            const index = arguments[0];
                            const job = document.querySelectorAll(arguments[1])[index];
                            const link = job.querySelector(arguments[2]);

                            // Click job link and scroll
                            link.scrollIntoView();
                            link.click();

                            const linkUrl = link.getAttribute("href");
                            const jobId = job.getAttribute("data-job-id");

                            const title = job.querySelector(arguments[3]) ?
                                job.querySelector(arguments[3]).innerText : "";

                            const company = job.querySelector(arguments[4]) ?
                                job.querySelector(arguments[4]).innerText : "";

                            const place = job.querySelector(arguments[5]) ?
                                job.querySelector(arguments[5]).innerText : "";

                            const date = job.querySelector(arguments[6]) ?
                                job.querySelector(arguments[6]).getAttribute('datetime') : "";

                            return [jobId, linkUrl, title, company, place, date];
                        ''',
                        job_index,
                        Selectors.jobs,
                        Selectors.links,
                        Selectors.title,
                        Selectors.companies,
                        Selectors.places,
                        Selectors.dates)

                job_title = normalize_spaces(job_title)
                job_company = normalize_spaces(job_company)
                job_place = normalize_spaces(job_place)

                # Join with base location if link is relative
                job_link = urljoin(get_location(driver.current_url), job_link)

                sleep(self.scraper.slow_mo)

                # Wait for job details to load
                debug(tag, f'Loading details job {job_id}')
                load_result = AuthenticatedStrategy.__load_job_details(driver, job_id)

                if not load_result['success']:
                    error(tag, load_result['error'])
                    job_index += 1
                    continue

                # Extract description
                debug(tag, 'Evaluating selectors', [Selectors.description])
                job_description, job_description_html = driver.execute_script(
                    '''
                        const el = document.querySelector(arguments[0]);
                        return [el.innerText, el.outerHTML];
                    ''',
                    Selectors.description)

                # Extract company size from the details panel
                company_size = driver.execute_script(
                    '''
                        const panel = document.querySelector(arguments[0]);
                        const company = panel.querySelector(arguments[1]);
                        return company.querySelector(arguments[2]).innerText;
                    ''',
                    Selectors.detailsPanel,
                    '.jobs-details-job-summary__section--center',
                    '.jobs-details-job-summary__text--ellipsis')

                # TODO: how to extract apply link?

                # Extract criteria
                debug(tag, 'Evaluating selectors', [Selectors.criteria])
                job_seniority_level, job_employment_type, job_industries, job_function = \
                    driver.execute_script(
                        r'''
                            const nodes = document.querySelectorAll(arguments[0]);

                            const criteria = [
                                "Seniority Level",
                                "Employment Type",
                                "Industry",
                                "Job Functions",
                            ];

                            return criteria.map(c => {
                                const el = Array.from(nodes)
                                    .find(node => node.innerText.trim() === c);

                                if (el && el.nextElementSibling) {
                                    return el.nextElementSibling.innerText
                                        .replace(/[\s]{2,}/g, ", ")
                                        .replace(/[\n\r]+/g, " ")
                                        .trim();
                                }

                                return "";
                            });
                        ''',
                        Selectors.criteria)
            except BaseException as e:
                # Verify session on error
                if not AuthenticatedStrategy.__is_authenticated_session(driver):
                    warn(tag, 'Session is no longer valid, this may cause the scraper to fail')
                    self.scraper.emit(Events.INVALID_SESSION)

                error(tag, e, traceback.format_exc())
                self.scraper.emit(Events.ERROR, str(e) + '\n' + traceback.format_exc())
                job_index += 1
                continue

            data = EventData(
                query=query.query,
                location=location,
                job_id=job_id,
                job_index=job_index,
                title=job_title,
                company=job_company,
                place=job_place,
                date=job_date,
                link=job_link,
                apply_link='',
                description=job_description,
                description_html=job_description_html,
                seniority_level=job_seniority_level,
                job_function=job_function,
                employment_type=job_employment_type,
                industries=job_industries,
                company_size=company_size)

            info(tag, 'Processed')

            job_index += 1
            processed += 1
            self.scraper.emit(Events.DATA, data)

            # Try fetching more jobs on the current page
            if processed < query.options.limit and job_index == job_tot:
                job_tot = driver.execute_script(
                    'return document.querySelectorAll(arguments[0]).length;', Selectors.jobs)

        # Check if we reached the limit of jobs to process
        if processed == query.options.limit:
            break

        # Try to paginate
        pagination_index += 1
        info(tag, f'Pagination requested ({pagination_index})')
        paginate_result = AuthenticatedStrategy.__paginate(driver, pagination_index)

        if not paginate_result['success']:
            info(tag, "Couldn't find more jobs for the running query")
            return
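
# The private helpers referenced above (__load_job_details, __paginate,
# __accept_cookies, ...) share a small result-dict convention: they return
# {'success': bool} plus an optional 'error' key. Below is a minimal sketch of
# __load_job_details under that assumption; the polling loop, timeout and
# readiness check are illustrative, not necessarily the package's actual code.
@staticmethod
def __load_job_details(driver: webdriver, job_id: str, timeout: float = 5.0) -> dict:
    elapsed = 0.0
    sleep_time = 0.05  # Poll every 50 ms

    while elapsed < timeout:
        # Consider the details pane loaded once the description has text
        loaded = driver.execute_script(
            '''
                const description = document.querySelector(arguments[0]);
                return !!(description && description.innerText.length > 0);
            ''',
            Selectors.description)

        if loaded:
            return {'success': True}

        sleep(sleep_time)
        elapsed += sleep_time

    return {'success': False, 'error': f'Timeout on loading job details (job {job_id})'}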
# Excerpt: a later revision of AuthenticatedStrategy.run, which adds CDP-based
# tab management, apply-link extraction and per-query metrics.
def run(self, driver: webdriver, cdp: CDP, search_url: str, query: Query,
        location: str, apply_link: bool) -> None:
    """
    Run strategy
    :param driver: webdriver
    :param cdp: CDP
    :param search_url: str
    :param query: Query
    :param location: str
    :param apply_link: bool
    :return: None
    """

    tag = f'[{query.query}][{location}]'
    metrics = EventMetrics()

    pagination_index = 0
    pagination_size = 25

    # Open main page first to verify/set the session
    debug(tag, f'Opening {HOME_URL}')
    driver.get(HOME_URL)
    sleep(self.scraper.slow_mo)

    if not AuthenticatedStrategy.__is_authenticated_session(driver):
        info(tag, 'Setting authentication cookie')
        try:
            driver.add_cookie({
                'name': 'li_at',
                'value': Config.LI_AT_COOKIE,
                'domain': '.www.linkedin.com'
            })
        except BaseException as e:
            error(tag, e)
            error(tag, traceback.format_exc())
            return

    # Open search url
    info(tag, f'Opening {search_url}')
    driver.get(search_url)
    sleep(self.scraper.slow_mo)

    # Verify session
    if not AuthenticatedStrategy.__is_authenticated_session(driver):
        message = 'The provided session cookie is invalid. ' \
                  'Check the documentation on how to obtain a valid session cookie.'
        raise InvalidCookieException(message)

    # Wait for the jobs container
    try:
        WebDriverWait(driver, 5).until(
            ec.presence_of_element_located((By.CSS_SELECTOR, Selectors.container)))
    except BaseException:
        warn(tag, 'No jobs found, skip')
        return

    # Pagination loop
    while metrics.processed < query.options.limit:
        # Verify session in loop
        if not AuthenticatedStrategy.__is_authenticated_session(driver):
            warn(tag, 'Session is no longer valid, this may cause the scraper to fail')
            self.scraper.emit(Events.INVALID_SESSION)
        else:
            info(tag, 'Session is valid')

        AuthenticatedStrategy.__accept_cookies(driver, tag)
        AuthenticatedStrategy.__close_chat_panel(driver, tag)
        AuthenticatedStrategy.__accept_privacy(driver, tag)

        job_index = 0
        job_tot = driver.execute_script(
            'return document.querySelectorAll(arguments[0]).length;', Selectors.jobs)

        if job_tot == 0:
            info(tag, 'No jobs found, skip')
            break

        # Jobs loop
        while job_index < job_tot and metrics.processed < query.options.limit:
            sleep(self.scraper.slow_mo)
            tag = f'[{query.query}][{location}][{pagination_index * pagination_size + job_index + 1}]'

            # Try to recover focus to the main page in case unwanted tabs are
            # still open (generally caused by an apply-link click)
            if len(driver.window_handles) > 1:
                debug(tag, 'Try closing unwanted targets')
                try:
                    targets_result = cdp.get_targets()

                    # Try to close other unwanted tabs (targets)
                    if targets_result['success']:
                        for target in targets_result['result'].targets:
                            if 'linkedin.com/jobs' not in target.url:
                                debug(tag, f'Closing target {target.url}')
                                cdp.close_target(target.targetId)
                finally:
                    driver.switch_to.window(driver.window_handles[0])
                    debug(tag, 'Switched to main handle')

            try:
                # Extract job main fields
                debug(tag, 'Evaluating selectors', [
                    Selectors.jobs,
                    Selectors.link,
                    Selectors.company,
                    Selectors.place,
                    Selectors.date])

                job_id, job_link, job_title, job_company, job_company_link, \
                    job_company_img_link, job_place, job_date = \
                    driver.execute_script(
                        '''
                            const index = arguments[0];
                            const job = document.querySelectorAll(arguments[1])[index];
                            const link = job.querySelector(arguments[2]);

                            // Click job link and scroll
                            link.scrollIntoView();
                            link.click();

                            const linkUrl = link.getAttribute("href");
                            const jobId = job.getAttribute("data-job-id");

                            const title = job.querySelector(arguments[3]) ?
                                job.querySelector(arguments[3]).innerText : "";

                            let company = "";
                            let companyLink = "";
                            const companyElem = job.querySelector(arguments[4]);

                            if (companyElem) {
                                company = companyElem.innerText;
                                const protocol = window.location.protocol + "//";
                                const host = window.location.host;
                                companyLink = `${protocol}${host}${companyElem.getAttribute("href")}`;
                            }

                            const companyImgLink = job.querySelector("img") ?
                                job.querySelector("img").getAttribute("src") : "";

                            const place = job.querySelector(arguments[5]) ?
                                job.querySelector(arguments[5]).innerText : "";

                            const date = job.querySelector(arguments[6]) ?
                                job.querySelector(arguments[6]).getAttribute("datetime") : "";

                            return [
                                jobId,
                                linkUrl,
                                title,
                                company,
                                companyLink,
                                companyImgLink,
                                place,
                                date,
                            ];
                        ''',
                        job_index,
                        Selectors.jobs,
                        Selectors.link,
                        Selectors.title,
                        Selectors.company_link,
                        Selectors.place,
                        Selectors.date)

                job_title = normalize_spaces(job_title)
                job_company = normalize_spaces(job_company)
                job_place = normalize_spaces(job_place)

                # Join with base location if link is relative
                job_link = urljoin(get_location(driver.current_url), job_link)

                sleep(self.scraper.slow_mo)

                # Wait for job details to load
                debug(tag, f'Loading details job {job_id}')
                load_result = AuthenticatedStrategy.__load_job_details(driver, job_id)

                if not load_result['success']:
                    error(tag, load_result['error'], exc_info=False)
                    info(tag, 'Skipped')
                    job_index += 1
                    metrics.failed += 1
                    continue

                # Extract description
                debug(tag, 'Evaluating selectors', [Selectors.description])
                job_description, job_description_html = driver.execute_script(
                    '''
                        const el = document.querySelector(arguments[0]);
                        return [el.innerText, el.outerHTML];
                    ''',
                    Selectors.description)

                # Extract insights
                debug(tag, 'Evaluating selectors', [Selectors.insights])
                job_insights = driver.execute_script(
                    r'''
                        const nodes = document.querySelectorAll(arguments[0]);
                        return Array.from(nodes)
                            .map(e => e.textContent.replace(/[\n\r\t ]+/g, ' ').trim());
                    ''',
                    Selectors.insights)

                # Extract apply link
                job_apply_link = ''

                if apply_link:
                    try:
                        debug(tag, 'Evaluating selectors', [Selectors.applyBtn])
                        driver.execute_script(
                            '''
                                const applyBtn = document.querySelector(arguments[0]);

                                if (applyBtn) {
                                    applyBtn.click();
                                    return true;
                                }

                                return false;
                            ''',
                            Selectors.applyBtn)

                        if len(driver.window_handles) > 1:
                            debug(tag, 'Try extracting apply link')
                            targets_result = cdp.get_targets()

                            if targets_result['success']:
                                # The first non-attached target should be the apply page
                                apply_target = next(
                                    (e for e in targets_result['result'].targets if not e.attached),
                                    None)

                                if apply_target:
                                    job_apply_link = apply_target.url
                                    cdp.close_target(apply_target.targetId)
                            else:
                                warn(tag, 'Failed to extract apply link', targets_result['error'])
                    except BaseException as e:
                        warn(tag, 'Failed to extract apply link', e)

                data = EventData(
                    query=query.query,
                    location=location,
                    job_id=job_id,
                    job_index=job_index,
                    title=job_title,
                    company=job_company,
                    company_link=job_company_link,
                    company_img_link=job_company_img_link,
                    place=job_place,
                    date=job_date,
                    link=job_link,
                    apply_link=job_apply_link,
                    description=job_description,
                    description_html=job_description_html,
                    insights=job_insights)

                info(tag, 'Processed')

                job_index += 1
                metrics.processed += 1
                self.scraper.emit(Events.DATA, data)

                # Try fetching more jobs on the current page
                if metrics.processed < query.options.limit and job_index == job_tot < pagination_size:
                    load_jobs_result = AuthenticatedStrategy.__load_jobs(driver, job_tot)

                    if load_jobs_result['success']:
                        job_tot = load_jobs_result['count']

                if job_index == job_tot:
                    break
            except BaseException as e:
                try:
                    # Verify session on error
                    if not AuthenticatedStrategy.__is_authenticated_session(driver):
                        warn(tag, 'Session is no longer valid, this may cause the scraper to fail')
                        self.scraper.emit(Events.INVALID_SESSION)

                    error(tag, e, traceback.format_exc())
                    self.scraper.emit(Events.ERROR, str(e) + '\n' + traceback.format_exc())
                finally:
                    info(tag, 'Skipped')
                    job_index += 1
                    metrics.failed += 1

                continue

        tag = f'[{query.query}][{location}]'
        info(tag, 'No more jobs to process in this page')

        # Check if we reached the limit of jobs to process
        if metrics.processed == query.options.limit:
            info(tag, 'Query limit reached!')
            info(tag, 'Metrics:', str(metrics))
            self.scraper.emit(Events.METRICS, metrics)
            break
        else:
            metrics.missed += pagination_size - job_index
            info(tag, 'Metrics:', str(metrics))
            self.scraper.emit(Events.METRICS, metrics)

        # Try to paginate
        pagination_index += 1
        info(tag, f'Pagination requested [{pagination_index}]')

        offset = pagination_index * pagination_size
        paginate_result = AuthenticatedStrategy.__paginate(driver, search_url, tag, offset)

        if not paginate_result['success']:
            info(tag, "Couldn't find more jobs for the running query")
            return
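
# Consumer-side sketch: the strategy above communicates only through
# self.scraper.emit(...), so a client registers handlers for those events.
# This assumes the public linkedin-jobs-scraper package layout; constructor
# arguments and option values below are illustrative, and a valid li_at
# session cookie is expected in the LI_AT_COOKIE environment variable.
if __name__ == '__main__':
    from linkedin_jobs_scraper import LinkedinScraper
    from linkedin_jobs_scraper.events import Events, EventData, EventMetrics
    from linkedin_jobs_scraper.query import Query, QueryOptions

    def on_data(data: EventData):
        print('[DATA]', data.title, data.company, data.place, data.date, data.link)

    def on_metrics(metrics: EventMetrics):
        print('[METRICS]', str(metrics))

    def on_error(err):
        print('[ERROR]', err)

    scraper = LinkedinScraper(headless=True, max_workers=1, slow_mo=1.3)

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.METRICS, on_metrics)
    scraper.on(Events.ERROR, on_error)

    scraper.run([
        Query(
            query='Python Developer',
            options=QueryOptions(locations=['United States'], limit=10, apply_link=True))
    ])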