def open_browser(self):
    """
    Launch a Chrome webdriver and store it on the `browser` attribute.

    The browser is started headless (with the GPU disabled) unless
    `self.show_browser_ui` is truthy. A progress message is printed
    first when `self.verbose` is truthy.
    """
    if self.verbose:
        print('Opening browser...')

    # Hide the browser window unless the caller asked to see it
    chrome_options = _Options()
    if not self.show_browser_ui:
        for flag in ('--headless', '--disable-gpu'):
            chrome_options.add_argument(flag)

    # Launch Chrome
    self.browser = _webdriver.Chrome(chrome_options=chrome_options)
def _get_archive_dates(self):
    """
    Read the archive's first and last dates from the archive calendar.

    Opens `self.archive_url` in a temporary Chrome session, builds an
    ArchiveCalendar with `get_dates=True`, and copies its `start_date`
    and `end_date` onto this object. The `archive_calendar` attribute is
    reset to None once the browser session has closed, and the object is
    printed when initialization completes.
    """
    print(f'Initializing calendar navigation for {self.feed_name}...')

    # Hide the browser window unless the caller asked to see it
    chrome_options = _Options()
    if not self.show_browser_ui:
        for flag in ('--headless', '--disable-gpu'):
            chrome_options.add_argument(flag)

    # Launch Chrome; the context manager closes it when the block exits
    with _webdriver.Chrome(executable_path=self.webdriver_path,
                           chrome_options=chrome_options) as browser:
        browser.get(self.archive_url)

        calendar = ArchiveCalendar(self, browser, get_dates=True)
        self.archive_calendar = calendar
        self.start_date = calendar.start_date
        self.end_date = calendar.end_date

    # The calendar wraps a browser that is now closed; drop the reference
    self.archive_calendar = None

    print('Initialization complete.\n')
    print(self)
def build(self, start=None, end=None, days_back=None, chronological=False,
          rebuild=False):
    """
    Build archive entry data for the BroadcastifyArchive's feed_id and
    populate it as a list of dictionaries in the .entries attribute.

    Parameters
    ----------
    start : datetime.date
        The earliest date for which to populate the archive. If None, go
        from the earliest date on the calendar (inclusive).
    end : datetime.date
        The latest date for which to populate the archive. If None, go to
        the latest date on the calendar (inclusive).
    days_back : int
        The number of days before the archive's end date to retrieve
        information for. A value of `0` retrieves only archive entries
        corresponding to the archive's end date. Pass either days_back
        OR a valid combination of start/end dates.
    chronological : bool
        By default, start with the latest date and work backward in time.
        If True, reverse that.
    rebuild : bool
        Specifies that existing data in the `entries` list should be
        overwritten with data newly fetched from Broadcastify.

    Raises
    ------
    ValueError
        If entries already exist and `rebuild` is False, or if
        `days_back` is passed together with `start` and/or `end`.
    TypeError
        If `days_back` is not a non-negative integer.
    AttributeError
        If `start` or `end` falls outside the archive's date range, or
        if `start` is after `end`.
    """
    # Prevent the user from unintentionally erasing existing archive info
    if self.entries and not rebuild:
        raise ValueError(
            'Archive already built: Entries already exist for'
            ' this BroadcastifyArchive. To erase and rebuild,'
            ' specify `rebuild=True` when calling .build()')

    # Make sure valid arguments were passed
    ## Either start/end or days_back; not both. `days_back=0` is a valid
    ## request, so compare against None instead of relying on truthiness.
    if (start or end) and days_back is not None:
        raise ValueError('Expected either `days_back` OR a `start`/`end` '
                         'combination. Both were passed.')

    if days_back is not None:
        ## `days_back` must be a non-negative integer
        try:
            # range() rejects anything that cannot act as an integer, so
            # non-integers fail here rather than with a cryptic error
            # once the date list is built.
            range(days_back)
            invalid_days_back = bool(days_back < 0)
        except (TypeError, ValueError):
            invalid_days_back = True

        if invalid_days_back:
            raise TypeError('`days_back` must be a non-negative integer.')

        # Capture the archive end date to count back from
        end = self.end_date

        # Make sure days_back is no larger than the archive date range size
        archive_size = (end - self.start_date).days
        if days_back > archive_size:
            _warnings.warn(
                f"The number of days_back passed ({days_back}) "
                f"exceeds the size of the archive's date range ("
                f"{archive_size}). Only valid dates will be "
                f"built.")
            days_back = archive_size
    else:
        ## Check that `start` and `end` are within the archive's start/end
        ## dates. If they weren't passed, default to the archive's dates.
        out_of_range = ''

        if start:
            if start < self.start_date:
                out_of_range = (f'start date out of archive range: '
                                f'{start} < {self.start_date}\n')
            elif start > self.end_date:
                out_of_range = (f'start date out of archive range: '
                                f'{start} > {self.end_date}\n')
        else:
            start = self.start_date

        if end:
            if end > self.end_date:
                out_of_range += (f'end date out of archive range: '
                                 f'{end} > {self.end_date}')
            elif end < self.start_date:
                out_of_range += (f'end date out of archive range: '
                                 f'{end} < {self.start_date}')
        else:
            end = self.end_date

        if out_of_range:
            raise AttributeError(out_of_range)

        ## `start` cannot be > `end`
        if start > end:
            raise AttributeError(f'`start` date ({start}) cannot be after '
                                 f'`end` date ({end}).')

        # Get size of the date range
        days_back = (end - start).days

    # Adjust for exclusive end of range()
    days_back += 1

    # Build the list of dates to scrape
    date_list = sorted(
        (end - _dt.timedelta(days=offset) for offset in range(days_back)),
        reverse=not chronological)

    archive_entries = []

    # Spin up a browser and an ArchiveCalendar
    # Set whether to show browser UI while fetching
    print('Launching webdriver...')
    options = _Options()
    if not self.show_browser_ui:
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')

    with _webdriver.Chrome(executable_path=self.webdriver_path,
                           chrome_options=options) as browser:
        browser.get(self.archive_url)
        self.arch_cal = ArchiveCalendar(self, browser)

        # Get archive entries for each date in list
        t = _tqdm(date_list, desc='Building dates', leave=True,
                  dynamic_ncols=True)
        for date in t:
            t.set_description(f'Building {date}', refresh=True)

            self.arch_cal.go_to_date(date)

            if self.arch_cal.entries_for_date:
                archive_entries.extend(self.arch_cal.entries_for_date)

    # Replace the current archive entries only after scraping succeeded,
    # storing each entry's URI, start time and end time.
    self.entries = [
        {
            'uri': entry[0],
            'start_time': entry[1],
            'end_time': entry[2]
        }
        for entry in archive_entries
    ]

    end_times = [entry['end_time'] for entry in self.entries]
    if end_times:
        self.earliest_entry = min(end_times).date()
        self.latest_entry = max(end_times).date()
    else:
        # min()/max() raise on an empty sequence; report the empty result
        # instead of crashing after the old entries were already replaced.
        _warnings.warn('No archive entries were found for the requested '
                       'dates.')
        self.earliest_entry = None
        self.latest_entry = None

    print(self)
def oauth_authenticate(self, client_id, expiration):
    """
    Authenticate with RC View single-sign-on and return an access token.

    If `self._existing_tokens` is set, its tokens are adopted and
    returned without opening a browser. Otherwise a headless Chrome
    session loads the `oauth2/authorize` page, selects the element with
    id `idp_Name`, submits `self._username` / `self._password`, reads
    the resulting authorization code, and exchanges it for tokens via a
    POST to `oauth2/token`.

    Parameters
    ----------
    client_id : str
        The OAuth client id sent with both the authorize and the token
        requests.
    expiration : int
        Currently unused: the authorize request always sends an
        `expiration` of -1.
        NOTE(review): confirm whether this value was meant to be
        forwarded before changing it.

    Returns
    -------
    The access token, which is also stored on `self._token`. The refresh
    token is stored on `self._refresh_token`.

    Raises
    ------
    selenium TimeoutException
        If any expected page element fails to appear within 10 seconds.
        The failure is reported on the spinner (when messages are
        enabled) and the browser is closed before the error propagates.
    """
    if _print_messages:
        self._spinner.text = 'Authenticating user'

    # Reuse previously issued tokens instead of driving the browser again
    if self._existing_tokens:
        self._refresh_token = self._existing_tokens['refresh_token']
        self._token = self._existing_tokens['token']
        return self._token

    parameters = {
        'client_id': client_id,
        'response_type': 'code',
        'expiration': -1,
        'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob'
    }
    url = self.baseurl + 'oauth2/authorize'
    codeurl = "{}?{}".format(url, _urlencode(parameters))

    options = _Options()
    options.add_argument('--headless')
    options.add_argument('--log-level=3')
    driver = _webdriver.Chrome(chrome_options=options)
    delay = 10  # seconds to wait for each page element

    def wait_for(by, locator):
        # Block until the element is present in the DOM, or time out
        return _WebDriverWait(driver, delay).until(
            _EC.presence_of_element_located((by, locator)))

    def report_failure(message):
        if _print_messages:
            self._spinner.fail(message)

    # The finally clause guarantees the browser is closed on every exit
    # path, including timeouts and unexpected errors.
    try:
        driver.get(codeurl)

        try:
            using_redcross_element = wait_for(_By.ID, 'idp_Name')
        except _TimeoutException:
            report_failure(
                'Accessing Red Cross single-sign-on took too much time.')
            # Re-raise: continuing would fail on an unbound element
            raise
        using_redcross_element.click()

        try:
            username_element = wait_for(
                _By.XPATH,
                '/html/body/main/div[4]/div/div/div/div/div/div/div/div[1]/div/div/div/div[4]/input')
            password_element = wait_for(
                _By.XPATH,
                '/html/body/main/div[4]/div/div/div/div/div/div/div/div[1]/div/div/div/div[5]/input')
            signin_element = wait_for(
                _By.XPATH,
                '/html/body/main/div[4]/div/div/div/div/div/div/div/div[1]/div/div/div/div[6]/button')
        except _TimeoutException:
            report_failure(
                'Accessing Red Cross single-sign-on took too much time.')
            raise
        username_element.send_keys(self._username)
        password_element.send_keys(self._password)
        signin_element.click()

        try:
            code_element = wait_for(_By.ID, 'code')
        except _TimeoutException:
            report_failure(
                'Receiving an authentication code took too much time.')
            raise
        code = code_element.get_attribute('value')
    finally:
        driver.quit()

    # Exchange the authorization code for access and refresh tokens
    parameters = {
        'client_id': client_id,
        'grant_type': 'authorization_code',
        'code': code,
        'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob'
    }
    token_info = self.post('oauth2/token', parameters, ssl=True,
                           add_token=False)
    self._refresh_token = token_info['refresh_token']
    self._token = token_info['access_token']
    return self._token