def download_url_report(self, report_url, write_to_disk=None, **kwargs):
    """
    Downloads a Summit Learning report at a URL that triggers a CSV download.

    Args:
        report_url (string): Information pertaining to the path and query
            string for the report whose access is desired. Any filtering
            that can be done with a stateful URL should be included.
        write_to_disk (string): The path for a directory to store the
            downloaded file. If nothing is provided, the file will be
            stored in a temporary directory and deleted at the end of this
            function.
        **kwargs: additional arguments to pass to Pandas read_csv

    Returns:
        A Pandas DataFrame of the report contents.

    Raises:
        NoDataError: if the downloaded report has no rows.
    """
    report_download_url = interpret_report_url(self.base_url, report_url)

    if write_to_disk:
        csv_download_folder_path = write_to_disk
    else:
        csv_download_folder_path = mkdtemp()

    try:
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()

            self.log.debug(
                'Getting report page at: {}'.format(report_download_url))
            # visiting the URL is what triggers the CSV download
            self.driver.get(report_download_url)
            self.log.debug(
                'Starting download of: {}'.format(report_download_url))

            wait_for_any_file_in_folder(csv_download_folder_path, "csv")
            self.log.debug('Download Finished.')

            df_report = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                **kwargs)
        finally:
            # always release the browser, even when the download fails
            self.driver.close()
    finally:
        # only clean up the folder if we created it ourselves
        if not write_to_disk:
            shutil.rmtree(csv_download_folder_path)

    # if the dataframe is empty (the report had no data), raise an error
    if df_report.shape[0] == 0:
        raise NoDataError(
            'No data in report for user {} at url: {}'.format(
                self.username, report_download_url))

    return df_report
def download_url_report(self, report_url, temp_folder_name):
    """
    Downloads a MealTime report.

    Args:
        report_url (string): Information pertaining to the path and query
            string for the report whose access is desired. Any filtering
            that can be done with a stateful URL should be included.
        temp_folder_name (string): The name of the folder (under
            ``self.temp_folder_path``) in which this specific report's
            download files should be stored.

    Returns:
        A Pandas DataFrame of the report contents.

    Raises:
        ValueError: if the downloaded report has no rows.
    """
    csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
    report_page_url = interpret_report_url(self.base_url, report_url)

    # set up the driver for execution
    self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                             self.headless)
    try:
        self._login()

        # get the report url
        self.driver.get(report_page_url)

        # select the download format and execute; prefer CSV and fall back
        # to headerless Excel when the CSV option is not offered
        export_format_select = Select(self.driver.find_element_by_id(
            'ctl00_ctl00_MainContent_reportViewer_ctl01_ctl05_ctl00'))
        try:
            export_format_select.select_by_value('CSV')
            dl_type = 'csv'
        except NoSuchElementException:
            export_format_select.select_by_value('EXCELNoHeader')
            dl_type = 'xls'
        self.driver.find_element_by_id(
            'ctl00_ctl00_MainContent_reportViewer_ctl01_ctl05_ctl01').click()

        # wait until file has downloaded to close the browser. We can do
        # this because we delete the file before we return it, so the temp
        # dir should always be empty when this command is run
        wait_for_any_file_in_folder(csv_download_folder_path, dl_type)

        # skip the header rows that precede the actual column names
        downloaded_file = get_most_recent_file_in_dir(
            csv_download_folder_path)
        if dl_type == 'csv':
            report_df = pd.read_csv(downloaded_file, header=2)
        else:
            report_df = pd.read_excel(downloaded_file, header=3)

        # delete any files in the mealtime temp folder; we don't need them
        # now
        # TODO: move this out of this function. It should happen as cleanup
        # once the whole DAG has completed
        delete_folder_contents(csv_download_folder_path)
    finally:
        # always release the browser, even if a step above failed
        self.driver.close()

    # if the dataframe is empty (the report had no data), raise an error
    if report_df.shape[0] == 0:
        raise ValueError('No data in report for user {} at url: {}'.format(
            self.username, report_page_url))

    return report_df
def __navigate_to_custom_report(self, report_name, school_year,
                                download_folder_path=None):
    """
    Navigate to the page of the custom report tool that has the custom
    report on it.

    A new driver is created, logged in and left open on the matching page;
    this method does not close it.

    Args:
        report_name (string): Text to look for in the rows of the custom
            reports table (substring match).
        school_year (string): The school year passed to ``_set_year``.
        download_folder_path (string): Download location handed to the
            driver. Defaults to ``self.temp_folder_path``.

    Returns:
        The index (starting at 0) of the pagination page on which the
        report row was found.

    Raises:
        ReportNotFound: if no page contains a row matching ``report_name``.
    """
    if not download_folder_path:
        download_folder_path = self.temp_folder_path

    self.driver = DriverBuilder().get_driver(
        download_location=download_folder_path, headless=self.headless)
    self._login()
    self._set_year(school_year, self.driver)

    # get the custom reports page
    custom_reports_url = 'report/customReports'
    self.driver.get(interpret_report_url(self.base_url, custom_reports_url))
    self.__remove_walk_me_and_support()

    # wait for the page to load and get the maximum number of pages
    # NOTE(review): data-page looks zero-based, hence the + 1 -- confirm
    total_num_pages_xpath = (
        '//*[@id="content"]//*[@class="pagination "]/li[@data-page][last()]')
    last_page_elem = WebDriverWait(self.driver, self.wait_time).until(
        EC.presence_of_element_located((By.XPATH, total_num_pages_xpath)))
    num_pages = int(last_page_elem.get_attribute("data-page")) + 1

    # the row locator does not change between pages, so build it once
    report_name_xpath = "//tr[td//text()[contains(., '{}')]]".format(
        report_name)

    for current_page in range(num_pages):
        try:
            self.driver.find_element_by_xpath(report_name_xpath)
        except NoSuchElementException:
            next_page = current_page + 1
            if next_page < num_pages:
                next_page_xpath = (
                    '//*[@id="content"]//*[@class="pagination "]'
                    '/li[@data-page={}]/a'.format(next_page))
                self.driver.find_element_by_xpath(next_page_xpath).click()
                # scroll back to the top of the page, prevents selenium
                # clicking errors
                self.driver.execute_script("window.scrollTo(0, 0);")
        else:
            return current_page

    raise ReportNotFound(
        "Custom report '{}' was not found on any of the {} custom report "
        "pages.".format(report_name, num_pages))
def download_url_report(self, report_url, write_to_disk=None, **kwargs):
    """
    Downloads a Lexia report at a URL for a page with an 'export' button.

    Args:
        report_url (string): Information pertaining to the path and query
            string for the report whose access is desired. Any filtering
            that can be done with a stateful URL should be included.
        write_to_disk (string): The path for a directory to store the
            downloaded file. If nothing is provided, the file will be
            stored in a temporary directory and deleted at the end of this
            function.
        **kwargs: additional arguments to pass to Pandas read_excel (or to
            ``download_manage_tab_report`` when the URL is a manage tab
            report).

    Returns:
        A Pandas DataFrame of the report contents.

    Raises:
        ValueError: if the downloaded report has no rows.
    """
    report_download_url = interpret_report_url(self.base_url, report_url)

    # if user is trying to download a manage tab report (for convenience)
    if '/mylexiaweb/app/index.html#/groups/' in report_download_url:
        return self.download_manage_tab_report(report_url, write_to_disk,
                                               **kwargs)

    if write_to_disk:
        csv_download_folder_path = write_to_disk
    else:
        csv_download_folder_path = mkdtemp()

    try:
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()

            self.log.debug(
                'Getting report page at: {}'.format(report_download_url))
            self.driver.get(report_download_url)

            # find and click the download button
            export_button = WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//button[contains(text(), 'Export')]")))
            self.log.debug(
                'Starting download of: {}'.format(report_download_url))
            export_button.click()

            wait_for_any_file_in_folder(csv_download_folder_path, "xlsx")
            self.log.debug('Downloada Finished.')

            df_report = pd.read_excel(
                get_most_recent_file_in_dir(csv_download_folder_path),
                **kwargs)
        finally:
            # always release the browser, even when the download fails
            self.driver.close()
    finally:
        # only clean up the folder if we created it ourselves
        if not write_to_disk:
            shutil.rmtree(csv_download_folder_path)

    # if the dataframe is empty (the report had no data), raise an error
    if df_report.shape[0] == 0:
        raise ValueError('No data in report for user {} at url: {}'.format(
            self.username, report_download_url))

    return df_report
def download_manage_tab_report(self, report_url, write_to_disk=None,
                               **kwargs):
    """
    Downloads a Lexia report from the 'Manage' tab.

    Args:
        report_url (string): Information pertaining to the path and query
            string for the report whose access is desired. Any filtering
            that can be done with a stateful URL should be included.
        write_to_disk (string): The path for a directory to store the
            downloaded file. If nothing is provided, the file will be
            stored in a temporary directory and deleted at the end of this
            function.
        **kwargs: additional arguments to pass to Pandas read_csv

    Returns:
        A Pandas DataFrame of the report contents.

    Raises:
        ValueError: if the downloaded report has no rows.
    """
    report_download_url = interpret_report_url(self.base_url, report_url)

    if write_to_disk:
        csv_download_folder_path = write_to_disk
    else:
        csv_download_folder_path = mkdtemp()

    def check_for_export_button_enabled(driver, elem_select_all_locator,
                                        elem_export_locator):
        """Tick 'select all' and return the export button once usable.

        Returns False while the page is not ready so that WebDriverWait
        keeps polling.
        """
        elem_select_all = driver.find_element(*elem_select_all_locator)
        if not elem_select_all.is_enabled():
            return False
        elem_select_all.click()
        if not elem_select_all.is_selected():
            return False
        elem_export = driver.find_element(*elem_export_locator)
        if elem_export.is_enabled() and elem_export.is_displayed():
            return elem_export
        return False

    try:
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()

            self.log.debug(
                'Getting report page at: {}'.format(report_download_url))
            self.driver.get(report_download_url)

            # select all users and find the download button; `until`
            # expects a callable taking the driver, hence the lambda
            elem_export = WebDriverWait(self.driver, self.wait_time).until(
                lambda driver: check_for_export_button_enabled(
                    driver,
                    (By.NAME, "lexia-select-all"),
                    (By.XPATH, "//button[contains(text(), 'Export')]")))

            self.log.debug(
                'Starting download of: {}'.format(report_download_url))
            elem_export.click()

            wait_for_any_file_in_folder(csv_download_folder_path, "xls")
            self.log.debug('Download Finished.')

            # the file is waited for as "xls" but parsed as tab-separated
            # text
            df_report = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                sep='\t', **kwargs)
        finally:
            # always release the browser, even when the download fails
            self.driver.close()
    finally:
        # only clean up the folder if we created it ourselves
        if not write_to_disk:
            shutil.rmtree(csv_download_folder_path)

    # if the dataframe is empty (the report had no data), raise an error
    if df_report.shape[0] == 0:
        raise ValueError('No data in report for user {} at url: {}'.format(
            self.username, report_download_url))

    return df_report
def download_url_report(self, report_url, school_year, temp_folder_name=None,
                        pandas_read_csv_kwargs=None):
    """
    Downloads a SchoolMint data-stream-table report.

    Args:
        report_url (string): Information pertaining to the path and query
            string for the report whose access is desired. Any filtering
            that can be done with a stateful URL should be included.
        school_year (string): The SchoolMint school year to download from
            (e.g. '2018-2019')
        temp_folder_name (string): The name for a sub-directory (under
            ``self.temp_folder_path``) in which the files from the browser
            will be temporarily stored. If omitted, a fresh temporary
            directory is created there instead.
            NOTE: This sub-directory will be deleted by this function.
        pandas_read_csv_kwargs (dict): additional arguments to pass to
            Pandas read_csv

    Returns:
        A Pandas DataFrame of the report contents.

    Raises:
        ReportNotFound: if ``check_school_year`` fails for ``school_year``.
        TimeoutError: if the report data summary keeps changing for
            roughly ``self.wait_time`` seconds.
        ValueError: if the downloaded report has no rows.
    """
    # avoid a shared mutable default argument
    if pandas_read_csv_kwargs is None:
        pandas_read_csv_kwargs = {}

    report_page_url = interpret_report_url(self.base_url, report_url)

    if temp_folder_name:
        csv_download_folder_path = (
            self.temp_folder_path + '/' + temp_folder_name)
    else:
        csv_download_folder_path = mkdtemp(dir=self.temp_folder_path)

    try:
        # set up the driver for execution
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()
            self._set_year(school_year, self.driver)

            # get the report url
            self.driver.get(report_page_url)
            self.__remove_walk_me_and_support()

            # wait until we have rows in the stream data table before
            # starting to look for results
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//*[@id='stream-table']/tbody/tr[1]/td[1]")))

            if not self.check_school_year(school_year):
                raise ReportNotFound(
                    "Wrong school detected prior to clicking generate.")

            self.log.debug('Waiting for report-data-summary to load')
            # wait until the stream table is fully loaded before
            # downloading: the summary text must stay unchanged across a
            # re-check
            prev_data_summary = self.driver.find_element_by_id(
                'report-data-summary').text
            time.sleep(1)

            # we use the following count as a proxy for time elapsed, so
            # we can use the class's wait_time as the number of retries
            count = 0
            while True:
                report_data_summary = self.driver.find_element_by_id(
                    'report-data-summary').text

                # if it matches, wait a little longer and double check
                # that it hasn't changed
                if prev_data_summary == report_data_summary:
                    time.sleep(3)
                    count += 3
                    report_data_summary = self.driver.find_element_by_id(
                        'report-data-summary').text
                    if prev_data_summary == report_data_summary:
                        break

                prev_data_summary = report_data_summary
                time.sleep(1)
                count += 1
                if count >= self.wait_time:
                    raise TimeoutError(
                        'SchoolMint report data did not fully load within '
                        '%d seconds' % self.wait_time)

            # click the button to download the report
            self.log.debug('Starting download...')
            self.driver.find_element_by_class_name("export-table").click()

            wait_for_any_file_in_folder(csv_download_folder_path, "csv")
            self.log.debug('Download finished.')

            report_df = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                encoding=SCHOOLMINT_DEFAULT_EXPORT_ENCODING,
                **pandas_read_csv_kwargs)
        finally:
            # close the driver for this task, even on failure
            self.driver.close()
    finally:
        # TODO: move this out of this function. It should happen as cleanup
        # once the whole DAG has completed
        # Remove the download folder exactly once; ignore_errors covers the
        # case where the folder was never created.
        shutil.rmtree(csv_download_folder_path, ignore_errors=True)

    # if the dataframe is empty (the report had no data), raise an error
    if report_df.shape[0] == 0:
        raise ValueError('No data in report for user {} at url: {}'.format(
            self.username, report_page_url))

    return report_df
def download_url_report(self, report_url, temp_folder_name):
    """
    Downloads an Informed K12 report.

    The whole browser session is retried (up to 9 extra attempts) whenever
    a Selenium ``WebDriverException`` is raised along the way.

    Args:
        report_url (string): Information pertaining to the path and query
            string for the report whose access is desired. Any filtering
            that can be done with a stateful URL should be included.
        temp_folder_name (string): The name of the folder (under
            ``self.temp_folder_path``) in which this specific report's
            download files should be stored.

    Returns:
        A Pandas DataFrame of the report contents.

    Raises:
        ValueError: if the report page says there are no submissions.
        WebDriverException: if every attempt fails.
    """
    csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
    report_page_url = interpret_report_url(self.base_url, report_url)
    max_retries = 9
    retries = 0

    while True:
        driver_started = False
        try:
            # set up the driver for execution
            self.driver = configure_selenium_chrome(csv_download_folder_path)
            driver_started = True
            self._login()
            time.sleep(2)

            # get the report url
            self.driver.get(report_page_url)

            # check to see if there are no submissions. If so, abort by
            # exception
            try:
                self.driver.find_element_by_xpath(
                    "//h2[contains(text(), 'No submissions')]")
            except NoSuchElementException:
                # We actually don't want to find this.
                pass
            else:
                raise ValueError(
                    'No data in report for user {} at url: {}'.format(
                        self.username, report_page_url))

            # wait until we have rows in the responses data table before
            # starting to look for results, then tick 'select all'
            select_all_checkbox = WebDriverWait(
                self.driver, self.wait_time).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "//*[@class='responses-table']/table/thead/tr[1]/*[@class='checkboxes']/input"
                    )))
            select_all_checkbox.click()

            # check to see if a new link populates to 'select all filtered
            # submissions' (happens if more than 50 submissions)
            try:
                select_filtered_link = self.driver.find_element_by_xpath(
                    "//*[@class='responses-bulk-actions']/*[@class='select-link']"
                )
                select_filtered_link.click()
            except NoSuchElementException:
                # the link is simply absent for small result sets
                pass

            # click download
            self.driver.find_element_by_xpath(
                "//*[contains(text(), 'Download') and @class='hidden-xs']"
            ).click()

            # click 'As a spreadsheet'
            self.driver.find_element_by_xpath(
                "//*[@class='dropdown-menu dropdown-menu-right']//*[contains(text(), 'As a spreadsheet')]"
            ).click()

            # activate the menu that allows 'select all'; the element
            # moves while loading, so pause briefly before locating it
            time.sleep(0.5)
            columns_toggle = WebDriverWait(self.driver, self.wait_time).until(
                EC.visibility_of_element_located((
                    By.XPATH,
                    "//*[@class='dropdown-toggle']/*[contains(text(), 'columns')]/i"
                )))
            columns_toggle.click()

            # click on 'select all'
            self.driver.find_element_by_xpath(
                "//*[@class='dropdown-menu dropdown-menu-right']//*[contains(text(), 'Select all')]"
            ).click()

            # wait a moment for the info to populate
            time.sleep(2)

            # click the final download button, polling once a second
            # (up to 9 retries) until it exists
            click_retries = 0
            while True:
                try:
                    self.driver.find_element_by_xpath(
                        "//*[@class='btn btn-primary' and contains(text(), 'Download')]"
                    ).click()
                    break
                except NoSuchElementException:
                    if click_retries >= 9:
                        raise
                    time.sleep(1)
                    click_retries += 1

            # wait until file has downloaded to close the browser. We can
            # do this because we delete the file before we return it, so
            # the temp dir should always be empty when this command is run
            wait_for_any_file_in_folder(csv_download_folder_path, 'csv')
            report_df = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path))

            # delete any files in the temp folder; we don't need them now
            # TODO: move this out of this function. It should happen as
            # cleanup once the whole DAG has completed
            delete_folder_contents(csv_download_folder_path)

            return report_df
        except WebDriverException:
            # retry the whole session unless we are out of attempts
            if retries >= max_retries:
                raise
            retries += 1
        finally:
            # close the browser exactly once per attempt, whatever happened
            if driver_started:
                self.driver.close()
def download_data_shared_with_application(self, application_page_url,
                                          collection, write_to_disk=None,
                                          **kwargs):
    """
    Downloads the data shared with a particular application through Clever.

    :param application_page_url: The url for the main Clever management page
        for a particular application. For example, for My Lexia, this would
        be https://schools.clever.com/applications/lexia-mylexia
    :param collection: A string of 'schools', 'students', 'sections',
        'teachers', 'schooladmins' that indicates which shared data to
        download. Case and spaces are ignored.
    :param write_to_disk: A path to a directory where the downloaded CSV
        should be saved. If nothing is passed, a temporary directory is used
        and removed afterwards, and only a Pandas DataFrame is returned.
    :param kwargs: Additional keyword arguments to be passed to the Pandas
        read_csv function.
    :return: A Pandas DataFrame of the indicated collection download.
    :raises ReportNotFound: if ``collection`` is not one of the valid names.
    :raises ValueError: if the download has no rows (for any collection
        other than 'schooladmins', which only triggers a warning).
    """
    valid_collections = ('schools', 'students', 'sections', 'teachers',
                         'schooladmins')
    collection = collection.lower().replace(' ', '')
    if collection not in valid_collections:
        raise ReportNotFound(
            (
                "Argument for collection '{collection}' is not a valid. Please choose from: "
                "'schools', 'students', 'sections', 'teachers', 'schooladmins'."
            ).format(collection=collection)
        )

    report_access_page_url = interpret_report_url(self.base_url,
                                                  application_page_url)

    if write_to_disk:
        csv_download_folder_path = write_to_disk
    else:
        csv_download_folder_path = mkdtemp()

    try:
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()

            self.log.debug('Getting report access page at: {}'.format(
                report_access_page_url))
            self.driver.get(report_access_page_url)

            # find and click the download button based on the collection
            # desired
            download_link = WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     "//a[contains(@href, '{collection}.csv')]".format(
                         collection=collection))
                )
            )

            self.log.info('Starting download of: {} - {}'.format(
                report_access_page_url, collection))
            download_link.click()

            wait_for_any_file_in_folder(csv_download_folder_path, "csv")
            self.log.info('Download Finished.')

            df_report = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                **kwargs)
        finally:
            # always release the browser, even when the download fails
            self.driver.close()
    finally:
        # only clean up the folder if we created it ourselves
        if not write_to_disk:
            shutil.rmtree(csv_download_folder_path)

    # if the dataframe is empty (the report had no data), raise an error;
    # an empty 'schooladmins' export is tolerated with a warning
    if df_report.shape[0] == 0:
        if collection != 'schooladmins':
            raise ValueError(
                'No data in report for user {} at url: {}'.format(
                    self.username, report_access_page_url))
        warnings.warn("The 'schooladmins' collection has no data. Ensure that no school admins are shared.")

    return df_report