Example #1
class Lexia(WebUIDataSource, LoggingMixin):
    """ Class for interacting with the web ui of Lexia
    """
    def __init__(self,
                 username,
                 password,
                 wait_time,
                 hostname,
                 temp_folder_path=None,
                 headless=False,
                 lexia_school_year_start_date=None,
                 district_export_email_address=None,
                 district_export_email_password=None,
                 district_export_email_imap_uri=None,
                 district_export_email_folder='Lexia District Exports',
                 district_export_email_wait_time=600,
                 district_export_email_retry_frequency=30,
                 district_id=None):
        super().__init__(username, password, wait_time, hostname,
                         temp_folder_path, headless)
        self.lexia_school_year_start_date = lexia_school_year_start_date
        self.district_export_email_address = district_export_email_address
        self.district_export_email_password = district_export_email_password
        self.district_export_email_imap_uri = district_export_email_imap_uri
        self.district_export_email_folder = district_export_email_folder
        self.district_export_email_wait_time = district_export_email_wait_time
        self.district_export_email_retry_frequency = district_export_email_retry_frequency
        self.district_id = district_id
        self.uri_scheme = 'https://'
        self.base_url = self.uri_scheme + 'www.' + self.hostname

    def _login(self):
        """ Logs into the provided Lexia instance.
        """
        login_url = self.uri_scheme + 'auth.mylexia.com/mylexiaLogin'
        self.log.debug('Logging into Lexia at: {}'.format(login_url))
        self.driver.get(login_url)
        elem = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located((By.ID, 'username')))

        elem.clear()
        elem.send_keys(self.username)
        elem.send_keys(Keys.RETURN)
        elem = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located((By.ID, 'login-password')))
        elem.send_keys(self.password)
        elem.send_keys(Keys.RETURN)

        # ensure that login was successful: the myLexia login page title appears
        # to contain 'Welcome', so still seeing it after loading base_url is
        # treated as a failed login
        self.driver.get(self.base_url)

        if 'Welcome' in self.driver.title:
            self.driver.close()
            raise InvalidLoginCredentials

    def download_url_report(self, report_url, write_to_disk=None, **kwargs):
        """ Downloads a Lexia report at a URL for a page with an 'export' button.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_excel or
                read_csv (depending on the report_url)

        Returns: A Pandas DataFrame of the report contents.
        """

        report_download_url = interpret_report_url(self.base_url, report_url)

        # if user is trying to download a manage tab report (for convenience)
        if '/mylexiaweb/app/index.html#/groups/' in report_download_url:
            return self.download_manage_tab_report(report_url, write_to_disk,
                                                   **kwargs)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        self._login()

        self.log.debug(
            'Getting report page at: {}'.format(report_download_url))
        self.driver.get(report_download_url)

        # find and click the download button
        elem = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located(
                (By.XPATH, "//button[contains(text(), 'Export')]")))

        self.log.debug('Starting download of: {}'.format(report_download_url))
        elem.click()

        wait_for_any_file_in_folder(csv_download_folder_path, "xlsx")
        self.log.debug('Download Finished.')

        df_report = pd.read_excel(
            get_most_recent_file_in_dir(csv_download_folder_path), **kwargs)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, interpret_report_url(self.base_url,
                                                    report_url)))

        self.driver.close()

        if not write_to_disk:
            shutil.rmtree(csv_download_folder_path)

        return df_report
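
    # Usage sketch (illustrative only; the credentials, hostname, and report
    # path are placeholders rather than values from the original source):
    #
    #   lexia = Lexia('user@example.com', 'secret', wait_time=30,
    #                 hostname='mylexia.com')
    #   df = lexia.download_url_report('/some/report/path?filter=...')
    #
    # Manage-tab URLs (containing '/mylexiaweb/app/index.html#/groups/') are
    # dispatched automatically to download_manage_tab_report below.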

    def download_manage_tab_report(self,
                                   report_url,
                                   write_to_disk=None,
                                   **kwargs):
        """ Downloads a Lexia report from the 'Manage' tab.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.
        """
        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        self._login()

        report_download_url = interpret_report_url(self.base_url, report_url)
        self.log.debug(
            'Getting report page at: {}'.format(report_download_url))
        self.driver.get(report_download_url)

        # select all users and find the download button
        def check_for_export_button_enabled(driver, elem_select_all_locator,
                                            elem_export_locator):
            elem_select_all = driver.find_element(*elem_select_all_locator)
            if not elem_select_all.is_enabled():
                return False
            elem_select_all.click()
            if not elem_select_all.is_selected():
                return False
            elem_export = driver.find_element(*elem_export_locator)
            if elem_export.is_enabled() and elem_export.is_displayed():
                return elem_export
            else:
                return False

        # wrap the check in a lambda to bind the locator arguments, since
        # until() only passes the driver to its callable
        elem_export = WebDriverWait(
            self.driver,
            self.wait_time).until(lambda x: check_for_export_button_enabled(
                self.driver, (By.NAME, "lexia-select-all"),
                (By.XPATH, "//button[contains(text(), 'Export')]")))
        self.log.debug('Starting download of: {}'.format(report_download_url))
        elem_export.click()

        wait_for_any_file_in_folder(csv_download_folder_path, "xls")
        self.log.debug('Download Finished.')

        df_report = pd.read_csv(
            get_most_recent_file_in_dir(csv_download_folder_path),
            sep='\t',
            **kwargs)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, interpret_report_url(self.base_url,
                                                    report_url)))

        self.driver.close()

        if not write_to_disk:
            shutil.rmtree(csv_download_folder_path)

        return df_report

    def download_district_export_core5_monthly(
        self,
        write_to_disk=None,
        pandas_read_csv_kwargs={},
        period_end_date=None):
        return self._download_district_export(
            report_type='export',
            period_end_date=period_end_date,
            write_to_disk=write_to_disk,
            pandas_read_csv_kwargs=pandas_read_csv_kwargs)

    def download_district_export_core5_year_to_date(
        self,
        write_to_disk=None,
        pandas_read_csv_kwargs={},
        period_end_date=None):
        return self._download_district_export(
            report_type='expytd',
            period_end_date=period_end_date,
            write_to_disk=write_to_disk,
            pandas_read_csv_kwargs=pandas_read_csv_kwargs)

    def download_district_export_powerup_year_to_date(
        self,
        write_to_disk=None,
        pandas_read_csv_kwargs={},
        period_end_date=None):
        return self._download_district_export(
            report_type='pupytd',
            period_end_date=period_end_date,
            write_to_disk=write_to_disk,
            pandas_read_csv_kwargs=pandas_read_csv_kwargs)
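
    # Note: the report_type codes used above ('export', 'expytd', 'pupytd')
    # appear to correspond to the Core5 monthly, Core5 year-to-date, and
    # PowerUp year-to-date options in the myLexia 'District Exports' modal
    # (see the __request_district_export docstring below).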

    def _download_district_export(self,
                                  report_type,
                                  period_end_date,
                                  period_start_date=None,
                                  write_to_disk=None,
                                  pandas_read_csv_kwargs={}):
        # resolve the default end date at call time rather than at import time
        if period_end_date is None:
            period_end_date = dt.datetime.now().date()
        if not period_start_date:
            period_start_date = self.lexia_school_year_start_date
        self.__request_district_export(report_type, period_start_date,
                                       period_end_date)

        df_report = None
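        # polling budget: with the defaults (600 s total wait, 30 s between
        # retries) this works out to int(600 / 30) = 20 attempts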
        number_retries = int(self.district_export_email_wait_time /
                             self.district_export_email_retry_frequency)
        for retry_count in range(0, number_retries):
            if retry_count > 0:
                time.sleep(self.district_export_email_retry_frequency)
            self.log.info(
                str(self.district_id) + ': get export_id from email, try: ' +
                str(retry_count))
            try:
                export_id = self.__get_exportid_from_email()
            except ValueError as err:
                self.log.debug(err)
                self.log.warning(
                    '{}: No export_id found in email, retrying in {} seconds.'.
                    format(self.district_id,
                           self.district_export_email_retry_frequency))
                continue

            try:
                df_report = self.__download_export_for_exportid(
                    export_id, write_to_disk, pandas_read_csv_kwargs)
                break
            except NoDataError as e:
                self.log.warning('{}: {} Retrying in {} seconds.'.format(
                    self.district_id, e,
                    self.district_export_email_retry_frequency))
        if df_report is None:
            raise ReportNotFound(
                'No email was received with report id. Make sure the emails are not going to spam.'
            )
        else:
            return df_report

    def __request_district_export(self,
                                  report_type,
                                  period_start_date=None,
                                  period_end_date=None,
                                  write_to_disk=None):
        """
        Logs into Lexia and submits the request to generate a district export
        :param report_type: The text from one of 'Report type' options listed in the myLexia
            'District Exports' modal.
        :param period_start_date: The start date for the report request (unsure if this actually
            affects the data returned if it is different from the school year start date set
            for your Lexia instance)
        :param period_end_date: The end date for the report request (unsure if this actually
            affects the data returned if it is different from the day on which the request is made)
        :param write_to_disk: The path to save the CSV to.
        :return: Boolean. Whether or not the export request was successful.
        """
        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = self.temp_folder_path
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        self._login()

        # use requests to post the download request
        with requests.Session() as s:
            for cookie in self.driver.get_cookies():
                s.cookies.set(cookie['name'], cookie['value'])

            payload = {
                "districtID": self.district_id,
                "type": report_type,
                "email": self.district_export_email_address,
                "startDate": period_start_date.strftime("%Y-%m-%d"),
                "endDate": period_end_date.strftime("%Y-%m-%d")
            }
            self.log.info('{}: Export request payload: {}'.format(
                self.district_id, payload))
            download_response = s.put(self.base_url + '/exportData/progress',
                                      data=payload)

            if download_response.ok:
                self.log.info(
                    '{}: Export request for {} succeeded for user: {}'.format(
                        self.district_id, report_type, self.username))
                j_data = json.loads(download_response.content.decode())
                self.log.info(j_data)
                return True
            else:
                self.log.info(
                    '{}: Export request for {} FAILED for user: {}'.format(
                        self.district_id, report_type, self.username))
                self.log.info(download_response.content)
                return False

    def __get_exportid_from_email(self):
        """Log into an IMAP email server and get messages in a specific folder.
        Checks for a new Lexia export_id in those messages.

        Returns:
            int: the export_id
        """
        self.log.info('Checking email for latest report ID for district_id: ' +
                      str(self.district_id))
        imap_conn = imaplib.IMAP4_SSL(self.district_export_email_imap_uri)

        try:
            imap_conn.login(self.district_export_email_address,
                            self.district_export_email_password)
        except imaplib.IMAP4.error:
            self.log.error('Email login failed for: ' +
                           self.district_export_email_address)
            sys.exit(1)

        rv, data = imap_conn.select('"{}"'.format(
            self.district_export_email_folder))
        if rv == 'OK':
            self.log.info('Processing mailbox for ' +
                          self.district_export_email_address + ' in folder "' +
                          self.district_export_email_folder + '"')
            export_id = self.__extract_lexia_export_id_from_email(imap_conn)
            if export_id == -1:
                imap_conn.close()
                raise ValueError('No new export_id found on ' +
                                 self.district_export_email_address)
            else:
                imap_conn.close()
                return export_id

        else:
            imap_conn.logout()
            raise InvalidIMAPParameters(
                "ERROR: Unable to open mailbox. Check your parameters and email folder. Message: ",
                rv)

    def __extract_lexia_export_id_from_email(self, imap_conn):
        """ Extract the export_id that is sent by Lexia that is needed to
        download the prepared report export.

        Email messages in Gmail aren't sorted and can't be sorted using
        regular IMAP functions (Gmail does not support them). Therefore
        we will search within the folder for messages in the last day.

        Args:
            imap_conn (imaplib.IMAP4_SSL): A current connection to an IMAP
                email account.

        Returns:
            int: The new export_id
        """
        # get all messages received in the last day
        rv, data = imap_conn.search(
            None, '(SINCE ' +
            (dt.datetime.now() - dt.timedelta(1)).strftime("%d-%b-%Y") + ')')
        if rv != 'OK':
            self.log.warning("No email messages found!")
            # TODO change this to raise an error
            return -1

        highest_export_id = -1
        for num in data[0].split():
            rv, data = imap_conn.fetch(num, '(RFC822)')
            if rv != 'OK':
                # TODO change this to raise an error
                self.log.error("ERROR getting email message", num)
                return -1

            msg = email.message_from_bytes(data[0][1])
            self.log.debug('Processing Message %s, Raw Date: %s' %
                           (num, msg['Date']))
            for part in msg.walk():
                # each part is a either non-multipart, or another multipart message
                # that contains further parts... Message is organized like a tree
                if part.get_content_type() == 'text/plain':
                    # get the raw text
                    part_str = part.get_payload()
                    # extract the report id
                    match = re.search(r'(?<=id=)(\d*?)(?=\s)', part_str)
                    if match:
                        export_id = int(match.group(0))
                        self.log.debug('export_id found: ' + str(export_id))
                        if export_id > highest_export_id:
                            highest_export_id = export_id
                    else:
                        return -1

        return highest_export_id
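
    # Illustration of the extraction above using a made-up email body (the
    # real Lexia notification wording may differ):
    #
    #   re.search(r'(?<=id=)(\d*?)(?=\s)', 'Your export is ready: id=12345 .')
    #   # -> match.group(0) == '12345'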

    def __download_export_for_exportid(self,
                                       export_id,
                                       write_to_disk=None,
                                       pandas_read_csv_kwargs={}):
        """Logs into lexia and downloads the report associated with a specific
        export_id.

        Args:
            export_id (int): The Lexia export id to download.
            write_to_disk (str): A path where the CSV that has been downloaded should be written to disk.
            pandas_read_csv_kwargs (dict): kwargs to pass to the Pandas read_csv function as necessary
        Returns:
            A Pandas dataframe with the report contents
        """
        self.log.info(
            str(self.district_id) + ': downloading report with export_id=' +
            str(export_id))
        with requests.Session() as s:
            for cookie in self.driver.get_cookies():
                s.cookies.set(cookie['name'], cookie['value'])

            export_url = self.base_url + '/reports/get_export.php' + '?id=' + str(
                export_id)
            download_response = s.get(export_url)

            self.log.info(
                'Report download request response for export_id {}: {}'.format(
                    export_id, download_response.content))

            if download_response.ok:
                df_report = pd.read_csv(
                    io.StringIO(
                        download_response.content.decode(LEXIA_CSV_ENCODING)),
                    **pandas_read_csv_kwargs)

                # if the dataframe is empty (the report had no data), raise an error
                if df_report.shape[0] == 0:
                    raise NoDataError(
                        'No data in report for user {} at url: {}'.format(
                            self.username, export_url))
            else:
                raise ValueError('Report download request failed')

        self.driver.close()

        if write_to_disk:
            df_report.to_csv(write_to_disk)

        return df_report
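
# A minimal end-to-end sketch of the district export flow (illustrative only:
# every value below is a placeholder, and the hostname and IMAP settings are
# assumptions rather than values taken from the code above):
def example_lexia_district_export():
    lexia = Lexia(
        username='district_admin@example.com',
        password='secret',
        wait_time=30,
        hostname='mylexia.com',  # base_url becomes 'https://www.' + hostname
        lexia_school_year_start_date=dt.date(2020, 8, 15),
        district_export_email_address='exports@example.com',
        district_export_email_password='secret',
        district_export_email_imap_uri='imap.gmail.com',
        district_id=12345)
    # submits the export request, polls the mailbox for the export_id email,
    # then downloads the prepared CSV as a DataFrame
    return lexia.download_district_export_core5_monthly()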
Example #2
class Clever(WebUIDataSource, LoggingMixin):
    """ Class for interacting with the Clever Web UI
    """

    def __init__(self, username, password, wait_time, hostname='schools.clever.com',
                 temp_folder_path=None, headless=False):
        super().__init__(username, password, wait_time, hostname, temp_folder_path)
        self.uri_scheme = 'https://'
        self.base_url = self.uri_scheme + self.hostname
        self.headless = headless
        self.log.debug('creating instance of Clever')

    def _login(self):
        """ Logs into the provided Clever instance.
        """
        self.log.info('Logging into Clever instance: hostname, username: {}, {}'.format(
            self.hostname, self.username
        ))
        self.driver.get(self.base_url)
        # wait until login form available
        elem = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located((By.NAME, 'username')))

        elem.clear()
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_name("password")
        elem.send_keys(self.password)
        elem.send_keys(Keys.RETURN)

        # ensure that login is successful
        self.driver.get(self.base_url)

        if 'Clever | Home' not in self.driver.title:
            self.driver.close()
            raise InvalidLoginCredentials

    def download_url_report(self, report_url, collection, write_to_disk=None, **kwargs):
        """Currently a short cut for download_data_shared_with_application"""
        return self.download_data_shared_with_application(report_url, collection, write_to_disk, **kwargs)

    def download_data_shared_with_application(self, application_page_url, collection,
                                              write_to_disk=None, **kwargs):
        """
        Downloads the students shared with a particular application through Clever.
        :param application_page_url: The url for the main Clever management page for a
            particular application. For example, for My Lexia, this would be
            https://schools.clever.com/applications/lexia-mylexia
        :param collection: A string of 'schools', 'students', 'sections', 'teachers', 'schooladmins'
            that indicates which shared data to download
        :param write_to_disk: A path to a directory where the downloaded CSV should be saved.
            If nothing is passed, it will not be saved and only a Pandas DataFrame will be returned.
        :param kwargs: Additional keyword arguments to be passed to the Pandas read_csv function.
        :return: A Pandas DataFrame of the indicated collection download.
        """
        collection = collection.lower().replace(' ', '')
        if collection not in ['schools', 'students', 'sections', 'teachers', 'schooladmins']:
            raise ReportNotFound(
                (
                    "Argument for collection '{collection}' is not a valid. Please choose from: "
                    "'schools', 'students', 'sections', 'teachers', 'schooladmins'."
                ).format(collection=collection)
            )
        report_access_page_url = interpret_report_url(self.base_url, application_page_url)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()
        self.driver = DriverBuilder().get_driver(csv_download_folder_path, self.headless)
        self._login()

        self.log.debug('Getting report access page at: {}'.format(report_access_page_url))
        self.driver.get(report_access_page_url)

        # find and click the download button based on the collection desired
        elem = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located(
                (By.XPATH, "//a[contains(@href, '{collection}.csv')]".format(collection=collection))
            )
        )
        self.log.info('Starting download of: {} - {}'.format(report_access_page_url, collection))
        elem.click()

        wait_for_any_file_in_folder(csv_download_folder_path, "csv")
        self.log.info('Download Finished.')

        df_report = pd.read_csv(get_most_recent_file_in_dir(csv_download_folder_path),
                                **kwargs)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0 and collection != 'schooladmins':
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, interpret_report_url(self.base_url, application_page_url)))
        elif df_report.shape[0] == 0:
            warnings.warn("The 'schooladmins' collection has no data. Ensure that no school admins are shared.")

        self.driver.close()

        if not write_to_disk:
            shutil.rmtree(csv_download_folder_path)

        return df_report
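
    # Usage sketch (placeholder values; the application page URL follows the
    # pattern described in the docstring above):
    #
    #   clever = Clever('user@example.com', 'secret', wait_time=30)
    #   df_students = clever.download_data_shared_with_application(
    #       'https://schools.clever.com/applications/lexia-mylexia', 'students')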

    def download_google_accounts_manager_student_export(self):
        """ Downloads the Google Accounts Manager Student Export that includes student emails."""
        self.log.info('Starting student email download.')
        # set up the driver for execution
        self.driver = configure_selenium_chrome()
        self._login()

        # grab some cookies (need to do this here for _mkto_trk cookie)
        cookies_orig = self.driver.get_cookies()

        # open the Google Accounts Manager application page
        # note - Clever applications like Google Accounts Manager have unique IDs that are part of their URL
        # note - we have to load the settings page of the Google Accounts Manager to get the cookie
        #  that we need in order to download the file
        self.driver.get('https://schools.clever.com/school/applications/50ca15a93bc2733956000007/settings')
        cookies_schools = self.driver.get_cookies()

        # we may need to load gaprov.ops.clever.com to pick up a cookie in newer versions of chromedriver
        self.driver.get('https://gaprov.ops.clever.com/')
        cookies_gaprov = self.driver.get_cookies()

        # create requests session to download report without need for file storage
        with requests.Session() as s:

            # transfer over a bunch of cookies to the requests session
            for cookie in cookies_orig:
                s.cookies.set(cookie['name'], cookie['value'])

            for cookie in cookies_schools:
                s.cookies.set(cookie['name'], cookie['value'])

            for cookie in cookies_gaprov:
                s.cookies.set(cookie['name'], cookie['value'])

            s.cookies.set('_gat', "1")
            s.cookies.set('_gat_globalTracker', "1")

            report_url = 'https://gaprov.ops.clever.com/reporting/student'

            # download with 10 retries on failure
            c = 0
            while True:
                download_response = s.get(report_url, stream=True)

                if download_response.ok:
                    df_report = pd.read_csv(io.StringIO(download_response.content.decode('utf-8')))
                else:
                    self.log.info('Download failed for report url: {}'.format(report_url))
                    self.log.info('Download status_code: {}'.format(download_response.status_code))
                    self.log.info('Retrying... Retry#: {}'.format(c+1))
                    if c >= 9:
                        raise ValueError('Unable to download report after multiple retries.')
                    # add some jitter to the requests
                    sleep_time = (1000 + randint(0, 500)) / 1000
                    time.sleep(sleep_time)
                    c += 1
                    continue
                break
        self.driver.close()

        self.log.info('Student email download complete.')

        return df_report
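
# Minimal usage sketch for the Google Accounts Manager export above (the
# credentials are placeholders; this presumably requires the Clever account to
# have access to the Google Accounts Manager application):
def example_clever_student_email_export():
    clever = Clever('user@example.com', 'secret', wait_time=30)
    return clever.download_google_accounts_manager_student_export()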