Python SolusParser.update_html示例

编程语言: Python

命名空间/包名称: parser1

类/类型: SolusParser

方法/功能: update_html

hotexamples.com的示例: 2

Python SolusParser.update_html - 共找到 2 个示例。以下是从开源项目中提取的、评价最高的 parser1.SolusParser.update_html 真实 Python 代码示例。您可以为示例评分，帮助我们提高示例质量。

常用方法

显示隐藏

SolusParser(1)

update_html(1)

示例#1

显示文件

文件： navigation.py 项目： NSegal/FSUCourseScraper

class SolusSession(object):
    """Browsing session against the FSU course catalog.

    Holds a ``requests`` session, the most recent response and its text, a
    lazily refreshed ``SolusParser`` and a "recovery stack" that remembers the
    current navigation path (letter, subject, course, term, section) so the
    path can be replayed when the site answers with a "Data Integrity Error".
    """

    login_url = "http://cas.fsu.edu/cas/login?service=https://my.fsu.edu" #changed to fsu page
    continue_url = "https://campus.omni.fsu.edu/psc/sprdcs/EMPLOYEE/HRMS/c/SA_LEARNER_SERVICES.SSS_BROWSE_CATLG_P.GBL?Page=SSS_BROWSE_CATLG&Action=U" #not sure if this is the right redirect page
    course_catalog_url = "https://campus.omni.fsu.edu/psc/sprdcs/EMPLOYEE/HRMS/c/SA_LEARNER_SERVICES.SSS_BROWSE_CATLG_P.GBL?Page=SSS_BROWSE_CATLG&Action=U"

    def __init__(self, user=None, password=None):
        """Log in and navigate to the course catalog.

        Args:
            user: Login user name.
            password: Base64-encoded password (it is decoded in ``login``).

        Raises:
            EnvironmentError: If authentication fails, or if the session is
                not on ``course_catalog_url`` after navigation.
        """
        self.session = requests.session()

        # Force TLSv1 for every HTTPS connection of this session
        self.session.mount('https://', SSLAdapter(ssl_version=ssl.PROTOCOL_TLSv1))

        # Parser (re-fed lazily through the ``parser1`` property)
        self._parser1 = SolusParser()
        self._update_parser1 = False

        # Response data
        self.latest_response = None
        self.latest_text = None

        # Recover from errors
        self.recovery_state = -1 #State of recovery ( < 0 is not recovering, otherwise the current recovery level)
        self.recovery_stack = [None, None, None, None, None] #letter, subject, course, term, section

        # Authenticate and navigate to course catalog
        logging.info("Logging in...")
        self.login(user, password)

        logging.info("Navigating to course catalog...")
        self.go_to_course_catalog()

        print(self.parser1.print_title())

        # Show the HTTP status code of the login page.
        # NOTE(review): this is a separate request made outside self.session
        # (no session cookies, no retry handling).
        self.status_code = requests.get(self.login_url).status_code

        # Should now be on the course catalog page. If not, something went wrong
        if self.latest_response.url != self.course_catalog_url:
            # SOLUS Doesn't like requests v2.1.0 (getting error 999, unsupported OS)
            # Seems to be a quirk of it. The headers don't matter (even user-agent)
            # Sticking with v2.0.1 until the issue is resolved
            raise EnvironmentError("Authenticated, but couldn't access the FSU course catalog.")

    @property
    def parser1(self):
        """Return the parser, feeding it the latest response text first if a
        request has been made since the last access."""
        if self._update_parser1:
            self._parser1.update_html(self.latest_text)
            self._update_parser1 = False
        return self._parser1

    def login(self, user, password):
        """Log into the site.

        Args:
            user: Login user name.
            password: Base64-encoded password.

        Raises:
            EnvironmentError: If no follow-up link is found on the page
                returned after submitting the login form.
        """

        # Decode base64'd password
        password = base64.b64decode(password)

        # Load the access page to set all the cookies and get redirected
        self._get(self.login_url)

        # Hidden form values the login form must echo back
        vars_list = self.parser1.get_input_vars()
        lt_value = vars_list[0]
        execution_value = vars_list[1]

        # Presumably the click coordinates of the submit button -- TODO confirm
        x = randint(0,9)
        y = randint(0,9)

        # Login procedure is different when JS is disabled
        payload = {
           'username': user,
           'password': password,
           'lt': lt_value,
           'execution': execution_value,
           '_eventId': 'submit',
           'submit.x': x,
           'submit.y': y
        }

        print('Logging in...')

        self._post(self.latest_response.url, data=payload)

        # Check for the continue page
        #if self.continue_url in self.latest_response.url:
        #    self.do_continue_page()

        # Should now be authenticated and on the portal page; request the
        # link the parser extracts from it
        link = self.parser1.login_solus_link()
        if not link:
            # Not on the right page
            raise EnvironmentError("Could not authenticate with the FSU's Student Central system. The login credentials provided may have been incorrect.")

        logging.info("Sucessfully authenticated.")
        # Have to actually use this link to access SOLUS initially otherwise it asks for login again
        self._get(link)

        # The request could (seems 50/50 from browser tests) bring up another continue page
        if self.continue_url in self.latest_response.url:
            self.do_continue_page()

        # Should now be logged in and on the student center page

    def do_continue_page(self):
        """
        The SSO system returns a specific page only if JS is disabled
        It has you click a Continue button which submits a form with some hidden values

        Does nothing when the parser finds no continue form on the page.
        """
        data = self.parser1.login_continue_page()
        if not data:
            return
        self._post(data["url"], data=data["payload"])

    def go_to_course_catalog(self):
        """Post an empty action to the catalog, then select the letter "A"."""
        self._catalog_post("")
        self.select_alphanum("A")

    # ----------------------------- Alphanums ------------------------------------ #

    def select_alphanum(self, alphanum):
        """Navigate to a letter/number and remember it for recovery."""
        logging.debug(u"Selecting letter {0}".format(alphanum))
        self._catalog_post(u'DERIVED_SSS_BCC_SSR_ALPHANUM_{0}'.format(alphanum.upper()))

        # Only record the position during normal navigation, not while replaying
        if self.recovery_state < 0:
            self.recovery_stack[0] = alphanum

    # ----------------------------- Subjects ------------------------------------- #

    def dropdown_subject(self, subject_unique):
        """Open the dropdown menu for a subject.

        Raises:
            Exception: If the parser has no action for ``subject_unique``.
        """
        logging.debug(u"Dropping down subject with unique '{0}'".format(subject_unique))

        action = self.parser1.subject_action(subject_unique)
        if not action:
            raise Exception(u"Tried to drop down an invalid subject unique '{0}'".format(subject_unique))

        self._catalog_post(action)

        if self.recovery_state < 0:
            self.recovery_stack[1] = subject_unique

    def rollup_subject(self, subject_unique):
        """Close the dropdown menu for a subject.

        Raises:
            Exception: If the parser has no action for ``subject_unique``.
        """
        logging.debug(u"Rolling up subject with a unique '{0}'".format(subject_unique))

        action = self.parser1.subject_action(subject_unique)
        if not action:
            raise Exception(u"Tried to roll up an invalid subject unique '{0}'".format(subject_unique))

        self._catalog_post(action)

        if self.recovery_state < 0:
            self.recovery_stack[1] = None

    # ----------------------------- Courses ------------------------------------- #

    def open_course(self, course_unique):
        """Open a course page, following the disambiguation step if present.

        Raises:
            Exception: If the parser has no action for ``course_unique``.
        """
        logging.debug(u"Opening course with unique '{0}'".format(course_unique))

        action = self.parser1.course_action(course_unique)
        if not action:
            raise Exception(u"Tried to open a course with an invalid unique '{0}'".format(course_unique))

        self._catalog_post(action)

        #attempt to go one level deeper to deal with courses which have multiple 'careers'
        secondary_action = self.parser1.disambiguation_action()

        if secondary_action:
            # Routine navigation step, so log at debug like the other steps
            logging.debug(u"POSTING: {0}".format(secondary_action))
            self._catalog_post(secondary_action)

        # unsure if this still works
        if self.recovery_state < 0:
            self.recovery_stack[2] = course_unique

    def return_from_course(self):
        """Navigate back from course to subject and clear the course/term slots."""
        logging.debug("Returning from a course")
        #hacky, attempt to return from the disambiguation page first
        self._catalog_post('DERIVED_SAA_CRS_RETURN_PB')
        self._catalog_post('DERIVED_SSS_SEL_RETURN_PB')

        self.recovery_stack[3] = None
        self.recovery_stack[2] = None

    # -----------------------------Sections ------------------------------------- #

    def show_sections(self):
        """Click the 'View class sections' button on the course page if it exists."""
        action = self.parser1.show_sections_action()

        if action:
            logging.debug("Pressing the 'View class sections' button")
            self._catalog_post(action)

    def switch_to_term(self, term_unique):
        """Show the sections for the term and remember it for recovery."""
        logging.debug(u"Switching to term with unique '{0}'".format(term_unique))
        value = self.parser1.term_value(term_unique)

        self._catalog_post(action='DERIVED_SAA_CRS_SSR_PB_GO$98$', extras={'DERIVED_SAA_CRS_TERM_ALT': value})

        if self.recovery_state < 0:
            self.recovery_stack[3] = term_unique

    def view_all_sections(self):
        """Press the "view all sections" link on the course page if needed."""
        action = self.parser1.view_all_action()

        if action:
            logging.debug("Pressing the 'View all' button for sections")
            self._catalog_post(action)

    def visit_section_page(self, section_unique):
        """
        Opens the dedicated page for the provided section unique.
        Used for deep scrapes

        Raises:
            Exception: If the parser has no action for ``section_unique``.
        """
        logging.debug(u"Visiting section page for section with unique '{0}'".format(section_unique))

        action = self.parser1.section_action(section_unique)
        if not action:
            raise Exception(u"Tried to open a section with an invalid unique '{0}'".format(section_unique))

        self._catalog_post(action)

        if self.recovery_state < 0:
            self.recovery_stack[4] = section_unique

    def return_from_section(self):
        """
        Navigates back from section to course.
        Used for deep scrapes
        """
        logging.debug("Returning from section page")
        self._catalog_post('CLASS_SRCH_WRK2_SSR_PB_CLOSE')
        self.recovery_stack[4] = None

    # -----------------------------General Purpose------------------------------------- #

    def _get(self, url, **kwargs):
        """GET ``url`` through the session (with retries) and store the response."""
        self.latest_response = self._request_with_retries(self.session.get, url, **kwargs)
        self._update_attrs()

    def _post(self, url, **kwargs):
        """POST to ``url`` through the session (with retries) and store the response."""
        self.latest_response = self._request_with_retries(self.session.post, url, **kwargs)
        self._update_attrs()

    def _request_with_retries(self, method, *args, **kwargs):
        """Call ``method(*args, **kwargs)``, retrying on ``ConnectionError``.

        After a failure the call is repeated up to ``MAX_RETRIES`` more times,
        sleeping ``RETRY_SLEEP_SECONDS`` between attempts. The last
        ``ConnectionError`` is re-raised once the retries are used up.

        Returns:
            Whatever ``method`` returns on the first successful call.
        """
        result = None
        attempts = 0
        while attempts <= MAX_RETRIES:
            attempts += 1
            try:
                result = method(*args, **kwargs)
                break
            except ConnectionError:
                if attempts <= MAX_RETRIES:
                    logging.warning("ConnectionError, attempt {0} of {1}".format(attempts,MAX_RETRIES))
                    sleep(RETRY_SLEEP_SECONDS)
                else:
                    logging.critical("ConnectionError, reached maxium number of retries.")
                    raise
        return result

    def _update_attrs(self):
        """Cache the latest response text and flag the parser as stale."""
        self.latest_text = self.latest_response.text

        # The parser1 requires an update
        self._update_parser1 = True

    def _catalog_post(self, action, extras=None):
        """Submit a post request to the course catalog.

        Args:
            action: Value sent as the ``ICAction`` form field.
            extras: Optional dict of additional form fields. It is copied,
                never modified, so callers may reuse the same dict.
        """
        # Work on a copy so the caller's dict is not polluted with ICAction
        payload = {} if extras is None else dict(extras)
        payload['ICAction'] = action
        self._post(self.course_catalog_url, data=payload)

        #import random
        # TODO: Improve this, could easily give false positives
        if "Data Integrity Error" in self.latest_text:
            self._recover(action, extras)

        # TESTING - Fake a DIE using random number generator
        #elif action != "" and random.random() < 0.1:
        #    self._get(self.course_catalog_url)
        #    self._recover(action, extras)

    def _recover(self, action, extras):
        """Replay the recorded navigation path after an error, then retry.

        Args:
            action: The ``ICAction`` of the request that failed.
            extras: The extra form fields of the request that failed.
        """

        # Don't recurse, retry: resetting the state to 0 makes the recovery
        # loop that is already running start over from the first level
        if self.recovery_state >= 0:
            logging.warning("Error while recovering, retrying")
            self.recovery_state = 0
            return

        # Number of non-null elements in the recovery stack
        # (assumes the slots are filled contiguously from the front -- TODO confirm)
        num_states = len(self.recovery_stack) - self.recovery_stack.count(None)

        # Start recovery process
        logging.warning("Encounted SOLUS Data Integrety Error, attempting to recover")
        self.recovery_state = 0

        while self.recovery_state < num_states:

            # Has to be done before the recovery operations
            self.recovery_state += 1

            # State numbers are OBO due to previous increment
            if self.recovery_state == 1:
                self.select_alphanum(self.recovery_stack[0])
            elif self.recovery_state == 2:
                self.dropdown_subject(self.recovery_stack[1])
            elif self.recovery_state == 3:
                self.open_course(self.recovery_stack[2])
                self.show_sections()
            elif self.recovery_state == 4:
                self.switch_to_term(self.recovery_stack[3])
                self.view_all_sections()
            elif self.recovery_state == 5:
                self.visit_section_page(self.recovery_stack[4])

        # Finished recovering
        self.recovery_state = -1
        logging.warning("Recovered, retrying original request")

        self._catalog_post(action, extras)

示例#2

显示文件

文件： navigation.py 项目： NSegal/FSUCourseScraper

class SolusSession(object):
    """Represents a browsing session on the FSU course catalog.

    The session keeps the last response and its text, a parser that is
    refreshed on demand, and a five-slot recovery stack (letter, subject,
    course, term, section) used to replay navigation after the site reports
    a "Data Integrity Error".
    """

    login_url = "http://cas.fsu.edu/cas/login?service=https://my.fsu.edu"  #changed to fsu page
    continue_url = "https://campus.omni.fsu.edu/psc/sprdcs/EMPLOYEE/HRMS/c/SA_LEARNER_SERVICES.SSS_BROWSE_CATLG_P.GBL?Page=SSS_BROWSE_CATLG&Action=U"  #not sure if this is the right redirect page
    course_catalog_url = "https://campus.omni.fsu.edu/psc/sprdcs/EMPLOYEE/HRMS/c/SA_LEARNER_SERVICES.SSS_BROWSE_CATLG_P.GBL?Page=SSS_BROWSE_CATLG&Action=U"

    def __init__(self, user=None, password=None):
        """Authenticate and move to the course catalog page.

        Args:
            user: Login user name.
            password: Base64-encoded password (decoded inside ``login``).

        Raises:
            EnvironmentError: If authentication fails, or if the latest
                response is not at ``course_catalog_url`` afterwards.
        """
        self.session = requests.session()

        # Force TLSv1 on all HTTPS connections made by this session
        self.session.mount('https://',
                           SSLAdapter(ssl_version=ssl.PROTOCOL_TLSv1))

        # Parser (kept in sync lazily by the ``parser1`` property)
        self._parser1 = SolusParser()
        self._update_parser1 = False

        # Response data
        self.latest_response = None
        self.latest_text = None

        # Recover from errors
        self.recovery_state = -1  #State of recovery ( < 0 is not recovering, otherwise the current recovery level)
        self.recovery_stack = [None, None, None, None, None
                               ]  #letter, subject, course, term, section

        # Authenticate and navigate to course catalog
        logging.info("Logging in...")
        self.login(user, password)

        logging.info("Navigating to course catalog...")
        self.go_to_course_catalog()

        print(self.parser1.print_title())

        # Show the HTTP status code of the login page.
        # NOTE(review): issued with requests.get, i.e. outside self.session
        # and without the retry wrapper.
        self.status_code = requests.get(self.login_url).status_code

        # Should now be on the course catalog page. If not, something went wrong
        if self.latest_response.url != self.course_catalog_url:
            # SOLUS Doesn't like requests v2.1.0 (getting error 999, unsupported OS)
            # Seems to be a quirk of it. The headers don't matter (even user-agent)
            # Sticking with v2.0.1 until the issue is resolved
            raise EnvironmentError(
                "Authenticated, but couldn't access the FSU course catalog.")

    @property
    def parser1(self):
        """Updates the parser1 with new HTML (if needed) and returns it"""
        if self._update_parser1:
            self._parser1.update_html(self.latest_text)
            self._update_parser1 = False
        return self._parser1

    def login(self, user, password):
        """Logs into the site.

        Args:
            user: Login user name.
            password: Base64-encoded password.

        Raises:
            EnvironmentError: If the page returned after submitting the
                credentials contains no follow-up link.
        """

        # Decode base64'd password
        password = base64.b64decode(password)

        # Load the access page to set all the cookies and get redirected
        self._get(self.login_url)

        # Hidden form fields that have to be sent back with the credentials
        vars_list = self.parser1.get_input_vars()
        lt_value = vars_list[0]
        execution_value = vars_list[1]

        # Presumably the submit button's click coordinates -- TODO confirm
        x = randint(0, 9)
        y = randint(0, 9)

        # Login procedure is different when JS is disabled
        payload = {
            'username': user,
            'password': password,
            'lt': lt_value,
            'execution': execution_value,
            '_eventId': 'submit',
            'submit.x': x,
            'submit.y': y
        }

        print('Logging in...')

        self._post(self.latest_response.url, data=payload)

        # Check for the continue page
        #if self.continue_url in self.latest_response.url:
        #    self.do_continue_page()

        # Should now be authenticated and on the portal page; follow the link
        # the parser extracts from it
        link = self.parser1.login_solus_link()
        if not link:
            # Not on the right page
            raise EnvironmentError(
                "Could not authenticate with the FSU's Student Central system. The login credentials provided may have been incorrect."
            )

        logging.info("Sucessfully authenticated.")
        # Have to actually use this link to access SOLUS initially otherwise it asks for login again
        self._get(link)

        # The request could (seems 50/50 from browser tests) bring up another continue page
        if self.continue_url in self.latest_response.url:
            self.do_continue_page()

        # Should now be logged in and on the student center page

    def do_continue_page(self):
        """
        The SSO system returns a specific page only if JS is disabled
        It has you click a Continue button which submits a form with some hidden values

        Returns without posting when the parser finds no continue form.
        """
        data = self.parser1.login_continue_page()
        if not data:
            return
        self._post(data["url"], data=data["payload"])

    def go_to_course_catalog(self):
        """Posts an empty catalog action and then selects the letter "A"."""
        self._catalog_post("")
        self.select_alphanum("A")

    # ----------------------------- Alphanums ------------------------------------ #

    def select_alphanum(self, alphanum):
        """Navigates to a letter/number and records it in the recovery stack"""
        logging.debug(u"Selecting letter {0}".format(alphanum))
        self._catalog_post(u'DERIVED_SSS_BCC_SSR_ALPHANUM_{0}'.format(
            alphanum.upper()))

        # Positions are recorded only outside of a recovery replay
        if self.recovery_state < 0:
            self.recovery_stack[0] = alphanum

    # ----------------------------- Subjects ------------------------------------- #

    def dropdown_subject(self, subject_unique):
        """Opens the dropdown menu for a subject.

        Raises:
            Exception: If the parser returns no action for the subject.
        """
        logging.debug(
            u"Dropping down subject with unique '{0}'".format(subject_unique))

        action = self.parser1.subject_action(subject_unique)
        if not action:
            raise Exception(
                u"Tried to drop down an invalid subject unique '{0}'".format(
                    subject_unique))

        self._catalog_post(action)

        if self.recovery_state < 0:
            self.recovery_stack[1] = subject_unique

    def rollup_subject(self, subject_unique):
        """Closes the dropdown menu for a subject.

        Raises:
            Exception: If the parser returns no action for the subject.
        """
        logging.debug(
            u"Rolling up subject with a unique '{0}'".format(subject_unique))

        action = self.parser1.subject_action(subject_unique)
        if not action:
            raise Exception(
                u"Tried to roll up an invalid subject unique '{0}'".format(
                    subject_unique))

        self._catalog_post(action)

        if self.recovery_state < 0:
            self.recovery_stack[1] = None

    # ----------------------------- Courses ------------------------------------- #

    def open_course(self, course_unique):
        """Opens a course page, including the disambiguation step if offered.

        Raises:
            Exception: If the parser returns no action for the course.
        """
        logging.debug(
            u"Opening course with unique '{0}'".format(course_unique))

        action = self.parser1.course_action(course_unique)
        if not action:
            raise Exception(
                u"Tried to open a course with an invalid unique '{0}'".format(
                    course_unique))

        self._catalog_post(action)

        #attempt to go one level deeper to deal with courses which have multiple 'careers'
        secondary_action = self.parser1.disambiguation_action()

        if secondary_action:
            # Ordinary navigation, logged at debug like every other step
            logging.debug(u"POSTING: {0}".format(secondary_action))
            self._catalog_post(secondary_action)

        # unsure if this still works
        if self.recovery_state < 0:
            self.recovery_stack[2] = course_unique

    def return_from_course(self):
        """Navigates back from course to subject; clears course and term slots"""
        logging.debug("Returning from a course")
        #hacky, attempt to return from the disambiguation page first
        self._catalog_post('DERIVED_SAA_CRS_RETURN_PB')
        self._catalog_post('DERIVED_SSS_SEL_RETURN_PB')

        self.recovery_stack[3] = None
        self.recovery_stack[2] = None

    # -----------------------------Sections ------------------------------------- #

    def show_sections(self):
        """Clicks on the 'View class sections' button on the course page if it exists"""
        action = self.parser1.show_sections_action()

        if action:
            logging.debug("Pressing the 'View class sections' button")
            self._catalog_post(action)

    def switch_to_term(self, term_unique):
        """Shows the sections for the term and records it in the recovery stack"""
        logging.debug(
            u"Switching to term with unique '{0}'".format(term_unique))
        value = self.parser1.term_value(term_unique)

        self._catalog_post(action='DERIVED_SAA_CRS_SSR_PB_GO$98$',
                           extras={'DERIVED_SAA_CRS_TERM_ALT': value})

        if self.recovery_state < 0:
            self.recovery_stack[3] = term_unique

    def view_all_sections(self):
        """Presses the "view all sections" link on the course page if needed"""
        action = self.parser1.view_all_action()

        if action:
            logging.debug("Pressing the 'View all' button for sections")
            self._catalog_post(action)

    def visit_section_page(self, section_unique):
        """
        Opens the dedicated page for the provided section unique.
        Used for deep scrapes

        Raises:
            Exception: If the parser returns no action for the section.
        """
        logging.debug(
            u"Visiting section page for section with unique '{0}'".format(
                section_unique))

        action = self.parser1.section_action(section_unique)
        if not action:
            raise Exception(
                u"Tried to open a section with an invalid unique '{0}'".format(
                    section_unique))

        self._catalog_post(action)

        if self.recovery_state < 0:
            self.recovery_stack[4] = section_unique

    def return_from_section(self):
        """
        Navigates back from section to course.
        Used for deep scrapes
        """
        logging.debug("Returning from section page")
        self._catalog_post('CLASS_SRCH_WRK2_SSR_PB_CLOSE')
        self.recovery_stack[4] = None

    # -----------------------------General Purpose------------------------------------- #

    def _get(self, url, **kwargs):
        """Issues a GET through the session with retries; stores the response"""
        self.latest_response = self._request_with_retries(
            self.session.get, url, **kwargs)
        self._update_attrs()

    def _post(self, url, **kwargs):
        """Issues a POST through the session with retries; stores the response"""
        self.latest_response = self._request_with_retries(
            self.session.post, url, **kwargs)
        self._update_attrs()

    def _request_with_retries(self, method, *args, **kwargs):
        """Calls ``method(*args, **kwargs)`` and retries on ``ConnectionError``.

        A failed call is repeated up to ``MAX_RETRIES`` more times with a
        pause of ``RETRY_SLEEP_SECONDS`` in between; when the retries are
        exhausted the last ``ConnectionError`` propagates.

        Returns:
            The value returned by the first successful call of ``method``.
        """
        result = None
        attempts = 0
        while attempts <= MAX_RETRIES:
            attempts += 1
            try:
                result = method(*args, **kwargs)
                break
            except ConnectionError:
                if attempts <= MAX_RETRIES:
                    logging.warning(
                        "ConnectionError, attempt {0} of {1}".format(
                            attempts, MAX_RETRIES))
                    sleep(RETRY_SLEEP_SECONDS)
                else:
                    logging.critical(
                        "ConnectionError, reached maxium number of retries.")
                    raise
        return result

    def _update_attrs(self):
        """Stores the latest response text and marks the parser as outdated"""
        self.latest_text = self.latest_response.text

        # The parser1 requires an update
        self._update_parser1 = True

    def _catalog_post(self, action, extras=None):
        """Submits a post request to the course catalog.

        Args:
            action: Value sent as the ``ICAction`` form field.
            extras: Optional dict with further form fields. The dict is
                copied before ``ICAction`` is added, so it is left untouched.
        """
        # Copy first: the caller's dict must not gain an ICAction entry
        form_data = dict(extras) if extras is not None else {}
        form_data['ICAction'] = action
        self._post(self.course_catalog_url, data=form_data)

        #import random
        # TODO: Improve this, could easily give false positives
        if "Data Integrity Error" in self.latest_text:
            self._recover(action, extras)

        # TESTING - Fake a DIE using random number generator
        #elif action != "" and random.random() < 0.1:
        #    self._get(self.course_catalog_url)
        #    self._recover(action, extras)

    def _recover(self, action, extras):
        """Attempts to recover the scraper state after encountering an error.

        Replays the navigation steps stored in the recovery stack and then
        re-sends the request that failed.

        Args:
            action: ``ICAction`` value of the failed request.
            extras: Extra form fields of the failed request.
        """

        # Don't recurse, retry: with the state back at 0 the recovery loop
        # that is already in progress restarts from the first level
        if self.recovery_state >= 0:
            logging.warning("Error while recovering, retrying")
            self.recovery_state = 0
            return

        # Number of non-null elements in the recovery stack
        # (assumes slots are populated front to back without gaps -- TODO confirm)
        num_states = len(self.recovery_stack) - self.recovery_stack.count(None)

        # Start recovery process
        logging.warning(
            "Encounted SOLUS Data Integrety Error, attempting to recover")
        self.recovery_state = 0

        while self.recovery_state < num_states:

            # Has to be done before the recovery operations
            self.recovery_state += 1

            # State numbers are OBO due to previous increment
            if self.recovery_state == 1:
                self.select_alphanum(self.recovery_stack[0])
            elif self.recovery_state == 2:
                self.dropdown_subject(self.recovery_stack[1])
            elif self.recovery_state == 3:
                self.open_course(self.recovery_stack[2])
                self.show_sections()
            elif self.recovery_state == 4:
                self.switch_to_term(self.recovery_stack[3])
                self.view_all_sections()
            elif self.recovery_state == 5:
                self.visit_section_page(self.recovery_stack[4])

        # Finished recovering
        self.recovery_state = -1
        logging.warning("Recovered, retrying original request")

        self._catalog_post(action, extras)