示例#1
0
    def scrape_senate_plenary():
        """Scrape the Eerste Kamer (Dutch Senate) plenary meeting planner.

        Walks the plenary-meetings index, downloads every linked meeting
        detail page, strips the page down to the content following the
        <h1>, and upserts one assembly record per meeting.

        Relies on names from the enclosing scope: retrieve_if_not_exists,
        bypass_cache, error_on_retrieval, assembly_urls,
        date_string_to_datetime and update_or_create_assembly. URLs that
        fail to download are appended to error_on_retrieval.
        """
        index_url = "http://www.eerstekamer.nl/planning_plenaire_vergaderingen"

        try:
            document = lxml.html.parse(retrieve_if_not_exists(index_url, bypass_cache=bypass_cache)).getroot()
            document.make_links_absolute(index_url)
        except urllib2.HTTPError:
            error_on_retrieval.append(index_url)
            return

        for element in document.xpath("//a[contains(@href, '/plenaire_vergadering/')]"):
            date_string = element.text.strip()
            date, start_date, end_date = date_string_to_datetime(date_string)

            assembly_detail_url = element.get("href")

            try:
                document = lxml.html.parse(retrieve_if_not_exists(assembly_detail_url, bypass_cache=bypass_cache)).getroot()
                document.make_links_absolute(assembly_detail_url)
            except urllib2.HTTPError:
                error_on_retrieval.append(assembly_detail_url)
                continue

            # Remove the footer and various other irrelevant elements.
            # Snapshot the siblings with list() first: detaching nodes while
            # a live itersiblings() generator walks them can skip elements.
            for irrelevant in list(document.cssselect("#footer_menu")[0].getprevious().itersiblings()):
                irrelevant.getparent().remove(irrelevant)
            details_raw = "".join(lxml.etree.tostring(node) for node in document.cssselect("h1")[0].itersiblings())

            # Add to database (timestamps are seconds since the epoch, local time)
            update_or_create_assembly({
                "type": "plenary",
                "url": assembly_detail_url,
                "date": int(time.mktime(date.timetuple())),
                "start_time": int(time.mktime(start_date.timetuple())) if start_date else None,
                "end_time": int(time.mktime(end_date.timetuple())) if end_date else None,
                "parlisnumber": None,
                "house": "senate",
                "status": None,
                "is_public": None,
                "location": None,
                "variety": None,
                "committee": None,
                "summary": None,
                "details_raw": details_raw,
            })
            assembly_urls.append(assembly_detail_url)
示例#2
0
 def scrape_senate_committee():
     """Scrape committee meetings of the Eerste Kamer (Dutch Senate).

     Walks the committees index, follows each committee page to its
     activities listing, downloads every linked meeting detail page,
     strips it down to the content following the <h1>, and upserts one
     assembly record per meeting (tagged with the committee's page URL).

     Relies on names from the enclosing scope: retrieve_if_not_exists,
     bypass_cache, error_on_retrieval, assembly_urls,
     date_string_to_datetime and update_or_create_assembly. URLs that
     fail to download are appended to error_on_retrieval.
     """
     committees_index_url = "http://www.eerstekamer.nl/commissies"

     try:
         document = lxml.html.parse(retrieve_if_not_exists(committees_index_url, bypass_cache=bypass_cache)).getroot()
         document.make_links_absolute(committees_index_url)
     except urllib2.HTTPError:
         error_on_retrieval.append(committees_index_url)
         return

     for element in document.xpath(".//a[contains(@href, '/commissies/')]"):
         # Retrieve the individual page for each committee
         committee_page_url = element.get("href")

         try:
             document = lxml.html.parse(retrieve_if_not_exists(committee_page_url, bypass_cache=bypass_cache)).getroot()
             document.make_links_absolute(committee_page_url)
         except urllib2.HTTPError:
             error_on_retrieval.append(committee_page_url)
             continue

         # Find the link pointing to the events listing for this committee
         committee_activities_url = document.xpath("//a[contains(@href, '/planning_activiteiten_commissie')]/@href")[0]

         # Scrape the events listing; guard this download like the other
         # fetches so one failing committee does not abort the whole run.
         try:
             document = lxml.html.parse(retrieve_if_not_exists(committee_activities_url, bypass_cache=bypass_cache)).getroot()
             document.make_links_absolute(committee_activities_url)
         except urllib2.HTTPError:
             error_on_retrieval.append(committee_activities_url)
             continue

         for meeting_element in document.xpath("//a[contains(@href, '/commissievergadering/')]"):
             date_string = meeting_element.text.strip()
             date, start_date, end_date = date_string_to_datetime(date_string)

             # Retrieve the details for this meeting
             assembly_detail_url = meeting_element.get("href")

             try:
                 document = lxml.html.parse(retrieve_if_not_exists(assembly_detail_url, bypass_cache=bypass_cache)).getroot()
                 document.make_links_absolute(assembly_detail_url)
             except urllib2.HTTPError:
                 error_on_retrieval.append(assembly_detail_url)
                 continue

             # Clean up the details (remove the footer etc.). Snapshot the
             # siblings with list() first: detaching nodes while a live
             # itersiblings() generator walks them can skip elements.
             for irrelevant in list(document.cssselect("#footer_menu")[0].getprevious().itersiblings()):
                 irrelevant.getparent().remove(irrelevant)
             details_raw = "".join(lxml.etree.tostring(node) for node in document.cssselect("h1")[0].itersiblings())

             # Store the entry in the database (epoch timestamps, local time)
             update_or_create_assembly({
                 "url": assembly_detail_url,
                 "type": "committee",
                 "date": int(time.mktime(date.timetuple())),
                 "start_time": int(time.mktime(start_date.timetuple())) if start_date else None,
                 "end_time": int(time.mktime(end_date.timetuple())) if end_date else None,
                 "parlisnumber": None,
                 "house": "senate",
                 "status": None,
                 "is_public": None,
                 "location": None,
                 "variety": None,
                 "committee": committee_page_url,
                 "summary": None,
                 "details_raw": details_raw,
             })
             assembly_urls.append(assembly_detail_url)
示例#3
0
    def __init__(
            self,
            cohort,
            initial_state_df,
            transitions_df,
            total_steps,

            initial_state_column='state_id',
            initial_state_distribution_column='distribution',
            initial_state_count_column='count',
            initial_state_time_step_column='time_step',

            fit_data=False,
            cohort_column='cohort',
            old_state_id_column='old_state_id',
            new_state_id_column='new_state_id',
            transition_function_column='transition_function',
            args_column='args',
            xdata_column=None,
            ydata_column='transition_probability',
            ydata_sigma_column='transition_sigma',
            args_initial_guess_column='args_initial_guess',
            args_bounds_column='args_bounds',
            allow_fit_column='allow_fit',

            markov_transition_function_column='markov_transition_function_column',
            time_step_interval='month',
            date_column='date'
    ):
        """Build a single-cohort Markov chain and advance it total_steps steps.

        Validates that both input dataframes contain exactly one cohort and
        share the same state space, builds the transition-matrix, state-space
        and state-vector objects, then computes the state after `total_steps`
        steps via self.state_after_n_steps, logging the full history.

        Parameters
        ----------
        cohort : cohort identifier; also fed to
            helpers.date_string_to_datetime to derive the initial date —
            presumably a date-like string, TODO confirm against callers.
        initial_state_df : pandas DataFrame with the initial state
            distribution; must hold the columns named by the
            initial_state_*_column and cohort_column arguments.
        transitions_df : pandas DataFrame of state transitions; must hold
            the cohort/old-state/new-state and transition-function columns.
        total_steps : number of time steps to advance the chain.
        fit_data, *_column arguments : configuration stored on the instance;
            several (e.g. args_column, ydata_column) are not read inside
            this method and are presumably consumed by other methods.
        time_step_interval : unit of one step (default 'month'), passed to
            helpers.add_interval_to_date.

        Raises
        ------
        ValueError : if either dataframe contains more than one unique
            cohort, if the two dataframes' state spaces differ, or if the
            initial time step is not unique within the cohort.
        """
        self.cohort = cohort
        self.initial_state_df = initial_state_df
        self.transitions_df = transitions_df
        self.total_steps = total_steps

        self.initial_state_column = initial_state_column
        self.initial_state_distribution_column = initial_state_distribution_column
        self.initial_state_count_column = initial_state_count_column
        self.initial_state_time_step_column = initial_state_time_step_column

        self.fit_data = fit_data
        self.cohort_column = cohort_column
        self.old_state_id_column = old_state_id_column
        self.new_state_id_column = new_state_id_column
        self.transition_function_column = transition_function_column
        self.args_column = args_column
        self.xdata_column = xdata_column
        self.ydata_column = ydata_column
        self.ydata_sigma_column = ydata_sigma_column
        self.args_initial_guess_column = args_initial_guess_column
        self.args_bounds_column = args_bounds_column
        self.allow_fit_column = allow_fit_column

        self.markov_transition_function_column = markov_transition_function_column
        self.time_step_interval = time_step_interval
        self.date_column = date_column

        # check to see if we have more than one cohort, if so raise an error
        if len(self.initial_state_df[self.cohort_column].unique()) != 1 or \
                len(self.transitions_df[self.cohort_column].unique()) != 1:
            raise ValueError('MarkovChain object passed dataframe with more than one unique cohort')

        # now let's check to make sure the states spaces are the same for each of the inputs
        self.state_id_list = self.initial_state_df[self.initial_state_column].unique()
        if set(self.state_id_list) != set(self.transitions_df[self.old_state_id_column].unique()) or \
                set(self.state_id_list) != set(self.transitions_df[self.new_state_id_column].unique()):
            raise ValueError(
                'unique states in initial_state_df and transitions_df for cohort={} not equal to each other'
                .format(self.cohort)
            )

        # now we create the MarkovStateSpace object
        self.markov_state_space = MarkovStateSpace(state_id_list=self.state_id_list)

        # then we create the transition matrix df by cohort (index = (cohort, old_state_id), columns = new_state_id)
        self.transition_matrix_df = pd.pivot_table(
            self.transitions_df,
            values=self.markov_transition_function_column,
            index=[self.cohort_column, self.old_state_id_column],
            columns=[self.new_state_id_column],
            aggfunc=lambda x: x,  # we don't actually want to aggregate anything, we're just exploiting the pivot table
        )

        # then we create the MarkovTransitionMatrix
        self.markov_transition_matrix = MarkovTransitionMatrix(transition_matrix_df=self.transition_matrix_df)

        # now we figure out the size of the initial state (i.e. the total number of things in all the states)
        self.cohort_size = self.initial_state_df[self.initial_state_count_column].sum()

        # now we grab the time step
        self.time_step_set = set(self.initial_state_df[self.initial_state_time_step_column].values)
        if len(self.time_step_set) != 1:
            raise ValueError(
                'the initial time step has multiple values for the same cohort, cohort={}'.format(self.cohort)
            )
        # NOTE: pop() empties time_step_set; only initial_state_time_step is used afterwards
        self.initial_state_time_step = self.time_step_set.pop()

        # and the initial date
        self.initial_state_date = helpers.add_interval_to_date(
            date_object=helpers.date_string_to_datetime(self.cohort),
            steps=self.initial_state_time_step,
            interval=self.time_step_interval,
        )

        # and then we create the state_distribution_df (index=(cohort, state_id), column=distribution)
        self.state_distribution_df = self.initial_state_df.rename(
            columns={self.initial_state_column: self.old_state_id_column}  # rename the state_id column to old_state_id
        )

        self.state_distribution_df[self.date_column] = self.initial_state_date

        self.state_distribution_df = self.state_distribution_df.set_index(
            # set the index to cohort, date, time_step, old_state_id
            [self.cohort_column, self.date_column, self.initial_state_time_step_column, self.old_state_id_column]
        )[[self.initial_state_distribution_column, self.initial_state_count_column]]

        # grab the matrix at the initial time step
        self.initial_markov_transition_matrix = self.markov_transition_matrix.matrix_at_time_step(
            self.initial_state_time_step
        )

        # and create the state_transition_df for the initial time_step
        self.state_transition_df = helpers.join_vector_to_df_on_index_and_multiply_across_rows(
            self.state_distribution_df, self.initial_markov_transition_matrix, self.initial_state_distribution_column
        )

        # finally, let's create the MarkovStateVector object
        self.markov_state_vector = MarkovStateVector(
            cohort=self.cohort,
            state_space=self.markov_state_space,
            state_distribution_df=self.state_distribution_df,
            cohort_column=self.cohort_column,
            old_state_id_column=self.old_state_id_column,
            time_step_column=self.initial_state_time_step_column,
            date_column=self.date_column,
            distribution_column=self.initial_state_distribution_column,
            count_column=self.initial_state_count_column,
            markov_transition_matrix_df=self.initial_markov_transition_matrix,
            state_transition_df=self.state_transition_df,
            cohort_size=self.cohort_size,
            time_step=self.initial_state_time_step,
            time_step_interval=self.time_step_interval
        )

        self.history = [self.markov_state_vector]  # initialize the history with the initial state

        # then we have everything we need to calculate the current state and log the history along the way
        self.current_state = self.state_after_n_steps(
            self.markov_state_vector, self.total_steps, log_history=True
        )

        self.state_distribution_history_df = self.state_distribution_history()
        self.state_transition_history_df = self.state_transition_history()