import time
import posixpath
import urllib2
import urlparse

import lxml.etree
import lxml.html


def scrape_senate_plenary():
    index_url = "http://www.eerstekamer.nl/planning_plenaire_vergaderingen"
    try:
        document = lxml.html.parse(retrieve_if_not_exists(index_url, bypass_cache=bypass_cache)).getroot()
        document.make_links_absolute(index_url)
    except urllib2.HTTPError:
        error_on_retrieval.append(index_url)
        return

    for element in document.xpath("//a[contains(@href, '/plenaire_vergadering/')]"):
        date_string = element.text.strip()
        date, start_date, end_date = date_string_to_datetime(date_string)

        # Retrieve the detail page for this sitting
        assembly_detail_url = element.get("href")
        try:
            detail_document = lxml.html.parse(retrieve_if_not_exists(assembly_detail_url, bypass_cache=bypass_cache)).getroot()
            detail_document.make_links_absolute(assembly_detail_url)
        except urllib2.HTTPError:
            error_on_retrieval.append(assembly_detail_url)
            continue

        # Remove the footer and various other irrelevant elements
        for sibling in detail_document.cssselect("#footer_menu")[0].getprevious().itersiblings():
            sibling.getparent().remove(sibling)
        details_raw = "".join(lxml.etree.tostring(el) for el in detail_document.cssselect("h1")[0].itersiblings())

        # Add to database
        update_or_create_assembly({
            "type": "plenary",
            "url": assembly_detail_url,
            "date": int(time.mktime(date.timetuple())),
            "start_time": int(time.mktime(start_date.timetuple())) if start_date else None,
            "end_time": int(time.mktime(end_date.timetuple())) if end_date else None,
            "parlisnumber": None,
            "house": "senate",
            "status": None,
            "is_public": None,
            "location": None,
            "variety": None,
            "committee": None,
            "summary": None,
            "details_raw": details_raw,
        })
        assembly_urls.append(assembly_detail_url)
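
# date_string_to_datetime() is defined elsewhere in this module; the scrapers
# only rely on its contract: given a link's text (a Dutch date, optionally
# followed by start and end times), it returns a (date, start, end) triple in
# which start and end may be None. The sketch below shows one way such a parser
# could look. The input format "dinsdag 14 maart 2017 13.30 - 17.00 uur" and
# the helper's name are illustrative assumptions, not the site's actual format
# or the real implementation.
import datetime
import re

DUTCH_MONTHS = {
    "januari": 1, "februari": 2, "maart": 3, "april": 4, "mei": 5, "juni": 6,
    "juli": 7, "augustus": 8, "september": 9, "oktober": 10, "november": 11, "december": 12,
}


def example_date_string_to_datetime(date_string):
    # Hypothetical stand-in for the module's real date_string_to_datetime()
    day, month_name, year = re.search(r"(\d{1,2}) (\w+) (\d{4})", date_string).groups()
    date = datetime.datetime(int(year), DUTCH_MONTHS[month_name], int(day))
    # Times, when present, are assumed to use the Dutch "13.30" notation
    times = re.findall(r"(\d{1,2})\.(\d{2})", date_string)
    start_date = date.replace(hour=int(times[0][0]), minute=int(times[0][1])) if len(times) >= 1 else None
    end_date = date.replace(hour=int(times[1][0]), minute=int(times[1][1])) if len(times) >= 2 else None
    return date, start_date, end_date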
def scrape_senate_committee():
    committees_index_url = "http://www.eerstekamer.nl/commissies"
    try:
        document = lxml.html.parse(retrieve_if_not_exists(committees_index_url, bypass_cache=bypass_cache)).getroot()
        document.make_links_absolute(committees_index_url)
    except urllib2.HTTPError:
        error_on_retrieval.append(committees_index_url)
        return

    for element in document.xpath(".//a[contains(@href, '/commissies/')]"):
        committee_name = element.text

        # Retrieve the individual page for each committee
        committee_page_url = element.get("href")
        try:
            committee_document = lxml.html.parse(retrieve_if_not_exists(committee_page_url, bypass_cache=bypass_cache)).getroot()
            committee_document.make_links_absolute(committee_page_url)
        except urllib2.HTTPError:
            error_on_retrieval.append(committee_page_url)
            continue
        committee_code = posixpath.basename(urlparse.urlparse(committee_page_url).path)

        # Find the link pointing to the events listing for this committee
        committee_activities_url = committee_document.xpath("//a[contains(@href, '/planning_activiteiten_commissie')]/@href")[0]
        committee_key = urlparse.parse_qs(urlparse.urlparse(committee_activities_url).query)["key"][0]

        # Scrape the events listing
        try:
            activities_document = lxml.html.parse(retrieve_if_not_exists(committee_activities_url, bypass_cache=bypass_cache)).getroot()
            activities_document.make_links_absolute(committee_activities_url)
        except urllib2.HTTPError:
            error_on_retrieval.append(committee_activities_url)
            continue

        for meeting_link in activities_document.xpath("//a[contains(@href, '/commissievergadering/')]"):
            date_string = meeting_link.text.strip()
            date, start_date, end_date = date_string_to_datetime(date_string)

            # Retrieve the details for this meeting
            assembly_detail_url = meeting_link.get("href")
            try:
                detail_document = lxml.html.parse(retrieve_if_not_exists(assembly_detail_url, bypass_cache=bypass_cache)).getroot()
                detail_document.make_links_absolute(assembly_detail_url)
            except urllib2.HTTPError:
                error_on_retrieval.append(assembly_detail_url)
                continue

            # Clean up the details (remove the footer etc.) and grab the raw HTML
            for sibling in detail_document.cssselect("#footer_menu")[0].getprevious().itersiblings():
                sibling.getparent().remove(sibling)
            details_raw = "".join(lxml.etree.tostring(el) for el in detail_document.cssselect("h1")[0].itersiblings())

            # Store the entry in the database
            update_or_create_assembly({
                "url": assembly_detail_url,
                "type": "committee",
                "date": int(time.mktime(date.timetuple())),
                "start_time": int(time.mktime(start_date.timetuple())) if start_date else None,
                "end_time": int(time.mktime(end_date.timetuple())) if end_date else None,
                "parlisnumber": None,
                "house": "senate",
                "status": None,
                "is_public": None,
                "location": None,
                "variety": None,
                "committee": committee_page_url,
                "summary": None,
                "details_raw": details_raw,
            })
            assembly_urls.append(assembly_detail_url)
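
# A minimal usage sketch, assuming the scraper module is run as a script.
# bypass_cache, error_on_retrieval and assembly_urls are module-level globals
# that the two functions above read and append to; initialising them here
# mirrors that usage, but the real module may wire them up differently.
if __name__ == "__main__":
    bypass_cache = False
    error_on_retrieval = []
    assembly_urls = []
    scrape_senate_plenary()
    scrape_senate_committee()
    print "scraped %d assemblies, %d retrieval errors" % (len(assembly_urls), len(error_on_retrieval))
    for url in error_on_retrieval:
        print "  failed: " + url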
def __init__(
        self,
        cohort,
        initial_state_df,
        transitions_df,
        total_steps,
        initial_state_column='state_id',
        initial_state_distribution_column='distribution',
        initial_state_count_column='count',
        initial_state_time_step_column='time_step',
        fit_data=False,
        cohort_column='cohort',
        old_state_id_column='old_state_id',
        new_state_id_column='new_state_id',
        transition_function_column='transition_function',
        args_column='args',
        xdata_column=None,
        ydata_column='transition_probability',
        ydata_sigma_column='transition_sigma',
        args_initial_guess_column='args_initial_guess',
        args_bounds_column='args_bounds',
        allow_fit_column='allow_fit',
        markov_transition_function_column='markov_transition_function_column',
        time_step_interval='month',
        date_column='date'
):
    self.cohort = cohort
    self.initial_state_df = initial_state_df
    self.transitions_df = transitions_df
    self.total_steps = total_steps
    self.initial_state_column = initial_state_column
    self.initial_state_distribution_column = initial_state_distribution_column
    self.initial_state_count_column = initial_state_count_column
    self.initial_state_time_step_column = initial_state_time_step_column
    self.fit_data = fit_data
    self.cohort_column = cohort_column
    self.old_state_id_column = old_state_id_column
    self.new_state_id_column = new_state_id_column
    self.transition_function_column = transition_function_column
    self.args_column = args_column
    self.xdata_column = xdata_column
    self.ydata_column = ydata_column
    self.ydata_sigma_column = ydata_sigma_column
    self.args_initial_guess_column = args_initial_guess_column
    self.args_bounds_column = args_bounds_column
    self.allow_fit_column = allow_fit_column
    self.markov_transition_function_column = markov_transition_function_column
    self.time_step_interval = time_step_interval
    self.date_column = date_column

    # check that we were handed exactly one cohort; if not, raise an error
    if len(self.initial_state_df[self.cohort_column].unique()) != 1 or \
            len(self.transitions_df[self.cohort_column].unique()) != 1:
        raise ValueError('MarkovChain object passed dataframe with more than one unique cohort')

    # now let's check to make sure the state spaces are the same for each of the inputs
    self.state_id_list = self.initial_state_df[self.initial_state_column].unique()
    if set(self.state_id_list) != set(self.transitions_df[self.old_state_id_column].unique()) or \
            set(self.state_id_list) != set(self.transitions_df[self.new_state_id_column].unique()):
        raise ValueError(
            'unique states in initial_state_df and transitions_df for cohort={} not equal to each other'
            .format(self.cohort)
        )

    # now we create the MarkovStateSpace object
    self.markov_state_space = MarkovStateSpace(state_id_list=self.state_id_list)

    # then we create the transition matrix df by cohort (index=(cohort, old_state_id), columns=new_state_id)
    self.transition_matrix_df = pd.pivot_table(
        self.transitions_df,
        values=self.markov_transition_function_column,
        index=[self.cohort_column, self.old_state_id_column],
        columns=[self.new_state_id_column],
        aggfunc=lambda x: x,  # we don't actually want to aggregate anything, we're just exploiting the pivot table
    )

    # then we create the MarkovTransitionMatrix
    self.markov_transition_matrix = MarkovTransitionMatrix(transition_matrix_df=self.transition_matrix_df)

    # now we figure out the size of the initial state (i.e. the total number of things in all the states)
    self.cohort_size = self.initial_state_df[self.initial_state_count_column].sum()

    # now we grab the time step, making sure the cohort has exactly one
    self.time_step_set = set(self.initial_state_df[self.initial_state_time_step_column].values)
    if len(self.time_step_set) != 1:
        raise ValueError(
            'the initial time step has multiple values for the same cohort, cohort={}'.format(self.cohort)
        )
    self.initial_state_time_step = self.time_step_set.pop()

    # and the initial date
    self.initial_state_date = helpers.add_interval_to_date(
        date_object=helpers.date_string_to_datetime(self.cohort),
        steps=self.initial_state_time_step,
        interval=self.time_step_interval,
    )

    # and then we create the state_distribution_df
    # (index=(cohort, date, time_step, old_state_id), columns=[distribution, count])
    self.state_distribution_df = self.initial_state_df.rename(
        columns={self.initial_state_column: self.old_state_id_column}  # rename the state_id column to old_state_id
    )
    self.state_distribution_df[self.date_column] = self.initial_state_date
    self.state_distribution_df = self.state_distribution_df.set_index(
        [self.cohort_column, self.date_column, self.initial_state_time_step_column, self.old_state_id_column]
    )[[self.initial_state_distribution_column, self.initial_state_count_column]]

    # grab the matrix at the initial time step
    self.initial_markov_transition_matrix = self.markov_transition_matrix.matrix_at_time_step(
        self.initial_state_time_step
    )

    # and create the state_transition_df for the initial time_step
    self.state_transition_df = helpers.join_vector_to_df_on_index_and_multiply_across_rows(
        self.state_distribution_df,
        self.initial_markov_transition_matrix,
        self.initial_state_distribution_column
    )

    # finally, let's create the MarkovStateVector object
    self.markov_state_vector = MarkovStateVector(
        cohort=self.cohort,
        state_space=self.markov_state_space,
        state_distribution_df=self.state_distribution_df,
        cohort_column=self.cohort_column,
        old_state_id_column=self.old_state_id_column,
        time_step_column=self.initial_state_time_step_column,
        date_column=self.date_column,
        distribution_column=self.initial_state_distribution_column,
        count_column=self.initial_state_count_column,
        markov_transition_matrix_df=self.initial_markov_transition_matrix,
        state_transition_df=self.state_transition_df,
        cohort_size=self.cohort_size,
        time_step=self.initial_state_time_step,
        time_step_interval=self.time_step_interval
    )
    self.history = [self.markov_state_vector]  # initialize the history with the initial state

    # then we have everything we need to calculate the current state and log the history along the way
    self.current_state = self.state_after_n_steps(
        self.markov_state_vector, self.total_steps, log_history=True
    )
    self.state_distribution_history_df = self.state_distribution_history()
    self.state_transition_history_df = self.state_transition_history()
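
# A minimal construction sketch, assuming the enclosing class is named
# MarkovChain (as its error message suggests), that the cohort label doubles as
# a parseable date string (__init__ passes self.cohort through
# helpers.date_string_to_datetime), and that the
# 'markov_transition_function_column' column holds whatever per-time-step
# transition objects MarkovTransitionMatrix expects. The two-state space and
# the constant transition callables below are toy assumptions for illustration,
# not a tested input.
import pandas as pd

initial_state_df = pd.DataFrame({
    'cohort': ['2017-01-01', '2017-01-01'],
    'state_id': ['active', 'churned'],
    'distribution': [1.0, 0.0],
    'count': [1000, 0],
    'time_step': [0, 0],
})
transitions_df = pd.DataFrame({
    'cohort': ['2017-01-01'] * 4,
    'old_state_id': ['active', 'active', 'churned', 'churned'],
    'new_state_id': ['active', 'churned', 'active', 'churned'],
    # stand-ins for the transition objects the matrix machinery consumes
    'markov_transition_function_column': [lambda t: 0.9, lambda t: 0.1, lambda t: 0.0, lambda t: 1.0],
})

chain = MarkovChain(
    cohort='2017-01-01',
    initial_state_df=initial_state_df,
    transitions_df=transitions_df,
    total_steps=12,
)
print(chain.state_distribution_history_df)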