def parse_page(self):
    """Fetch this report page and return its valid service-message texts, sorted.

    Builds the page URL from ``self.report_base`` + ``self.pagelink``, pulls the
    first ``internal-box2-inner`` div's first ``<p>``, and keeps only text
    fragments that ``ServiceMessage.is_valid_message`` accepts.

    Returns:
        list[str]: sorted ``pre_processed_text`` values for each valid message.

    Raises:
        Exception: on any non-200 HTTP response (url, status code, body).
    """
    url = self.report_base + self.pagelink
    response = requests.get(url)
    # Guard clause: fail fast on anything other than a 200 response.
    if response.status_code != 200:
        raise Exception(url, response.status_code, response.text)
    soup = BeautifulSoup(response.text, 'html.parser')
    # First matching content div holds the report body on these pages.
    content = soup.findAll('div', {'class': 'internal-box2-inner'})[0]
    owner = self.make_parent()
    texts = [
        ServiceMessage(text=fragment, parent=owner).pre_processed_text
        for fragment in content.findAll('p')[0].findAll(text=True)
        if ServiceMessage.is_valid_message(fragment)
    ]
    return sorted(texts)
def process_message(raw_status_report):
    """Parse one raw status report line into its full structured data.

    Args:
        raw_status_report: raw message text as scraped from a report page.

    Returns:
        Whatever ``ServiceMessage.extract_full_data`` produces for the line
        (a mapping of parsed fields, per its implementation elsewhere).
    """
    return ServiceMessage(raw_status_report).extract_full_data()
def test_line_color_extract(message_text):
    """Return the (color reference, pre-processed text) pair for a message.

    Args:
        message_text: raw service-message text.

    Returns:
        tuple: ``(extract_color_reference() result, pre_processed_text)``.
    """
    parsed = ServiceMessage(message_text)
    color = parsed.extract_color_reference()
    cleaned = parsed.pre_processed_text
    return color, cleaned
        # NOTE(review): this fragment begins mid-function — the enclosing
        # `def gather_dated_links():` and the `if response.status_code == 200:`
        # branch this line belongs to are outside the visible chunk; the
        # reconstructed indentation below assumes that shape — TODO confirm.
        in_soup = BeautifulSoup(response.text, "html.parser")
    else:
        # Non-200 archive fetch: surface the URL constant, status, and body.
        raise Exception(METRO_ARCHIVES, response.status_code, response.text)
    ret_vals = []
    links = in_soup.findAll('a')
    for html_link in links:
        # Keep only anchors whose href contains the report-link signature.
        if 'href' in html_link.attrs and html_link.attrs['href'].find(REPORT_LINK_SIGNATURE) > -1:
            ret_vals.append(DatedLink(pagelink=html_link.attrs['href'], datetext=html_link.text, report_base=REPORT_BASE))
    return ret_vals


if __name__ == "__main__":
    # Scrape every dated report page and dump all parsed messages to out.json.
    out_data = []
    report_page_links = gather_dated_links()
    for link in report_page_links:
        print(link)
        for item in link.parse_page():
            cline = ServiceMessage(item, parent=link.make_parent())
            full_data = cline.extract_full_data()
            # isoformat(): make the datetime JSON-serializable.
            full_data['event_dtg'] = full_data['event_dtg'].isoformat()
            out_data.append(full_data)
    # NOTE(review): plain open/close — a `with` block would guarantee the
    # handle closes on error; left as-is here (comment-only pass).
    out_file = open('out.json', 'w')
    out_file.write(json.dumps(out_data))
    out_file.close()