예제 #1
0
 def parse_page(self):
     """Download this link's report page and return its message texts, sorted.

     The page URL is ``self.report_base + self.pagelink``. Text nodes are taken
     from the first ``<p>`` inside the first ``<div class="internal-box2-inner">``;
     each node accepted by ``ServiceMessage.is_valid_message`` is wrapped in a
     ``ServiceMessage`` and its ``pre_processed_text`` is collected.

     Returns:
         A sorted list of the collected ``pre_processed_text`` values.

     Raises:
         Exception: if the HTTP status is not 200; the arguments are the URL,
             the status code and the response body.
         IndexError: presumably, if the expected ``<div>`` or ``<p>`` is
             missing (both lookups index element 0 unconditionally).
     """
     page_url = self.report_base + self.pagelink
     response = requests.get(page_url)
     if response.status_code != 200:
         raise Exception(page_url, response.status_code, response.text)

     document = BeautifulSoup(response.text, 'html.parser')
     content_box = document.findAll('div', {'class': 'internal-box2-inner'})[0]
     # One parent object is shared by every message found on this page.
     parent = self.make_parent()
     first_paragraph = content_box.findAll('p')[0]

     messages = [
         ServiceMessage(text=fragment, parent=parent).pre_processed_text
         for fragment in first_paragraph.findAll(text=True)
         if ServiceMessage.is_valid_message(fragment)
     ]
     messages.sort()
     return messages
예제 #2
0
 def process_message(raw_status_report):
     """Wrap *raw_status_report* in a ``ServiceMessage`` and return its full data.

     Returns whatever ``ServiceMessage.extract_full_data()`` produces for the
     given raw report.
     """
     return ServiceMessage(raw_status_report).extract_full_data()
def test_line_color_extract(message_text):
    """Return the colour reference and pre-processed text for *message_text*.

    Builds a ``ServiceMessage`` from the text and returns a 2-tuple of
    ``(extract_color_reference() result, pre_processed_text)``.
    """
    service_message = ServiceMessage(message_text)
    # Run the extraction before reading the attribute, as the original did.
    color_reference = service_message.extract_color_reference()
    cleaned_text = service_message.pre_processed_text
    return color_reference, cleaned_text
예제 #4
0
        in_soup = BeautifulSoup(response.text, "html.parser")
    else:
        raise Exception(METRO_ARCHIVES, response.status_code, response.text)

    ret_vals = []

    links = in_soup.findAll('a')

    for html_link in links:
        if 'href' in html_link.attrs and html_link.attrs['href'].find(REPORT_LINK_SIGNATURE) > -1:
            ret_vals.append(DatedLink(pagelink=html_link.attrs['href'],
                                      datetext=html_link.text,
                                      report_base=REPORT_BASE))
    return ret_vals


if __name__ == "__main__":
    # Script entry point: walk every dated report link, extract the full data
    # of each message on its page, and dump everything to out.json.
    out_data = []
    report_page_links = gather_dated_links()

    for link in report_page_links:
        print(link)
        for item in link.parse_page():
            cline = ServiceMessage(item, parent=link.make_parent())
            full_data = cline.extract_full_data()
            # Store event_dtg as its isoformat() string; presumably it is a
            # date/datetime, which json cannot serialise directly.
            full_data['event_dtg'] = full_data['event_dtg'].isoformat()
            out_data.append(full_data)

    # Serialise BEFORE opening the file: opening with 'w' truncates it, so a
    # serialisation error must not be able to wipe an existing out.json.
    serialized = json.dumps(out_data)
    # The context manager closes the handle even if the write fails.
    with open('out.json', 'w') as out_file:
        out_file.write(serialized)