示例#1
0
def parse_element_to_json(element):
    """Convert a parsed HTML element into a small JSON-serializable dict.

    The rules are tried in order and the first one that applies wins:

    1. ``element.img`` is truthy:
       ``{"type": "image", "content": <value of the img's 'src'>}``.
    2. ``element.ul`` is truthy:
       ``{"type": "links", "content": [<href>, ...]}`` with one entry per
       ``a`` returned by ``element.ul.find_all('a')``. An entry is whatever
       ``link.get('href')`` returns for that link.
    3. The whitespace-stripped text of the element is non-empty:
       ``{"type": "text", "content": <stripped text>}``.

    Args:
        element: Object exposing ``img``, ``ul`` and ``get_text()``
            (presumably a BeautifulSoup ``Tag`` -- confirm against callers).

    Returns:
        The dict described above, or ``None`` when none of the rules apply
        (no image, no list, and only whitespace text).
    """
    if element.img:
        return {"type": "image", "content": element.img.get('src')}

    if element.ul:
        hrefs = [link.get('href') for link in element.ul.find_all('a')]
        return {"type": "links", "content": hrefs}

    # Extract the text once; it serves as both the emptiness test and payload.
    text = element.get_text().strip()
    if text:
        return {"type": "text", "content": text}

    return None
示例#2
0
    def parse_news_item(self, item_element: element.Tag, base_url: str) -> NewsItem:
        """Build a ``NewsItem`` from the markup of a single news entry.

        The title and URL come from the ``a`` inside the entry's first
        ``h3``. The date comes from the element with class ``date``, with
        the ``'Posted on: '`` prefix removed before parsing. The summary is
        the text of everything that follows the ``h3`` among its siblings.

        Note:
            ``item_element`` is modified in place: every ``.category``
            element (or its enclosing ``a``, if any) and the element with
            class ``more`` are extracted from the tree so that their text
            does not end up in the summary.

        Args:
            item_element: Element wrapping one news entry.
            base_url: URL against which the article's href is resolved.

        Returns:
            A ``NewsItem`` whose ``id`` and ``url`` are both the resolved
            article URL.

        Raises:
            FormatError: If the title link, its href, the title text, or
                the date element is missing.
        """
        title_element = item_element.find('h3')
        title_link = title_element.find('a') if title_element is not None else None
        if title_link is None:
            raise FormatError('No title link found for article')

        # Use .get() so a missing href reaches the FormatError below instead
        # of raising KeyError.
        url = title_link.get('href')
        if not url:
            raise FormatError('No URL found for article')
        url = urljoin(base_url, url)

        title = title_link.get_text(strip=True)
        if not title:
            raise FormatError('No title content found')

        date_element = item_element.find(class_='date')
        if date_element is None:
            raise FormatError('No date found for article')
        date_string = date_element.get_text().strip().replace('Posted on: ', '')
        date = parse_datetime(date_string)

        # Remove category links so we can more easily pull out summary text.
        # NOTE(review): category text is not included in the returned
        # NewsItem -- confirm whether it should be.
        for category_element in item_element.select('.category'):
            category_link = category_element.find_parent('a') or category_element
            category_link.extract()

        more_link = item_element.find(class_='more')
        if more_link:
            more_link.extract()

        # TODO: can/should we preserve HTML here?
        # Siblings can be bare strings or tags; reduce both to plain text.
        pieces = (sibling if isinstance(sibling, str) else sibling.get_text(strip=True)
                  for sibling in title_element.next_siblings)
        summary = ' '.join(SUMMARY_PREFIX_PATTERN.sub('', piece)
                           for piece in pieces).strip()

        return NewsItem(id=url, url=url, title=title, date_published=date,
                        summary=summary)
 def is_news_heading(self, element: element.Tag) -> bool:
     """Tell whether ``element`` is a news heading.

     The element qualifies when its tag name matches ``HEADING_PATTERN``
     and its text matches ``MONTH_HEADING_PATTERN``; both patterns are
     applied with ``match``, i.e. from the start of the string.
     """
     if HEADING_PATTERN.match(element.name) is None:
         return False
     return MONTH_HEADING_PATTERN.match(element.get_text()) is not None
示例#4
0
 def is_news_heading(self, element: element.Tag) -> bool:
     """Tell whether ``element`` is a news heading.

     The element qualifies when it is an ``h3`` tag whose text matches
     ``MONTH_HEADING_PATTERN`` from the start of the string.
     """
     if element.name != 'h3':
         return False
     return MONTH_HEADING_PATTERN.match(element.get_text()) is not None