def _parse_search_item(self, html1: _Element, html2: _Element,
                       html3: _Element,
                       metadata: dict) -> Optional[ChinaCdcItem]:
    """Build a ``ChinaCdcItem`` from the three elements of one search hit.

    Args:
        html1: Element supplying the title text and the ``href`` attribute
            used as the item URL.
        html2: Element whose text becomes the abstract.
        html3: Element whose ``.text`` contains the publish date string.
        metadata: Crawl metadata; its ``'keyword'`` entry (``''`` when
            absent) is copied onto the item.

    Returns:
        The populated item, or ``None`` when the publish date cannot be
        extracted or parsed.

    Raises:
        KeyError: If ``html1`` carries no ``href`` attribute.
    """
    title = get_element_str(html1)
    url = html1.attrib['href']
    abstract = get_element_str(html2)

    try:
        publish_str = html3.text.strip()
        matches = self.publish_time_pattern.findall(publish_str)
        if not matches:
            # No date found in the text: treat the hit as unparseable.
            return None
        # Groups at index 1..3 of the first match are passed positionally
        # as year, month and day.
        first = matches[0]
        publish = datetime.datetime(int(first[1]), int(first[2]),
                                    int(first[3]))
    except Exception:
        # Best-effort parsing: any failure (missing text, malformed date)
        # drops the item. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

    item = ChinaCdcItem()
    item.title = title
    item.url = url
    item.keyword = metadata.get('keyword', '')
    item.abstract = abstract
    item.publish = publish
    return item
def _parse_search_item(self, html: _Element,
                       metadata: dict) -> Optional[ChinaNewsItem]:
    """Build a ``ChinaNewsItem`` from one search-result element.

    Args:
        html: Element for a single search hit. The title link is looked up
            under an ``li`` whose class contains ``news_title``, the
            abstract under ``li[@class="news_content"]`` and the publish
            time under ``li[@class="news_other"]``.
        metadata: Crawl metadata; its ``'keyword'`` entry (``''`` when
            absent) is copied onto the item.

    Returns:
        The populated item, or ``None`` when the publish time cannot be
        located or parsed.

    Raises:
        IndexError: If the title link or the abstract node is missing.
        KeyError: If the title link carries no ``href`` attribute.
    """
    title_link = html.xpath('.//li[contains(@class, "news_title")]/a')[0]
    title = utility.get_element_str(title_link)
    url = title_link.attrib['href']
    abstract = utility.get_element_str(
        html.xpath('.//li[@class="news_content"]')[0])

    try:
        other_node = html.xpath('.//li[@class="news_other"]')[0]
        publish_str = other_node.text.strip()
        # Only the segment after the last tab is parsed as the timestamp.
        publish_str = publish_str.split('\t')[-1]
        publish = datetime.datetime.strptime(publish_str,
                                             '%Y-%m-%d %H:%M:%S')
    except Exception:
        # Best-effort parsing: a missing node, empty text or malformed
        # timestamp drops the item. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

    item = ChinaNewsItem()
    item.title = title
    item.url = url
    item.keyword = metadata.get('keyword', '')
    item.abstract = abstract
    item.publish = publish
    return item
def _parse_search_item(self, html: _Element,
                       metadata: dict) -> Optional[CnrItem]:
    """Build a ``CnrItem`` from one search-result element.

    Args:
        html: Element for a single search hit. The title link is read from
            ``div[1]/a``, the optional abstract from ``div[2]`` and the
            publish time from the tail text of
            ``div/span[@class="searchresulturl"]``.
        metadata: Crawl metadata; its ``'keyword'`` entry (``''`` when
            absent) is copied onto the item.

    Returns:
        The populated item, or ``None`` when the publish time cannot be
        located or parsed.

    Raises:
        IndexError: If the title link is missing.
        KeyError: If the title link carries no ``href`` attribute.
    """
    title_link = html.xpath('div[1]/a')[0]
    title = utility.get_element_str(title_link)
    url = title_link.attrib['href']

    # The abstract block is optional; fall back to an empty string.
    abstract_nodes = html.xpath('div[2]')
    abstract = ''
    if abstract_nodes:
        abstract = utility.get_element_str(abstract_nodes[0])

    try:
        url_span = html.xpath('div/span[@class="searchresulturl"]')[0]
        # The timestamp is the text that follows the span, i.e. its tail.
        publish_str = url_span.tail.strip()
        publish = datetime.datetime.strptime(publish_str,
                                             '%Y.%m.%d %H:%M:%S')
    except Exception:
        # Best-effort parsing: a missing span, empty tail or malformed
        # timestamp drops the item. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

    item = CnrItem()
    item.title = title
    item.url = url
    item.abstract = abstract
    item.keyword = metadata.get('keyword', '')
    item.publish = publish
    return item
def _parse_search_item(self, html: _Element,
                       metadata: dict) -> Optional[GovItem]:
    """Build a ``GovItem`` from one search-result element.

    Args:
        html: Element for a single search hit. The title link is read from
            ``h3/a``, the optional abstract from ``p[@class="res-sub"]``
            and the publish date from the first ``span`` under
            ``p[@class="res-other"]``.
        metadata: Crawl metadata; its ``'keyword'`` entry (``''`` when
            absent) is copied onto the item.

    Returns:
        The populated item, or ``None`` when the publish date cannot be
        located or parsed.

    Raises:
        Exception: If the element contains no ``h3/a`` title link.
        KeyError: If the title link carries no ``href`` attribute.
    """
    links = html.xpath('h3/a')
    if not links:
        raise Exception('Failed to parse item')
    link = links[0]
    title = get_element_str(link)
    url = link.attrib['href']

    # The abstract paragraph is optional; fall back to an empty string.
    abstract = ''
    abstract_elements = html.xpath('p[@class="res-sub"]')
    if abstract_elements:
        abstract = get_element_str(abstract_elements[0])

    try:
        date_span = html.xpath('.//p[@class="res-other"]/span')[0]
        publish_str = date_span.text.strip()
        matches = self.publish_time_pattern.findall(publish_str)
        if not matches:
            # No date found in the text: treat the hit as unparseable.
            return None
        # Groups at index 1..3 of the first match are passed positionally
        # as year, month and day.
        first = matches[0]
        publish = datetime.datetime(int(first[1]), int(first[2]),
                                    int(first[3]))
    except Exception:
        # Best-effort parsing: a missing span, empty text or malformed
        # date drops the item. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

    item = GovItem()
    item.title = title
    item.url = url
    item.keyword = metadata.get('keyword', '')
    item.abstract = abstract
    item.publish = publish
    return item