def __getEntries(self, url: str):
    """Fetch the polyseme (entry) list for a Baidu Baike page and dispatch.

    Depending on ``self.__setup['no']``: a non-zero entry number fetches that
    entry's paragraph text; zero returns a numbered listing of all entries.

    Args:
        url: absolute URL of the first entry's page.

    Returns:
        str: the requested entry content, the entry listing, or '' on timeout.
    """
    try:
        ret = rq.get(url,
                     headers=self.__header,
                     timeout=self.__setup['timeout'])
    except rq.exceptions.Timeout:
        # BUG FIX: `ret` is unbound when the request times out, so the old
        # message that read `ret.status_code` raised NameError. There is no
        # HTTP status on a timeout; report the URL only.
        stderr.write('超时错误:' + url + '\n')
        return ''
    doc = html.fromstring(ret.text)
    # We are now on the first entry's page.
    self.__getTitles(doc)
    # Collect the list of entries (polysemes).
    self.entrylist = doc.xpath(
        "//ul[@class='polysemantList-wrapper cmn-clearfix']//li/*")
    # An empty list means the word has a single meaning; synthesize one
    # element carrying the title so the code below has something to show.
    if self.entrylist == []:
        self.entrylist = [html.HtmlElement()]
        self.entrylist[0].text = self.title + '\n' + self.subtitle
    # Prepend a dummy element 0 so that index i refers to entry number i.
    self.entrylist = [html.HtmlElement()] + self.entrylist
    # Make every entry carry an absolute URL.
    for i in range(len(self.entrylist)):
        # BUG FIX: dict.has_key() was removed in Python 3; use the `in`
        # operator on the attrib mapping instead.
        if 'href' in self.entrylist[i].attrib:
            self.entrylist[i].attrib[
                'href'] = "https://baike.baidu.com" + self.entrylist[
                    i].attrib['href']
        else:
            # The entry without an href is the current one; give it this URL.
            self.entrylist[i].attrib['href'] = url
    # Normalize `no`: a bare int becomes the list form [no, [0]].
    if isinstance(self.__setup['no'], int):
        self.__setup['no'] = [self.__setup['no'], [0]]
    if self.__setup['no'][0] != 0:
        # Fetch the content of entry number no[0] (index clamped/wrapped by
        # __regularize).
        return self.__getParagraph(self.entrylist[self.__regularize(
            self.__setup['no'][0], len(self.entrylist))].attrib['href'])
    elif self.__setup['no'][0] == 0:
        # no[0] == 0 means "show the entry list".
        entries = ''
        self.title = self.__setup['keyword']
        for i in range(1, len(self.entrylist)):
            entries += str(i) + ':' + self.entrylist[i].text + '\n'
        # Done: join title and listing.
        return self.title + '\n' + entries
def _create_element(element, text=None):
    """Create a free-standing lxml element outside any document tree.

    Args:
        element: tag name for the new element.
        text: optional text content; only set when truthy.

    Returns:
        The newly created ``HtmlElement`` (no body, no parents).
    """
    created = html.HtmlElement()
    created.tag = element
    if text:
        created.text = text
    return created
def __getEntries(self, url: str):
    """Fetch the polyseme (entry) list for a Baidu Baike page and dispatch.

    ``self.__setup['no']`` selects behavior: a non-zero (1-based, possibly
    negative) entry number fetches that entry's description; zero returns a
    numbered listing of all entries.

    Args:
        url: absolute URL of the entry page.

    Returns:
        str: the requested description, the entry listing, or '' on timeout.
    """
    try:
        ret = rq.get(url,
                     headers=self.__header,
                     timeout=self.__setup['timeout'])
    except rq.exceptions.Timeout:
        # BUG FIX: `ret` is unbound when the request times out, so the old
        # message that read `ret.status_code` raised NameError. There is no
        # HTTP status on a timeout; report the URL only.
        stderr.write('超时错误:' + url + '\n')
        return ''
    doc = html.fromstring(ret.text)
    # Collect the list of entries (polysemes).
    self.entrylist = doc.xpath(
        "//ul[@class='polysemantList-wrapper cmn-clearfix']//li/*")
    # An empty list means the word has a single meaning; synthesize one
    # element carrying the title.
    if self.entrylist == []:
        self.entrylist = [html.HtmlElement()]
        self.entrylist[0].text = self.title
    # Make every entry carry an absolute URL.
    for i in range(len(self.entrylist)):
        # BUG FIX: dict.has_key() was removed in Python 3; use the `in`
        # operator on the attrib mapping instead.
        if 'href' in self.entrylist[i].attrib:
            self.entrylist[i].attrib[
                'href'] = "https://baike.baidu.com" + self.entrylist[
                    i].attrib['href']
        else:
            # The entry without an href is the current one; give it this URL.
            self.entrylist[i].attrib['href'] = url
    if self.__setup['no'] != 0:
        # Clamp `no` into the valid 1-based (or negative) index range so the
        # lookup below can never go out of bounds.
        if self.__setup['no'] > len(self.entrylist):
            self.__setup['no'] = len(self.entrylist)
        elif self.__setup['no'] < -len(self.entrylist) + 1:
            self.__setup['no'] = -len(self.entrylist) + 1
        # After clamping, entrylist[no - 1] is guaranteed to exist.
        return self.__getDescription(
            self.entrylist[self.__setup['no'] - 1].attrib['href'])
    elif self.__setup['no'] == 0:
        # no == 0 means "show the entry list".
        entries = ''
        self.title = self.__setup['keyword']
        for i in range(len(self.entrylist)):
            entries += str(i + 1) + ':' + self.entrylist[i].text + '\n'
        # Done: join title and listing.
        text = self.title + '\n' + entries
        return text
def preprocess_media_tags(element):
    """Normalize media-related HTML elements in place for Telegraph export.

    Strips layout whitespace from lists, rewrites supported ``<iframe>``
    embeds (YouTube/Vimeo/Telegram) to local ``/embed/...`` proxy URLs and
    wraps them in ``<figure>``, and replaces Twitter blockquotes with an
    embed iframe. Unsupported iframes are dropped (tag removed, children
    kept); Twitter blockquotes are removed entirely once replaced.

    Args:
        element: an ``lxml.html.HtmlElement``; non-elements are ignored.
    """
    if isinstance(element, html.HtmlElement):
        if element.tag in ['ol', 'ul']:
            # ignore any spaces between <ul> and <li>
            element.text = ''
        elif element.tag == 'li':
            # ignore spaces after </li>
            element.tail = ''
        elif element.tag == 'iframe':
            # BUG FIX: an <iframe> without a src attribute made get('src')
            # return None, and matching the regexes against None raised
            # TypeError. Fall back to '' so no regex matches and the
            # unsupported-iframe branch (drop_tag) handles it.
            iframe_src = element.get('src') or ''
            youtube = youtube_re.match(iframe_src)
            vimeo = vimeo_re.match(iframe_src)
            telegram = telegram_embed_iframe_re.match(iframe_src)
            if youtube or vimeo or telegram:
                element.text = ''  # ignore any legacy text
                if youtube:
                    yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                    element.set(
                        'src', '/embed/youtube?url=' +
                        quote_plus('https://www.youtube.com/watch?v=' +
                                   yt_id))
                elif vimeo:
                    element.set(
                        'src', '/embed/vimeo?url=' +
                        quote_plus('https://vimeo.com/' + vimeo.group(2)))
                elif telegram:
                    element.set(
                        'src', '/embed/telegram?url=' +
                        quote_plus(iframe_src))
                # Embeds must live inside a <figure>; wrap if not already.
                if not len(element.xpath('./ancestor::figure')):
                    _wrap_figure(element)
            else:
                # Unsupported iframe: remove the tag but keep its children.
                element.drop_tag()
        elif element.tag == 'blockquote' and element.get(
                'class') == 'twitter-tweet':
            # Only anchors that actually have an href are candidates.
            twitter_links = element.xpath('.//a[@href]')
            for tw_link in twitter_links:
                if twitter_re.match(tw_link.get('href')):
                    twitter_frame = html.HtmlElement()
                    twitter_frame.tag = 'iframe'
                    twitter_frame.set(
                        'src', '/embed/twitter?url=' +
                        quote_plus(tw_link.get('href')))
                    # Insert the embed before the quote, wrap it, then drop
                    # the original blockquote entirely.
                    element.addprevious(twitter_frame)
                    _wrap_figure(twitter_frame)
                    element.drop_tree()
                    break
def __init__(self, url: str, domain_url: str, date: datetime.datetime):
    """Initialize a news article bound to *url*.

    Resolves the article's section from *domain_url*, fetches the page
    element through the section's domain, then extracts title and author.

    Args:
        url: full URL of the article.
        domain_url: base URL of the news domain, used to locate the section.
        date: publication date to record on the article.
    """
    # Plain attribute defaults; title/author/dir_html start empty.
    self.url = url
    self.title = ''
    self.author = ''
    self.date = date
    self.dir_html = ''
    self.section = Section()
    self.element = html.HtmlElement()

    # Resolve and load the section this article belongs to.
    self.__load_section(self.__find_section_url(url, domain_url))

    # Fetch the article's root element via the section's domain.
    self.element = self.section.domain.obtain_element(self.url)

    # Extract article metadata from the loaded element.
    self.__load_title()
    self.__load_author()
def preprocess_media_tags(element):
    """Normalize a media-related HTML element for Telegraph export.

    Flattens ``<figcaption>`` to plain text, strips layout whitespace from
    lists, rewrites supported ``<iframe>`` embeds (YouTube/Vimeo) to local
    ``/embed/...`` proxy URLs wrapped in ``<figure>``, and replaces Twitter
    blockquotes with an embed iframe wrapped in ``<figure>``.

    Args:
        element: an ``lxml.html.HtmlElement``; non-elements pass through.

    Returns:
        The (possibly replaced/wrapped) element.
    """
    if isinstance(element, html.HtmlElement):
        if element.tag == 'figcaption':
            # figcaption may have only text content
            [e.drop_tag() for e in element.findall('*')]
        elif element.tag in ['ol', 'ul']:
            # ignore any spaces between <ul> and <li>
            element.text = ''
        elif element.tag == 'li':
            # ignore spaces after </li>
            element.tail = ''
        elif element.tag == 'iframe' and element.get('src'):
            iframe_src = element.get('src')
            youtube = youtube_re.match(iframe_src)
            vimeo = vimeo_re.match(iframe_src)
            if youtube or vimeo:
                if youtube:
                    yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                    element.set(
                        'src', '/embed/youtube?url=' +
                        quote_plus('https://www.youtube.com/watch?v=' +
                                   yt_id))
                elif vimeo:
                    element.set(
                        'src', '/embed/vimeo?url=' +
                        quote_plus('https://vimeo.com/' + vimeo.group(2)))
                element = _wrap_tag(element, 'figure')
        elif element.tag == 'blockquote' and element.get(
                'class') == 'twitter-tweet':
            # BUG FIX: cssselect('a') also returns anchors without an href,
            # and twitter_re.match(None) raised TypeError. Select only
            # anchors that have an href (same as the sibling implementation),
            # which also avoids the optional cssselect dependency.
            twitter_links = element.xpath('.//a[@href]')
            for tw_link in twitter_links:
                if twitter_re.match(tw_link.get('href')):
                    twitter_frame = html.HtmlElement()
                    twitter_frame.tag = 'iframe'
                    twitter_frame.set(
                        'src', '/embed/twitter?url=' +
                        quote_plus(tw_link.get('href')))
                    element = _wrap_tag(twitter_frame, 'figure')

    return element
def convert_html_to_telegraph_format(html_string, clean_html=True):
    """Convert an HTML string into Telegraph's JSON node format.

    Optionally cleans the markup first, then parses it into fragments,
    wraps any fragment whose tag is not allowed at the top level in a
    ``<p>``, converts each fragment recursively, and also converts any
    trailing text that follows a fragment's closing tag.

    Args:
        html_string: raw HTML to convert.
        clean_html: when True, run the article cleaner before parsing.

    Returns:
        str: a JSON array of Telegraph content nodes.
    """
    if clean_html:
        html_string = clean_article_html(html_string)

    nodes = []
    for piece in preprocess_fragments(
            html.fragments_fromstring(html_string)):
        if piece.tag in allowed_top_level_tags:
            nodes.append(_recursive_convert(piece))
        else:
            # Non-top-level fragments must be hosted inside a paragraph.
            wrapper = html.HtmlElement()
            wrapper.tag = 'p'
            wrapper.append(piece)
            nodes.append(_recursive_convert(wrapper))

        # Text after the closing tag becomes its own paragraph node.
        tail = piece.tail
        if tail and len(tail.strip()) != 0:
            nodes.append(_recursive_convert(html.fromstring('<p>%s</p>' % tail)))

    return json.dumps(nodes, ensure_ascii=False)
def _wrap_tag(element, wrapper):
    """Move *element* inside a freshly created *wrapper* element.

    Args:
        element: the element to be wrapped (reparented into the wrapper).
        wrapper: tag name of the enclosing element to create.

    Returns:
        The new wrapper ``HtmlElement`` containing *element*.
    """
    enclosing = html.HtmlElement()
    enclosing.tag = wrapper
    enclosing.append(element)
    return enclosing