示例#1
0
    def __getEntries(self, url: str):
        """
        Fetch the page at ``url`` and build the list of senses (entries).

        After the call ``self.entrylist`` holds a dummy element at index 0
        followed by one element per sense, each carrying an absolute ``href``.

        Depending on ``self.__setup['no']``:

        * if its first item is non-zero, the paragraph of that sense is
          fetched through ``self.__getParagraph`` and returned;
        * if it is 0, a numbered, newline-separated list of all senses,
          prefixed by the keyword, is returned.

        Returns an empty string when the request times out.
        """
        try:
            ret = rq.get(url,
                         headers=self.__header,
                         timeout=self.__setup['timeout'])
        except rq.exceptions.Timeout:
            # No response object exists when the request times out, so there
            # is no HTTP status code to report; only the URL is logged.
            stderr.write('超时错误:' + url + '\n')
            return ''
        doc = html.fromstring(ret.text)

        # We are now on the page of the first sense.
        # presumably this populates self.title / self.subtitle -- verify
        # against __getTitles.
        self.__getTitles(doc)

        # Collect the sense list.
        self.entrylist = doc.xpath(
            "//ul[@class='polysemantList-wrapper cmn-clearfix']//li/*")
        # An empty list means the keyword has a single sense: synthesize one
        # entry whose text is the title and subtitle.
        if not self.entrylist:
            single = html.HtmlElement()
            single.text = self.title + '\n' + self.subtitle
            self.entrylist = [single]
        # Prepend a dummy entry 0 so that index i refers to the i-th sense.
        self.entrylist = [html.HtmlElement()] + self.entrylist

        # Turn the site-relative links into absolute URLs.
        for entry in self.entrylist:
            if 'href' in entry.attrib:
                entry.attrib['href'] = ("https://baike.baidu.com" +
                                        entry.attrib['href'])
            else:
                # An entry without href is the current sense (this also
                # covers the dummy entry); point it at the requested url.
                entry.attrib['href'] = url

        # Normalize ``no``: a bare integer becomes [no, [0]].
        if isinstance(self.__setup['no'], int):
            self.__setup['no'] = [self.__setup['no'], [0]]

        # A non-zero no[0] selects one sense; fetch and return its content.
        if self.__setup['no'][0] != 0:
            # presumably __regularize maps no[0] onto a valid index of
            # entrylist -- verify against its definition.
            return self.__getParagraph(self.entrylist[self.__regularize(
                self.__setup['no'][0], len(self.entrylist))].attrib['href'])

        # no[0] == 0 asks for the list of senses (the dummy entry is skipped).
        self.title = self.__setup['keyword']
        entries = ''.join(
            str(i) + ':' + entry.text + '\n'
            for i, entry in enumerate(self.entrylist[1:], start=1))

        # Done: join the title and the numbered list.
        return self.title + '\n' + entries
示例#2
0
def _create_element(element, text=None):
    """Return a detached lxml HTML element (no document tree, no parent).

    ``element`` is used as the tag name. ``text`` becomes the element's text
    only when it is truthy; otherwise the text is left unset.
    """
    node = html.HtmlElement()
    node.tag = element
    if not text:
        return node
    node.text = text
    return node
示例#3
0
    def __getEntries(self, url: str):
        """
        Fetch the page at ``url`` and build the list of senses (entries).

        After the call ``self.entrylist`` holds one element per sense, each
        carrying an absolute ``href``.

        Depending on ``self.__setup['no']``:

        * if it is non-zero, it is clamped to the size of the list (the
          clamped value is written back to ``self.__setup['no']``) and the
          description of entry ``no - 1`` is fetched through
          ``self.__getDescription`` and returned;
        * if it is 0, a numbered, newline-separated list of all senses,
          prefixed by the keyword, is returned.

        Returns an empty string when the request times out.
        """
        try:
            ret = rq.get(url,
                         headers=self.__header,
                         timeout=self.__setup['timeout'])
        except rq.exceptions.Timeout:
            # No response object exists when the request times out, so there
            # is no HTTP status code to report; only the URL is logged.
            stderr.write('超时错误:' + url + '\n')
            return ''
        doc = html.fromstring(ret.text)

        # Collect the sense list.
        self.entrylist = doc.xpath(
            "//ul[@class='polysemantList-wrapper cmn-clearfix']//li/*")
        # An empty list means the keyword has a single sense: synthesize one
        # entry whose text is the title.
        if not self.entrylist:
            single = html.HtmlElement()
            single.text = self.title
            self.entrylist = [single]

        # Turn the site-relative links into absolute URLs.
        for entry in self.entrylist:
            if 'href' in entry.attrib:
                entry.attrib['href'] = ("https://baike.baidu.com" +
                                        entry.attrib['href'])
            else:
                # An entry without href is the current sense; point it at
                # the requested url.
                entry.attrib['href'] = url

        # Handle ``no``.
        if self.__setup['no'] != 0:
            count = len(self.entrylist)
            # Clamp so that ``no - 1`` is always a valid index.
            if self.__setup['no'] > count:
                self.__setup['no'] = count
            elif self.__setup['no'] < -count + 1:
                self.__setup['no'] = -count + 1
            # NOTE(review): for negative values the index is ``no - 1``, so
            # no == -1 selects the second-to-last entry -- confirm this is
            # the intended mapping.
            return self.__getDescription(
                self.entrylist[self.__setup['no'] - 1].attrib['href'])

        # no == 0 asks for the list of senses.
        self.title = self.__setup['keyword']
        entries = ''.join(
            str(i) + ':' + entry.text + '\n'
            for i, entry in enumerate(self.entrylist, start=1))

        # Done: join the title and the numbered list.
        return self.title + '\n' + entries
示例#4
0
def preprocess_media_tags(element):
    """Normalize list whitespace and rewrite media embeds in place.

    Only ``html.HtmlElement`` instances are processed; anything else is
    ignored. The function mutates ``element`` (and possibly its surrounding
    tree) and returns ``None``.

    * ``<ol>`` / ``<ul>``: leading text is cleared.
    * ``<li>``: tail text is cleared.
    * ``<iframe>``: a YouTube, Vimeo or Telegram source is rewritten to the
      matching ``/embed/...`` URL and the iframe is passed to
      ``_wrap_figure`` unless it already has a ``<figure>`` ancestor. Any
      other iframe, including one with no ``src``, has its tag dropped.
    * ``<blockquote class="twitter-tweet">``: the first link matching
      ``twitter_re`` is turned into an iframe placed before the blockquote
      and passed to ``_wrap_figure``; the blockquote is then removed.
    """
    if not isinstance(element, html.HtmlElement):
        return

    if element.tag in ['ol', 'ul']:
        # ignore any spaces between <ul> and <li>
        element.text = ''
    elif element.tag == 'li':
        # ignore spaces after </li>
        element.tail = ''
    elif element.tag == 'iframe':
        iframe_src = element.get('src')
        if not iframe_src:
            # Without a src there is nothing to match against; treat it like
            # any other unsupported iframe instead of passing None to
            # re.match (which raises TypeError).
            element.drop_tag()
            return

        youtube = youtube_re.match(iframe_src)
        vimeo = vimeo_re.match(iframe_src)
        telegram = telegram_embed_iframe_re.match(iframe_src)
        if not (youtube or vimeo or telegram):
            element.drop_tag()
            return

        element.text = ''  # ignore any legacy text
        if youtube:
            yt_id = urlparse(iframe_src).path.replace('/embed/', '')
            element.set(
                'src', '/embed/youtube?url=' +
                quote_plus('https://www.youtube.com/watch?v=' + yt_id))
        elif vimeo:
            element.set(
                'src', '/embed/vimeo?url=' +
                quote_plus('https://vimeo.com/' + vimeo.group(2)))
        else:
            element.set(
                'src', '/embed/telegram?url=' + quote_plus(iframe_src))
        if not element.xpath('./ancestor::figure'):
            _wrap_figure(element)

    elif element.tag == 'blockquote' and element.get(
            'class') == 'twitter-tweet':
        # The XPath only yields anchors that actually carry an href.
        for tw_link in element.xpath('.//a[@href]'):
            if twitter_re.match(tw_link.get('href')):
                twitter_frame = html.HtmlElement()
                twitter_frame.tag = 'iframe'
                twitter_frame.set(
                    'src', '/embed/twitter?url=' +
                    quote_plus(tw_link.get('href')))
                element.addprevious(twitter_frame)
                _wrap_figure(twitter_frame)
                element.drop_tree()
                # Only the first matching link is used.
                break
示例#5
0
文件: news.py 项目: zhuoyr/webclipper
    def __init__(self, url: str, domain_url: str, date: datetime.datetime):
        """Build a news item for ``url`` and load its section, element and metadata.

        ``domain_url`` is used together with ``url`` to locate the section
        URL; ``date`` is stored as given.
        """
        # Plain attributes, initialised to empty placeholders.
        self.url = url
        self.title = ''
        self.author = ''
        self.date = date
        self.dir_html = ''
        self.section = Section()
        self.element = html.HtmlElement()

        # Resolve the section this article belongs to and load it.
        self.__load_section(self.__find_section_url(url, domain_url))

        # Replace the placeholder element with the one obtained for this URL.
        self.element = self.section.domain.obtain_element(self.url)

        # Fill in the article metadata.
        self.__load_title()
        self.__load_author()
示例#6
0
def preprocess_media_tags(element):
    """Normalize an element and wrap supported media embeds in ``<figure>``.

    Only ``html.HtmlElement`` instances are processed; anything else is
    returned unchanged.

    * ``<figcaption>``: the tags of all direct children are dropped.
    * ``<ol>`` / ``<ul>``: leading text is cleared.
    * ``<li>``: tail text is cleared.
    * ``<iframe>`` with a ``src``: a YouTube or Vimeo source is rewritten to
      the matching ``/embed/...`` URL and the iframe is wrapped with
      ``_wrap_tag(..., 'figure')``.
    * ``<blockquote class="twitter-tweet">``: each link matching
      ``twitter_re`` yields a new iframe wrapped in a figure; the last
      match is the one returned.

    Returns the (possibly replaced) element.
    """
    if isinstance(element, html.HtmlElement):
        if element.tag == 'figcaption':
            # figcaption may have only text content
            for child in element.findall('*'):
                child.drop_tag()
        elif element.tag in ['ol', 'ul']:
            # ignore any spaces between <ul> and <li>
            element.text = ''
        elif element.tag == 'li':
            # ignore spaces after </li>
            element.tail = ''
        elif element.tag == 'iframe' and element.get('src'):
            iframe_src = element.get('src')
            youtube = youtube_re.match(iframe_src)
            vimeo = vimeo_re.match(iframe_src)
            if youtube or vimeo:
                if youtube:
                    yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                    element.set(
                        'src', '/embed/youtube?url=' +
                        quote_plus('https://www.youtube.com/watch?v=' + yt_id))
                elif vimeo:
                    element.set(
                        'src', '/embed/vimeo?url=' +
                        quote_plus('https://vimeo.com/' + vimeo.group(2)))

                element = _wrap_tag(element, 'figure')
        elif element.tag == 'blockquote' and element.get(
                'class') == 'twitter-tweet':
            twitter_links = element.cssselect('a')
            for tw_link in twitter_links:
                href = tw_link.get('href')
                # Anchors without an href would pass None to re.match and
                # raise TypeError; skip them.
                if href is not None and twitter_re.match(href):
                    twitter_frame = html.HtmlElement()
                    twitter_frame.tag = 'iframe'
                    twitter_frame.set(
                        'src', '/embed/twitter?url=' + quote_plus(href))
                    element = _wrap_tag(twitter_frame, 'figure')

    return element
示例#7
0
def convert_html_to_telegraph_format(html_string, clean_html=True):
    """Convert an HTML string into a JSON string of converted nodes.

    Args:
        html_string: The HTML markup to convert.
        clean_html: When true, the markup is first passed through
            ``clean_article_html``.

    Returns:
        ``json.dumps`` of the list produced by running ``_recursive_convert``
        on every top-level fragment, serialized with ``ensure_ascii=False``.

    Fragments whose tag is not in ``allowed_top_level_tags`` are wrapped in
    a ``<p>``. For allowed fragments, non-blank tail text (text following
    the closing tag) is emitted as its own ``<p>``.
    """
    if clean_html:
        html_string = clean_article_html(html_string)

    fragments = preprocess_fragments(html.fragments_fromstring(html_string))
    content = []

    for fragment in fragments:
        if fragment.tag not in allowed_top_level_tags:
            paragraph = html.HtmlElement()
            paragraph.tag = 'p'
            paragraph.append(fragment)
            content.append(_recursive_convert(paragraph))
            continue

        content.append(_recursive_convert(fragment))

        # convert and append text nodes after closing tag
        tail = fragment.tail
        if tail and tail.strip():
            # Build the paragraph directly instead of re-parsing
            # '<p>%s</p>' % tail: the tail is already-decoded text, and
            # re-parsing would interpret any '<' or '&' in it as markup.
            tail_paragraph = html.HtmlElement()
            tail_paragraph.tag = 'p'
            tail_paragraph.text = tail
            content.append(_recursive_convert(tail_paragraph))

    return json.dumps(content, ensure_ascii=False)
示例#8
0
def _wrap_tag(element, wrapper):
    """Return a new detached element tagged ``wrapper`` containing ``element``.

    ``element`` is appended as the only child of the new element.
    """
    container = html.HtmlElement()
    container.tag = wrapper
    container.append(element)
    return container