Example #1
    def on_loaded(self):
        warn = self.document.xpath('//div[@id="message_renouvellement_mot_passe"]')
        if len(warn) > 0:
            raise BrowserIncorrectPassword(warn[0].text)

        # load content of loading divs.
        divs = []
        for div in self.document.xpath('//div[starts-with(@id, "as_")]'):
            loading = div.xpath('.//span[@class="loading"]')
            if len(loading) == 0:
                continue

            input = div.xpath('.//input')[0]
            divs.append([div, input.attrib['name']])

        if len(divs) > 0:
            args = {}
            for i, (div, name) in enumerate(divs):
                args['key%s' % i] = name
                args['div%s' % i] = div.attrib['id']
            args['time'] = 0
            r = self.browser.openurl(self.browser.buildurl('/AsynchAjax', **args))
            data = json.load(r)

            for i, (div, name) in enumerate(divs):
                html = data['data'][i]['flux']
                div.clear()
                div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))
Example #2
    def load_async(self, time):
        total = 0
        restart = True
        while restart:
            restart = False

            # load content of loading divs.
            lst = self.doc.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
            if len(lst) > 0:
                params = {}
                for i, input in enumerate(lst):
                    params['key%s' % i] = input.attrib['name']
                    params['div%s' % i] = input.attrib['value']
                params['time'] = time

                r = self.browser.open('/AsynchAjax', params=params)
                data = json.loads(r.content)

                for i, d in enumerate(data['data']):
                    div = self.doc.xpath('//div[@id="%s"]' % d['key'])[0]
                    html = d['flux']
                    div.clear()
                    div.attrib['id'] = d['key']  # needed because clear() also removes all attributes
                    div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

                if 'time' in data:
                    wait = float(data['time'])/1000.0
                    self.logger.debug('should wait %f more seconds', wait)
                    total += wait
                    if total > 120:
                        raise BrowserUnavailable('too long time to wait')

                    sleep(wait)
                    restart = True
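
From the parsing above, the /AsynchAjax endpoint evidently returns JSON shaped roughly like the sketch below; the field names come from the code, while the values are invented for illustration.

# Assumed response shape, inferred from the parsing code above (values are made up):
assumed_response = {
    "data": [
        {"key": "as_account", "flux": "<div>rendered fragment</div>"},
    ],
    "time": 1500,  # optional: milliseconds to wait before polling again
}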
Example #3
def query_initial_packages(search_term):
    """
    Perform an initial package search on PyPI with the given ``search_term`` and return a list of
    :class:`PypiSearchResult` objects.

    :param str search_term: The initial search query
    :return: The list of search results
    :rtype: list[PypiSearchResult]
    """
    logging.info("Querying initial packages for %s...", search_term)
    result_page = requests.get("https://pypi.python.org/pypi",
                               params={
                                   ":action": "search",
                                   "term": search_term
                               })
    result_tree = etree.fromstring(result_page.content, HTMLParser())
    result_tree.make_links_absolute(result_page.url)
    result_tags = result_tree.xpath("//table[@class='list']/tr[@class][td]")
    results = []
    for lxml_element in result_tags:
        result_obj = PypiJsonSearchResult(
            link="{0}/json".format(lxml_element[0][0].get("href")),
            weight=int(lxml_element[1].text),
            summary=lxml_element[2].text or '')
        if result_obj.is_pip_result(search_term):
            results.append(result_obj)
    return results
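
A minimal driver for the function above; it assumes PypiJsonSearchResult and its is_pip_result() helper are importable from the same module, as in the example.

# Hypothetical usage of query_initial_packages().
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    for result in query_initial_packages("requests"):
        print(result.link, result.weight, result.summary)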
Example #4
    def load_async(self, time):
        total = 0
        restart = True
        while restart:
            restart = False

            # load content of loading divs.
            lst = self.doc.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
            if len(lst) > 0:
                params = {}
                for i, input in enumerate(lst):
                    params['key%s' % i] = input.attrib['name']
                    params['div%s' % i] = input.attrib['value']
                params['time'] = time

                r = self.browser.open('/AsynchAjax', params=params)
                data = json.loads(r.content)

                for i, d in enumerate(data['data']):
                    div = self.doc.xpath('//div[@id="%s"]' % d['key'])[0]
                    html = d['flux']
                    div.clear()
                    div.attrib['id'] = d['key']  # needed because clear() also removes all attributes
                    div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

                if 'time' in data:
                    wait = float(data['time'])/1000.0
                    self.logger.debug('should wait %f more seconds', wait)
                    total += wait
                    if total > 120:
                        raise BrowserUnavailable('too long time to wait')

                    sleep(wait)
                    restart = True
Example #5
    def load_async(self, time):
        # load content of loading divs.
        lst = self.document.xpath(
            '//input[@type="hidden" and starts-with(@id, "asynch")]')
        if len(lst) > 0:
            params = {}
            for i, input in enumerate(lst):
                params['key%s' % i] = input.attrib['name']
                params['div%s' % i] = input.attrib['value']
            params['time'] = time

            r = self.browser.openurl(
                self.browser.buildurl('/AsynchAjax', **params))
            data = json.load(r)

            for i, d in enumerate(data['data']):
                div = self.document.xpath('//div[@id="%s"]' % d['key'])[0]
                html = d['flux']
                div.clear()
                div.attrib['id'] = d['key']  # needed because clear() also removes all attributes
                div.insert(0, etree.fromstring(html,
                                               parser=etree.HTMLParser()))

            if 'time' in data:
                sleep(float(data['time']) / 1000.0)
                return self.load_async(time)
Example #6
 def _string_is_valid_html(self, string):
     try:
         a = ETH.fromstring(string).find('.//*') is not None
         self.logger.debug('_string_is_valid_html: The string is recognized as HTML')
     except ET.XMLSyntaxError as e:
         self.logger.debug('_string_is_valid_html: The string is NOT recognized as HTML')
         a = False
     return a
Example #7
def fetch_winpython_lib_page():
    """
    Fetch the Windows Python compiled libraries page and return the parsed element tree.
    """
    resp = requests.get(WINPYTHON_LIBS_URL, timeout=30)
    tree = etree.fromstring(resp.content, HTMLParser())
    tree.make_links_absolute(resp.url)
    return tree
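
A possible caller for the fetcher above; WINPYTHON_LIBS_URL is whatever index URL the surrounding module defines.

# Hypothetical usage: print every absolute link on the fetched page.
tree = fetch_winpython_lib_page()
for href in tree.xpath("//a/@href"):
    print(href)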
Example #8
 def TO_HTML(cls, r, c, x):
     '''Gets the HTML tree from an HTML string.

     Args:
         x (str): The HTML source.

     Returns:
         lxml.html.HtmlElement: The parsed HTML element.
     '''
     return etree.fromstring(x)
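
One caveat with this helper: without an explicit parser, etree.fromstring() applies strict XML rules, so typical real-world HTML will raise rather than parse. A minimal sketch of the difference, assuming lxml is installed:

from lxml import etree

broken = "<p>unclosed paragraph"

# Strict XML parsing rejects the malformed fragment.
try:
    etree.fromstring(broken)
except etree.XMLSyntaxError:
    print("rejected as XML")

# The forgiving HTML parser recovers and wraps the fragment in <html><body>.
tree = etree.fromstring(broken, parser=etree.HTMLParser())
print(etree.tostring(tree))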
Example #9
    def test_widget_attrs(self):
        widget = self.widget_class('FkModelAutocomplete',
            widget_attrs={'data-widget-foo': 'bar', 'class':'foobar'})
        html = widget.render('somewidget', None)
        et = etree.fromstring(html)

        self.assertEqual(et.attrib['data-widget-foo'], 'bar')
        self.assertIn('foobar', et.attrib['class'])
        self.assertIn('autocomplete-light-widget', et.attrib['class'])
Example #10
    def test_value_out_of_queryset(self):
        widget = self.widget_class('ItemAutocomplete')
        html = widget.render('somewidget', [1, 2])
        span = etree.fromstring(html)

        choices = CSSSelector('[data-value]')(span)

        self.assertEqual(len(choices), 1)
        self.assertEqual(int(choices[0].attrib['data-value']), 1)
Example #11
    def test_value_out_of_queryset(self):
        widget = self.widget_class('ItemAutocomplete')
        html = widget.render('somewidget', [1, 2])
        span = etree.fromstring(html)

        choices = CSSSelector('[data-value]')(span)

        self.assertEqual(len(choices), 1)
        self.assertEqual(int(choices[0].attrib['data-value']), 1)
Example #12
def get_params():
    '''get form parameters for session use'''
    r = _session.get(URL, headers={'User-Agent': _UA})
    tree = etree.fromstring(r.text, etree.HTMLParser())
    # Get all input tags
    params = {
        x.attrib['name']: x.attrib.get('value', '')
        for x in tree.xpath('.//input')
    }
    return r.text, params
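
A hypothetical follow-up showing how the harvested hidden fields would typically be merged with user input and posted back; _session, URL and _UA are the module-level names the example assumes.

# Hypothetical usage: fill in the visible fields and re-submit the form.
page_text, params = get_params()
params["username"] = "alice"
response = _session.post(URL, data=params, headers={"User-Agent": _UA})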
Example #13
def get_links(html):
    parser = etree.HTMLParser()

    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError:
        return []

    if tree is None:
        return []
    links = tree.xpath('//a/@href')
    return links
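
A quick check of the helper above; the HTML parser tolerates fragments, so a small inline string is enough (XMLSyntaxError is assumed to be imported from lxml.etree, as the except clause implies).

# Expected output: ['/a', '/b']
print(get_links('<html><body><a href="/a">A</a> <a href="/b">B</a></body></html>'))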
Example #14
def process_page(sterile_page, target_url):
    """
    Process the page so all the links has it's text wrapped in <em></em> and all the words that are longer than 4
    symbols are wrapped in <strong></strong>
    :param sterile_page: A string, target page's source stripped from all the tags, but <a></a>
    :param target_url: A string, an URL which user gave us
    :return: A string, processed page ready to render in template
    """
    # Parse the inbound page into element tree with lxml
    root = etree.fromstring(sterile_page)

    # First, let's deal with <a></a>
    for a_tag in root.xpath(".//a"):

        # If <a></a> has some text in it
        if a_tag.text and a_tag.text.strip():

            # Create new element <em></em>, assign the text from <a></a> to it, delete the text from <a></a>,
            # and insert <em></em> element instead
            em = etree.Element('em')
            em.text = a_tag.text
            a_tag.text = None
            a_tag.insert(0, em)

            # While we are at it, let's fix all the broken relative links we got from page source
            # #crutch_alert
            try:
                # If it works, we don't need to do anything with the a_tag's href
                valid = URLValidator()
                valid(a_tag.attrib['href'])

            except ValidationError:
                # Good chances are, that this malformed url is _relative_ to target url's domain
                a_tag.attrib['href'] = absolutize_url(
                    schemeful_domain(target_url), a_tag.attrib['href'])

        else:
            # If <a></a> is empty (e.g. after removing an image from the anchor's text), remove it altogether, href and all.
            a_tag.getparent().remove(a_tag)

    # Take every element in the tree and traverse the tree, checking if it has text in it
    # If it does, inflict reinforce_text() which will wrap the words in <strong></strong> if they are longer than 4
    for element in root.iter():

        if element.text and element.text.strip():
            element.text = reinforce_text(element.text)

        if element.tail and element.tail.strip():
            element.tail = reinforce_text(element.tail)

    # The final bit: flatten the modified tree back to a string, decode it, and unescape everything
    # that was escaped (the < and > in <strong></strong>)
    return unescape(etree.tostring(root, method='html').decode())
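
A hypothetical call, assuming the helpers the example relies on (reinforce_text, absolutize_url, schemeful_domain) and Django's URLValidator are importable as in the surrounding project:

# Link text gets <em>, long words get <strong>, and the relative
# href is rewritten against the target domain.
sterile = '<div>Some considerable text with <a href="/about">about page</a></div>'
print(process_page(sterile, "https://example.com/page"))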
Example #15
    def test_widget_attrs(self):
        widget = self.widget_class("FkModelAutocomplete", widget_attrs={"class": "foo"})
        html = widget.render("somewidget", None)
        et = etree.XML(html)
        self.assertIn("foo", et.attrib["class"])

        # This was originally masked from the test suite because method
        # definition was repeated
        widget = self.widget_class("FkModelAutocomplete", widget_attrs={"data-widget-foo": "bar", "class": "foobar"})
        html = widget.render("somewidget", None)
        et = etree.fromstring(html)
        self.assertEquals(et.attrib["data-widget-foo"], "bar")
        self.assertIn("foobar", et.attrib["class"])
        self.assertIn("autocomplete-light-widget", et.attrib["class"])
Example #16
def get_links(html):
    parser = etree.HTMLParser()
    
    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError:
        log.warning('html parsing error')
        return []

    if tree is None:
        log.warn("html not parsed")
        return []
    links = tree.xpath('//a/@href')
    return links
Example #17
 def open_in_browser(self):
     if sys.platform == "win32":
         try:
             td_td_html = self.to_html()
             if not td_td_html:
                 return
             table = """<table border="1" cellspacing="0">{}</table>""".format(
                 td_td_html)
             open_in_browser(etree.fromstring(table))
         except Exception:
             print("open_in_browser ERROR!")
         else:
             print("opened in browser successfully!")
     else:
         print("This feature is not supported on {}".format(sys.platform))
Example #18
    def export_schedule(self, out_file=None):
        e_html = etree.Element('html')
        e_head = etree.SubElement(e_html, 'head')

        e_encoding = etree.SubElement(e_head, 'meta', charset="utf-8")

        if self.options.get('html_title', False):
            title = self.options['html_title']
        else:
            title = self.schedule.name

        e_title = etree.SubElement(e_head, 'title')
        e_title.text = title

        e_style = etree.SubElement(e_head, 'style', type='text/css')
        e_style.text = css

        e_body = etree.SubElement(e_html, 'body')

        e_h1 = etree.SubElement(e_body, 'h1')
        e_h1.text = title

        if self.options.get('html_table_header', False):
            e_body.append(etree.fromstring(self.options['html_table_header']))

        e_table = etree.SubElement(e_body,
                                   'table',
                                   attrib={
                                       'align': 'center',
                                       'class': 'schedule'
                                   })
        e_tr_head = etree.SubElement(e_table, 'tr')
        head_columns = ['HierarchIndex', 'Name', 'Start', 'End', 'Duration']
        for column in head_columns:
            e_th_head = etree.SubElement(e_tr_head, 'th')
            e_th_head.text = column

        for index, task in enumerate(self.schedule.tasks):
            self._export_task(e_table, task, index + 1)

        etree_return = etree.ElementTree(e_html)
        if out_file:
            etree_return.write(out_file,
                               pretty_print=True,
                               encoding="utf-8",
                               xml_declaration=False)

        return etree.tostring(etree_return, pretty_print=True, encoding="unicode")  # serialize; str() of the tree object is just its repr
Example #19
    def test_widget_attrs(self):
        widget = self.widget_class('FkModelAutocomplete',
                                   widget_attrs={'class': 'foo'})
        html = widget.render('somewidget', None)
        et = etree.XML(html)
        self.assertIn('foo', et.attrib['class'])

        # This was originally masked from the test suite because method
        # definition was repeated
        widget = self.widget_class('FkModelAutocomplete',
            widget_attrs={'data-widget-foo': 'bar', 'class':'foobar'})
        html = widget.render('somewidget', None)
        et = etree.fromstring(html)
        self.assertEqual(et.attrib['data-widget-foo'], 'bar')
        self.assertIn('foobar', et.attrib['class'])
        self.assertIn('autocomplete-light-widget', et.attrib['class'])
Example #20
    def post_filter(self, args):
       
        title = args[0].split('[[')[-1].split(']]')[0].split('|')[-1]
        if title.strip():
            title = title.strip()

        text = args[1]
        counts = {}
        doc = etree.fromstring(text, etree.HTMLParser())
        hids = []            
        toc_html = '<div id="toc" class="table_of_contents"><h3>%s</h3>\n'%(title)
        for node in doc.xpath('//h1|//h2|//h3|//h4|//h5'):
            if node.tag.lower() == 'h1':
                this_depth = 0
            elif node.tag.lower() == 'h2':
                this_depth = 1
            elif node.tag.lower() == 'h3':
                this_depth = 2
            elif node.tag.lower() == 'h4':
                this_depth = 3
            elif node.tag.lower() == 'h5':
                this_depth = 4
            else:
                continue
            
            p = re.compile(r'[^a-zA-Z0-9\s_]')
            this_id = re.sub(p, '-', node.text).replace(' ','-')
            if this_id in hids:
                counts[this_id] = counts.get(this_id, 0) + 1
                this_id = '%s-%s'%(this_id, counts[this_id])
            hids.append(this_id)           
            
            pat = etree.tostring(node, encoding='unicode')  # decode to str so the replace() below works
            rep = '<%s id="%s" class="toc_heading">%s'\
                  '<span class="toc_top"><a href="#toc">&#8617;</a></span></%s>'\
                  '<p style="clear: both;"></p>'\
                  %(node.tag, this_id, node.text, node.tag)
            text = text.replace(pat, rep, 1)
            indent_px = this_depth * 20
            toc_html += '<p style="margin-left: %spx">+ '\
                        '<a href="#%s">%s</a></p>\n'\
                        %(indent_px, this_id, node.text)                

        toc_html += '</div>\n'
        text = toc_html + text
        return text
Example #21
def from_youdao(query: str):
    try:
        html = urlopen("https://www.youdao.com/w/eng/%s/#keyfrom=dict2.index" %
                       (quote(query), ))
        bs_obj = BeautifulSoup(html, 'html.parser')
        html_str = bs_obj.prettify()
        html_str = re.split("(</html>)", html_str)
        html_str = html_str[0] + html_str[1]
        root = et.fromstring(html_str)
        ns = {"default": root.nsmap[None]}
        xpath = ".//*[contains(@id, 'phrsListTab')]//*[contains(@class, 'container')]//default:li//text()"
        li_text = root.xpath(xpath, namespaces=ns)
        meaning = [utils_text_preprocess.clean_text(li) for li in li_text]
    except BaseException as e:
        print(e)
        meaning = []
    return meaning
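
A quick smoke test for the lookup above, assuming network access and the utils_text_preprocess helper the function relies on:

if __name__ == "__main__":
    print(from_youdao("saw"))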
Example #22
def main(args=None):
    """
    Get the IP of the machine calling this function.

    :return str: The IP of the caller
    """
    resp = requests.get("http://www.ip-details.com", timeout=5)
    resp.raise_for_status()
    tree = etree.fromstring(resp.content, HTMLParser())
    tree.make_links_absolute(resp.url)
    ipAddrText = tree.xpath("//div/h1[@class]/text()")
    try:
        return ipAddrText[0].split(":")[-1].strip()
    except Exception as e:
        print("Error parsing URL content at {0!r}".format(resp.url),
              file=sys.stderr)
        raise e
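
A minimal entry point for the function above; it assumes the site's markup is unchanged since the example was written.

if __name__ == "__main__":
    print(main())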
Example #23
 def apply_update(self, page_content):
     """
     From the given page content, parse and add the download statistics to this search result.
     """
     tree = etree.fromstring(page_content, HTMLParser())
     counts = tree.xpath(
         "//ul[@class='nodot'][li[strong[starts-with(text(), 'Downloads')]]]/li/span/text()"
     )
     self.download_counts = [float(count) for count in counts]
     last_update = tree.xpath(
         "//table[@class='list']/tr[@class]/td[4]/text()")
     if last_update not in [None, []]:
         self.last_update = dateutil.parser.parse(last_update[0],
                                                  ignoretz=True)
         return True
     self.last_update = None
     return False
Example #24
    def set_node_note_by_id(self, node_id, note_text):
        """

        :param node_id:
        :param note_text:
        :return:
        """

        node = self.get_node_by_id(node_id)

        if node is None:
            raise self.FreeplaneNodeNotExisting
        else:
            if self.get_node_note_by_id(node_id) is None:
                self.logger.debug('set_node_note_by_id: No note found under {0}. Creating one now...'.format(node_id))
            else:
                self.logger.debug('set_node_note_by_id: A note already exists under {0}; it will be overridden'.format(node_id))
                richcontent_node = node.find(self.T_RICHCONTENT)
                node.remove(richcontent_node)
                del richcontent_node

            richcontent_node = ET.SubElement(node, self.T_RICHCONTENT)
            richcontent_node.set(self.A_TYPE, self.V_TYPE_NOTE)

            if self._string_is_valid_html(note_text):
                local_html_doc = ETH.fromstring(note_text)
            else:

                # Raw text crashes Freeplane, so wrap the note in an HTML document.

                local_html_doc = ET.Element('html')
                head = ET.SubElement(local_html_doc, 'head')
                body = ET.SubElement(local_html_doc, 'body')

                # Sanitize: escape &, < and > so the raw text embeds safely
                note_text = note_text.replace('&', '&amp;')
                note_text = note_text.replace('<', '&lt;')
                note_text = note_text.replace('>', '&gt;')

                data = '<p>%s</p>' % note_text.replace('\n', '<br />')

                p = ET.fromstring(data)
                body.append(p)

            richcontent_node.insert(1, local_html_doc)
Example #25
    def filter(
        self,
        html: str,
        inline: bool = False,
        outgoing: bool = False,
        display_name_mentions: Optional[Dict[str, str]] = None,
    ) -> str:
        """Filter and return HTML."""

        mentions = display_name_mentions

        sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
        html = sanit.sanitize(html).rstrip("\n")

        if not html.strip():
            return html

        tree = etree.fromstring(
            html,
            parser=etree.HTMLParser(encoding="utf-8"),
        )

        for a_tag in tree.iterdescendants("a"):
            self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)

            if not outgoing:
                self._matrix_to_links_add_classes(a_tag)

        html = etree.tostring(tree, encoding="utf-8", method="html").decode()
        html = sanit.sanitize(html).rstrip("\n")

        if outgoing:
            return html

        # Client-side modifications

        html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

        if not inline:
            return html

        return self.inline_quote_regex.sub(
            r'\1<span class="quote">\2</span>',
            html,
        )
Example #26
    def add_latest_date_from_ftp_page(self, page_content):
        """
        From the given page content, parse and add the latest date listed.
        """
        tree = etree.fromstring(page_content, HTMLParser())
        xpath_arg = "//a[@href][starts-with(., '{0}')]".format(self.name)
        link_elems = tree.xpath(xpath_arg)
        max_date = datetime.min
        for elem in link_elems:
            date_size_parts = (elem.tail or "").strip().split()
            if not date_size_parts:
                continue
            date_str = " ".join(date_size_parts[:-1])
            date_val = dateutil.parser.parse(date_str, ignoretz=True)

            # If parser returns default date, it's most likely an error, so skip over it.
            default_date = datetime.combine(datetime.now().date(), dt_time.min)
            if date_val == default_date:
                continue
            max_date = max(date_val, max_date)
        self.last_update = max_date
Example #27
    def load_async(self, time):
        # load content of loading divs.
        lst = self.document.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
        if len(lst) > 0:
            params = {}
            for i, input in enumerate(lst):
                params['key%s' % i] = input.attrib['name']
                params['div%s' % i] = input.attrib['value']
            params['time'] = time

            r = self.browser.openurl(self.browser.buildurl('/AsynchAjax', **params))
            data = json.load(r)

            for i, d in enumerate(data['data']):
                div = self.document.xpath('//div[@id="%s"]' % d['key'])[0]
                html = d['flux']
                div.clear()
                div.attrib['id'] = d['key']  # needed because clear() also removes all attributes
                div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

            if 'time' in data:
                sleep(float(data['time'])/1000.0)
                return self.load_async(time)
Example #28
 def search_rpm_page(self, page=1):
     url = 'http://rpm.pbone.net/index.php3'
     cookie_dict = {
         'cookie_lang': '2',
         'cookie_srodzaj': '4',
         'cookie_dl': '100',
         'cookie_simple': '1',
         'cookies_accepted': 'T'
     }
     post_data = {
         'stat': 3,
         'search': self.search_term,
         'simple': 1,
         'srodzaj': 4,
         'limit': page
     }
     with TimeoutContext(5):
         resp = self.session.post(url,
                                  data=post_data,
                                  cookies=cookie_dict,
                                  timeout=(5, 21))
     tree = etree.fromstring(resp.content, HTMLParser())
     tree.make_links_absolute(resp.url)
     return tree
Example #29
def parse_index(html):
    # fromstring() parses an XML document or fragment from a string.
    doc = etree.fromstring(html)
    pid = doc.xpath('//ROW/@PTID')
    return pid
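
The XPath expects ROW elements carrying a PTID attribute; a self-contained check with made-up input:

# Hypothetical input shape; real documents presumably come from an upstream API.
sample = '<ROWDATA><ROW PTID="1"/><ROW PTID="2"/></ROWDATA>'
print(parse_index(sample))  # ['1', '2']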
Example #30
import requests
from lxml.html import etree
site_body = requests.get('http://afisha.yandex.by/minsk')
html = """\
analysis = [site_body.text][0]
root = etree.fromstring(html)
print(root.xpath('//span[@class="card_info_inner"]')[0].text)
print(analysis)
from urllib.request import urlopen
from urllib.parse import quote
from bs4 import BeautifulSoup
from lxml.html import etree as et

if __name__ == "__main__":
    html = urlopen("https://www.youdao.com/w/eng/saw/#keyfrom=dict2.index")
    bs_obj = BeautifulSoup(html, 'html.parser')
    html_str = bs_obj.prettify()
    root = et.fromstring(html_str)