Пример #1
0
 def test_generate_content_after_element_for_after_pseudoelement(self):
     # For '::after' the annotation is expected to return a two-item
     # selection: the original node first, then a generated 'mark' element.
     after_pseudo = parse_selector('::after', 'pseudo')
     original = P('<div></div>')
     selection, _ = annotate_pseudo(original, after_pseudo)
     self.assertEqual(selection.length, 2)
     self.assertEqual(P(selection[0]), original)
     trailing = P(selection[1])
     self.assertTrue(_element_has_tag(trailing, 'mark'))
Пример #2
0
    def test_add_next_sibling_after_selected_sequence(self):

        parsed = parse_selector('.klazz + p', 'selector')
        generated = create_content_from_selector(parsed)

        # The element left of the '+' combinator comes first; the very next
        # item must be the generated sibling: a 'mark' whose child is a 'p'.
        left_hand = P(generated[0])
        self.assertEqual(left_hand.attr('class'), 'klazz')
        sibling = P(generated[1])
        self.assertTrue(_element_has_tag(sibling, 'mark'))
        self.assertTrue(_element_has_tag(sibling.children(), 'p'))
Пример #3
0
def get_pq(path, base=HOST):
    """Return a PyQuery object built from a page of the lacuerda site.

    `path` is joined onto `base` (which defaults to HOST) with `urljoin`
    and the resulting URL is downloaded with `requests.get`.

    If the first request raises `requests.exceptions.ConnectionError`, a
    message is printed, the function sleeps for 20 seconds and the request
    is attempted exactly once more; a failure of that second attempt
    propagates to the caller.
    """
    url = urljoin(base, path)
    try:
        return P(requests.get(url).text)
    except requests.exceptions.ConnectionError:
        # Parenthesised single-argument form: valid as both the Python 2
        # print statement and the Python 3 print function.
        print('Conexión rehusada. Reintentando en 20 segundos')
        time.sleep(20)
        return P(requests.get(url).text)
Пример #4
0
    def test_generate_text_content_for_text_pseudoelement(self):

        text_pseudo = parse_selector('::text', 'pseudo')
        annotated, _ = annotate_pseudo(P('<div></div>'), text_pseudo)

        # The 'mark' child must contain a non-empty amount of HTML (the text
        # that was added to it) and must not contain any child elements.
        marked = P(annotated.children('mark'))
        marked_html = marked.html()
        self.assertGreater(len(marked_html), 0)
        self.assertEqual(len(marked.children()), 0)
Пример #5
0
    def test_generate_content_before_element_for_before_pseudoelement(self):

        before_pseudo = parse_selector('::before', 'pseudo')
        original = P('<div></div>')
        selection, _ = annotate_pseudo(original, before_pseudo)

        # Unlike the annotations that only modify the node handed to them,
        # this one is expected to return a selection of two items: a newly
        # generated node placed first, followed by the original node.
        self.assertEqual(selection.length, 2)
        leading = P(selection[0])
        self.assertTrue(_element_has_tag(leading, 'mark'))
        self.assertEqual(P(selection[1]), original)
Пример #6
0
    def test_add_eventual_sibling_after_selected_sequence(self):

        parsed = parse_selector('.klazz ~ p', 'selector')
        generated = create_content_from_selector(parsed)

        # The general sibling combinator can match a sibling that is further
        # away than the immediately following one.  To demonstrate that, the
        # item right after the '.klazz' element must not be the 'p', while
        # the last generated item must be the marked selection.
        self.assertEqual(P(generated[0]).attr('class'), 'klazz')
        self.assertFalse(_element_has_tag(P(generated[1]), 'p'))
        last_index = generated.length - 1
        self.assertTrue(_element_has_tag(P(generated[last_index]), 'mark'))
Пример #7
0
    def render_html_contents(self, contents, indent_level=2):
        '''
        Render one or several HTML nodes as a pretty-printed string.

        `contents` is iterated and every item is wrapped in a PyQuery
        object to obtain its outer HTML.  The combined markup is prettified
        with BeautifulSoup and then passed, in order, through `_escape`,
        `_format_marked_content`, `_unindent_spans` and `_indent` (the last
        one receiving `indent_level`).  Per the original author's notes,
        content wrapped in <mark></mark> ends up wrapped in a selection
        <span> instead; that substitution is delegated to the helpers.
        '''
        # Collect the outer HTML of every node and glue the pieces together
        fragments = []
        for element in contents:
            fragments.append(P(element).outer_html())
        combined_html = '\n'.join(fragments)

        # Let BeautifulSoup's prettifier do most of the layout work.
        # 'html.parser' is preferred over 'html5lib' here because:
        # 1. the input is expected to be valid HTML (leniency matters little),
        # 2. the documents are small, so parser speed is not a concern,
        # 3. 'html5lib' inserts extra tags to build a complete document,
        #    whereas only HTML fragments are meant to be shown.
        soup = BeautifulSoup(combined_html, 'html.parser')
        text = soup.prettify().strip()

        # Escape, mark up and indent the text for presentation
        text = self._escape(text)
        text = self._format_marked_content(text)
        text = self._unindent_spans(text)
        return self._indent(text, indent_level)
Пример #8
0
def _get_marked_comment_children(node):
    marks = node.children('mark')
    comments = []
    for mark in marks:
        mark_comments = _get_comment_children(P(mark))
        comments.extend(mark_comments)
    return comments
Пример #9
0
 def test_add_generic_comment_to_node_about_unknown_pseudoclass(self):
     # An unrecognised pseudo-class should leave exactly one comment in
     # the node, carrying the generic description.
     unknown_pseudo = parse_selector(':unknown', 'pseudo')
     annotated, _ = annotate_pseudo(P('<div></div>'), unknown_pseudo)
     found = _get_comment_children(annotated)
     self.assertEqual(len(found), 1)
     expected = "This element has the pseudoclass 'unknown'"
     self.assertIn(expected, found[0].text)
Пример #10
0
 def test_add_custom_comment_to_node_about_pseudoclass(self):
     # ':checked' is expected to get its own tailored comment text
     # instead of the generic pseudo-class description.
     checked_pseudo = parse_selector(':checked', 'pseudo')
     annotated, _ = annotate_pseudo(P('<div></div>'), checked_pseudo)
     found = _get_comment_children(annotated)
     self.assertEqual(len(found), 1)
     expected = "This input has been 'checked'"
     self.assertIn(expected, found[0].text)
Пример #11
0
def append_pseudoelement(element, pseudo_node):
    """Annotate `element` with example content for a pseudo-element.

    Args:
        element: the PyQuery selection to annotate.  For every
            pseudo-element except 'before' and 'after' it is modified in
            place by appending markup to it.
        pseudo_node: parse-tree node of the pseudo-element; its third child
            (`children[2]`) is either a functional pseudo-element context
            or a node whose text is the pseudo-element name.

    Returns:
        The selection to use from here on.  For 'before' / 'after' this is
        a new PyQuery object built from the generated <mark> markup and
        `element` (in that order for 'before', reversed for 'after'); in
        all other cases it is `element` itself.  For a functional
        pseudo-element, the result of `annotate_functional_pseudoelement`
        is returned whenever that result is not None.
    """
    # Check to see if this pseudoelement is a functional pseudo-element.
    # If it is, and example behavior could be generated for it as a
    # functional pseudo-element, use that augmentation.  Otherwise fall
    # through and explain it as a generic pseudo-element.  (Functional
    # pseudo-elements don't exist in the actual selectors spec; see the
    # note in the functional pseudo-element routine.)
    if isinstance(pseudo_node.children[2], CssParser.Functional_pseudoContext):
        selection = annotate_functional_pseudoelement(element, pseudo_node)
        if selection is not None:
            return selection

    pseudoelement_name = pseudo_node.children[2].getText()

    # 'before' and 'after' generate content next to the element rather than
    # inside it, so they return a new selection instead of appending.
    if pseudoelement_name == 'before':
        return P([
            "<mark>This content (generated before a specific element) will be selected</mark>",
            element,
        ])
    if pseudoelement_name == 'after':
        # The message previously said "before" here (copy-paste slip).
        return P([
            element,
            "<mark>This content (generated after a specific element) will be selected</mark>",
        ])

    # Every remaining pseudo-element is illustrated by appending markup to
    # the element itself.
    if pseudoelement_name == 'first-letter':
        markup = "<mark>The selector chooses the first letter of text in this element</mark>"
    elif pseudoelement_name == 'first-line':
        markup = (
            "<mark>The first line of content displayed for this element gets chosen</mark>\n"
            + "but the second line doesn't.")
    elif pseudoelement_name == 'text':
        markup = "<mark>The text content of this element will be selected</mark>"
    else:
        # Unknown pseudo-element: describe it generically in a comment.
        markup = ("<mark><!--The selector chooses content from the '" +
                  pseudoelement_name +
                  "' pseudo-element of this element--></mark>")

    element.append(markup)
    return element
Пример #12
0
 def test_highlight_first_line_for_first_line_pseudoelement(self):
     first_line_pseudo = parse_selector('::first-line', 'pseudo')
     annotated, _ = annotate_pseudo(P('<div></div>'), first_line_pseudo)
     # The generated HTML must start with a highlighted line of at least
     # one character, followed by a newline and at least one character
     # that is not highlighted.
     pattern = '<mark>.+</mark>\n.'
     found = re.match(pattern, annotated.html(), flags=re.MULTILINE)
     self.assertTrue(found)
Пример #13
0
    def test_append_children_to_parents(self):

        parsed = parse_selector('.klazz > p', 'selector')
        parent = create_content_from_selector(parsed)
        self.assertEqual(parent.attr('class'), 'klazz')

        # The parent must have exactly one child, and that child must be
        # the 'mark' element generated for the right-hand side.
        kids = parent.children()
        self.assertEqual(kids.length, 1)
        only_child = P(parent.children()[0])
        self.assertTrue(_element_has_tag(only_child, 'mark'))
Пример #14
0
 def test_render_single_marked_element(self):
     # A lone marked element is expected to render as escaped tag lines
     # enclosed in the selection span.
     rendered = self._render(P('<mark><p></p></mark>'))
     expected_lines = [
         "<span class='tutoron_selection'>",
         "&lt;p&gt;<br>",
         "&lt;/p&gt;<br>",
         "</span>",
     ]
     self.assertEqual(rendered, '\n'.join(expected_lines))
Пример #15
0
 def test_annotate_node_with_attribute_matching_dash_value(self):
     # Scenario from the example in the W3C selectors documentation:
     # https://www.w3.org/TR/css3-selectors/#attribute-representation
     # A 'dash match' is usually used for language subcodes.
     dash_match = parse_selector('[href|="en"]', 'attribute')
     annotated = annotate_attribute(P('<div></div>'), dash_match)
     href = annotated.attr('href')
     self.assertGreater(len(href), 2)
     self.assertTrue(href.startswith('en'))
Пример #16
0
 def test_annotate_node_with_attribute_with_substring(self):
     # The generated value must contain the substring strictly in the
     # middle: longer than the substring, neither starting nor ending with it.
     substring_match = parse_selector('[href*="url.com"]', 'attribute')
     annotated = annotate_attribute(P('<div></div>'), substring_match)
     href = annotated.attr('href')
     self.assertGreater(len(href), 7)
     self.assertIn("url.com", href)
     at_either_end = href.endswith("url.com") or href.startswith("url.com")
     self.assertFalse(at_either_end)
Пример #17
0
 def test_comments_added_before_input_elements(self):
     # As far as the original author could tell, PyQuery only generates
     # <input/> elements, which cannot hold any content, so a descriptive
     # comment cannot be placed inside the input.  This test expects the
     # description in the element's 'tip' attribute instead.
     unknown_pseudo = parse_selector(':unknown', 'pseudo')
     annotated, _ = annotate_pseudo(P('<input></input>'), unknown_pseudo)
     expected_tip = "This element has the pseudoclass 'unknown'"
     self.assertEqual(annotated.attr('tip'), expected_tip)
Пример #18
0
 def test_add_generic_comment_to_node_about_unknown_functional_pseudoclass(
         self):
     # An unrecognised functional pseudo-class should leave exactly one
     # comment that quotes the function together with its parameter.
     functional = parse_selector(':unknown-function(parameter)', 'pseudo')
     annotated, _ = annotate_pseudo(P('<div></div>'), functional)
     found = _get_comment_children(annotated)
     self.assertEqual(len(found), 1)
     expected = (
         "This element satisfies the functional pseudoclass 'unknown-function(parameter)'")
     self.assertIn(expected, found[0].text)
Пример #19
0
 def test_generate_generic_comment_to_describe_all_other_pseudoelements(
         self):
     # A pseudo-element without dedicated handling should produce a
     # single marked comment that names the pseudo-element.
     cheese_pseudo = parse_selector('::cheese', 'pseudo')
     annotated, _ = annotate_pseudo(P('<div></div>'), cheese_pseudo)
     found = _get_marked_comment_children(annotated)
     self.assertEqual(len(found), 1)
     expected = (
         "The selector chooses content from the 'cheese' pseudo-element of this element")
     self.assertIn(expected, found[0].text)
Пример #20
0
 def test_render_head_element(self):
     # Depending on the parser, BeautifulSoup may try to place HTML
     # contents into well-formed <body> or <head> tags.  An earlier
     # version of the renderer dropped tags that showed up in an
     # unexpected part of the document (such as 'base' and 'title', which
     # normally live in the <head>).  Make sure such elements are rendered.
     rendered = self._render(P('<base></base>'))
     expected = "&lt;base/&gt;<br>"
     self.assertEqual(rendered, expected)
Пример #21
0
 def test_append_comment_and_attribute_describing_attribute_functional_pseudoelement(
         self):
     # '::attr(href)' should both add one marked comment describing the
     # selection and set the named attribute to a placeholder value.
     attr_pseudo = parse_selector('::attr(href)', 'pseudo')
     annotated, _ = annotate_pseudo(P('<div></div>'), attr_pseudo)
     found = _get_marked_comment_children(annotated)
     self.assertEqual(len(found), 1)
     expected = (
         "The selector chooses the value of this element's 'href' attribute")
     self.assertIn(expected, found[0].text)
     self.assertEqual(annotated.attr('href'), "<This value is selected>")
Пример #22
0
 def crawl_nimadaili(self, page_count=5):
     '''
     Crawl the listing pages under http://www.nimadaili.com/gaoni/.

     :param page_count: exclusive upper bound of the page numbers that are
         requested; pages 1 .. page_count - 1 are fetched.
         NOTE(review): with the default of 5 only four pages are crawled;
         confirm whether that is intended.
     :return: generator yielding, one at a time, the text of the first
         table cell of every row matched by '.fl-table>tbody>tr'.
     '''
     for page_number in range(1, page_count):
         page_url = 'http://www.nimadaili.com/gaoni/{}'.format(page_number)
         response = requests.get(page_url, headers=headers)
         rows = P(response.text)('.fl-table>tbody>tr').items()
         for row in rows:
             yield row.find('td:nth-child(1)').text()
Пример #23
0
 def test_render_marked_element(self):
     # A marked element nested in a container: the container's tags are
     # rendered as plain escaped lines while the marked part is indented
     # and enclosed in the selection span.
     rendered = self._render(P('<div><mark><p></p></mark></div>'))
     expected_lines = [
         "&lt;div&gt;<br>",
         "<span class='tutoron_selection'>",
         "&nbsp;&lt;p&gt;<br>",
         "&nbsp;&lt;/p&gt;<br>",
         "</span>",
         "&lt;/div&gt;<br>",
     ]
     self.assertEqual(rendered, '\n'.join(expected_lines))
Пример #24
0
 def test_dont_double_escape_chevrons(self):
     # One of the routines used for pretty-printing HTML documents
     # already escapes the chevrons ('<', '>').  Escaping them a second
     # time would stop them from showing up as chevrons in the rendered
     # document, so verify that they are escaped exactly once.
     rendered = self._render(P('<a href="<pattern>"></a>'))
     expected_lines = [
         "&lt;a href=\"&lt;pattern&gt;\"&gt;<br>",
         "&lt;/a&gt;<br>",
     ]
     self.assertEqual(rendered, '\n'.join(expected_lines))
Пример #25
0
 def crawl_kuaidaili(self, page_count=5):
     '''
     Crawl the listing pages under https://www.kuaidaili.com/free/inha/.

     :param page_count: exclusive upper bound of the page numbers that are
         requested; pages 1 .. page_count - 1 are fetched.
         NOTE(review): with the default of 5 only four pages are crawled;
         confirm whether that is intended.
     :return: generator yielding, one at a time, 'ip:port' strings built
         from the first and second table cell of every 'tbody>tr' row.
     '''
     for page_number in range(1, page_count):
         page_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page_number)
         response = requests.get(page_url, headers=headers)
         rows = P(response.text)('tbody>tr').items()
         for row in rows:
             address = row.find('td:nth-child(1)').text()
             port_number = row.find('td:nth-child(2)').text()
             yield ':'.join([address, port_number])
Пример #26
0
def get_artists():
    """Return a list of (name, artist id) tuples scraped from the site.

    The crawl starts at '/tabs/', follows every link of the initials menu
    ('#a_menu td a'), then every pagination link of each letter page
    ('.multipag a'), and finally collects the artist links
    ('#i_main li a') of each page.  Every artist id is reported once, in
    the order in which it is first encountered.
    """
    artistas = []
    slugs = set()  # ids already collected, so repeated artists are skipped

    # Fetch the menu holding the artists' initials
    pq = get_pq('/tabs/')
    for letter_link in pq('#a_menu td a'):
        letter_link = P(letter_link)
        url = letter_link.attr('href')
        letra = letter_link.text()
        if verbose:
            # Single pre-formatted argument: prints the same text under
            # both the Python 2 print statement and Python 3 print().
            print('Explorando la letra %s' % letra)

        # Collect the URL of every page listed for this letter
        pq_letra = get_pq(url)
        for page_link in pq_letra('.multipag a'):
            url_pag = P(page_link).attr('href')
            # Page links are resolved relative to the letter page's URL
            pq_pagina = get_pq(url_pag, urljoin(HOST, url))

            # Collect the artists listed on that page
            for artista in pq_pagina('#i_main li a'):
                artista = P(artista)
                # Drop the first two words of the link text (the
                # "Canciones de" prefix, per the original comment)
                nombre = ' '.join(artista.text().split()[2:])
                # Strip the first and last character of the href (the
                # surrounding slashes, per the original comment)
                id_artista = artista.attr('href')[1:-1]
                if id_artista not in slugs:
                    slugs.add(id_artista)
                    artistas.append((nombre, id_artista))

    return artistas
Пример #27
0
def _create_element_with_tag(tag_name, namespace=None):
    # Build the markup for an empty element named `tag_name`, optionally
    # prefixed with `namespace`, and hand it to PyQuery's 'xml' parser.

    if namespace is None:
        qualified_name = tag_name
        namespace_attribute = ''
    else:
        qualified_name = namespace + ':' + tag_name
        # A namespaced tag needs some attribute that points the XML parser
        # to where the namespace is defined.  For now that attribute is put
        # on the element itself (namespaces are expected to be rare in
        # practice).
        namespace_attribute = (
            ' xmlns:' + namespace + "=\"https://namespace-site.com\"")

    html = ("<" + qualified_name + namespace_attribute +
            "></" + qualified_name + ">")

    # Parsing with the 'xml' parser is one way to support namespaced tags.
    # It may not be the only way; the trick comes from PyQuery's own tests:
    # https://github.com/gawel/pyquery/blob/master/tests/test_pyquery.py
    return P(html, parser='xml')
Пример #28
0
from pyquery import PyQuery as P
from fake_useragent import UserAgent
import requests
# Every request is sent with a randomly picked browser User-Agent string.
headers = {'User-Agent': UserAgent().random}

# Disabled variant for kuaidaili.com, which prints both ip and port:
# url_list=['https://www.kuaidaili.com/free/inha/{}/'.format(i) for i in range(1,4)]
# for url in url_list:
#     doc = P(requests.get(url, headers=headers).text)
#     items = doc('tbody>tr').items()
#     for tr in items:
#         ip=tr.find('td:nth-child(1)').text()
#         port=tr.find('td:nth-child(2)').text()
#         print(ip,port)

# Fetch listing pages 1 to 4 of nimadaili.com and print the text of the
# first table cell of every row.
url_list = ['http://www.nimadaili.com/gaoni/{}'.format(i) for i in range(1, 5)]
for url in url_list:
    page = P(requests.get(url, headers=headers).text)
    for row in page('.fl-table>tbody>tr').items():
        print(row.find('td:nth-child(1)').text())
Пример #29
0
 def test_annotate_node_with_attribute_including_value(self):
     # The generated value must be longer than the word itself and must
     # contain 'url.com' delimited by word boundaries or the string ends.
     includes_match = parse_selector('[href~="url.com"]', 'attribute')
     annotated = annotate_attribute(P('<div></div>'), includes_match)
     href = annotated.attr('href')
     self.assertGreater(len(href), 7)
     whole_word = re.search(r"(^|\b)url\.com(\b|$)", href)
     self.assertTrue(whole_word)
Пример #30
0
 def test_annotate_node_with_attribute_that_ends_with_value(self):
     # The generated value must be longer than the suffix and end with it.
     suffix_match = parse_selector('[href$="url.com"]', 'attribute')
     annotated = annotate_attribute(P('<div></div>'), suffix_match)
     href = annotated.attr('href')
     self.assertGreater(len(href), 7)
     self.assertTrue(href.endswith("url.com"))