def test_generate_content_after_element_for_after_pseudoelement(self):
    # Annotating with '::after' is expected to hand back a two-item selection:
    # the untouched original node first, then the generated (marked) content.
    after_pseudo = parse_selector('::after', 'pseudo')
    original = P('<div></div>')
    selection, _ = annotate_pseudo(original, after_pseudo)
    self.assertEqual(selection.length, 2)
    self.assertEqual(P(selection[0]), original)
    generated = P(selection[1])
    self.assertTrue(_element_has_tag(generated, 'mark'))
def test_add_next_sibling_after_selected_sequence(self):
    parsed = parse_selector('.klazz + p', 'selector')
    content = create_content_from_selector(parsed)

    # The element on the left of the '+' combinator comes first ...
    self.assertEqual(P(content[0]).attr('class'), 'klazz')

    # ... and the very next element is the marked sibling, holding the <p>.
    marked_sibling = P(content[1])
    self.assertTrue(_element_has_tag(marked_sibling, 'mark'))
    self.assertTrue(_element_has_tag(marked_sibling.children(), 'p'))
def get_pq(path, base=HOST):
    """Return a PyQuery object built from a path on the lacuerda site.

    The optional ``base`` argument is the URL that ``path`` is joined to.

    If the first request is refused (``requests`` raises ``ConnectionError``),
    a notice is printed, the function sleeps for 20 seconds and retries exactly
    once. An error raised by that second attempt propagates to the caller.
    """
    url = urljoin(base, path)
    try:
        response = requests.get(url)
    except requests.exceptions.ConnectionError:
        # Single-argument parenthesized form: valid both as a Python 2 print
        # statement and as a Python 3 print() call, with identical output.
        print('Conexión rehusada. Reintentando en 20 segundos')
        time.sleep(20)
        response = requests.get(url)
    return P(response.text)
def test_generate_text_content_for_text_pseudoelement(self):
    text_pseudo = parse_selector('::text', 'pseudo')
    annotated, _ = annotate_pseudo(P('<div></div>'), text_pseudo)

    # The <mark> must carry some HTML (the text that was added) ...
    highlighted = P(annotated.children('mark'))
    self.assertGreater(len(highlighted.html()), 0)
    # ... but it must not contain any child elements of its own.
    self.assertEqual(len(highlighted.children()), 0)
def test_generate_content_before_element_for_before_pseudoelement(self):
    before_pseudo = parse_selector('::before', 'pseudo')
    original = P('<div></div>')
    selection, _ = annotate_pseudo(original, before_pseudo)

    # Unlike the annotations that just modify the node they are given, this one
    # is expected to return a selection of two nodes: newly generated content
    # first, immediately followed by the original node.
    self.assertEqual(selection.length, 2)
    generated = P(selection[0])
    self.assertTrue(_element_has_tag(generated, 'mark'))
    self.assertEqual(P(selection[1]), original)
def test_add_eventual_sibling_after_selected_sequence(self):
    parsed = parse_selector('.klazz ~ p', 'selector')
    content = create_content_from_selector(parsed)

    # With the general sibling combinator the generated sibling should sit
    # more than one position away from the left-hand element, demonstrating
    # that '~' can reach past the immediately following sibling.
    self.assertEqual(P(content[0]).attr('class'), 'klazz')
    self.assertFalse(_element_has_tag(P(content[1]), 'p'))
    final_index = content.length - 1
    self.assertTrue(_element_has_tag(P(content[final_index]), 'mark'))
def render_html_contents(self, contents, indent_level=2):
    '''
    Render one or several HTML nodes as a pretty, presentation-ready string.

    `contents` is a PyQuery object holding the nodes to render, and
    `indent_level` is forwarded to the final `_indent` step.
    Content wrapped in <mark></mark> is meant to end up wrapped in a
    highlighting <span> instead; that happens in the `_format_marked_content`
    step (see that helper for the exact class name used).
    '''
    # Serialize each node and stitch the pieces into one chunk of HTML
    fragments = []
    for node in contents:
        fragments.append(P(node).outer_html())
    markup = '\n'.join(fragments)

    # BeautifulSoup's built-in prettifier does most of the layout work.
    # 'html.parser' is used rather than 'html5lib' because:
    # 1. The input should already be valid HTML (little need for leniency)
    # 2. The documents are small, so parser speed is not a concern right now
    # 3. 'html5lib' inserts extra tags to form a complete, valid document,
    #    while only HTML fragments are meant to be shown here
    prettified = BeautifulSoup(markup, 'html.parser').prettify()

    # Pipeline of escaping / marking / indentation passes, applied in order
    text = prettified.strip()
    text = self._escape(text)
    text = self._format_marked_content(text)
    text = self._unindent_spans(text)
    return self._indent(text, indent_level)
def _get_marked_comment_children(node):
    # Gather, into one flat list, the comment children of every <mark> child
    # of `node`, in the order `node.children('mark')` yields the marks.
    collected = []
    for marked in node.children('mark'):
        collected += _get_comment_children(P(marked))
    return collected
def test_add_generic_comment_to_node_about_unknown_pseudoclass(self):
    # An unrecognized pseudo-class should leave exactly one generic comment
    unknown = parse_selector(':unknown', 'pseudo')
    annotated, _ = annotate_pseudo(P('<div></div>'), unknown)
    remarks = _get_comment_children(annotated)
    self.assertEqual(len(remarks), 1)
    only_remark = remarks[0]
    self.assertIn("This element has the pseudoclass 'unknown'",
                  only_remark.text)
def test_add_custom_comment_to_node_about_pseudoclass(self):
    # A known pseudo-class (':checked') should get its own tailored comment
    checked = parse_selector(':checked', 'pseudo')
    annotated, _ = annotate_pseudo(P('<div></div>'), checked)
    remarks = _get_comment_children(annotated)
    self.assertEqual(len(remarks), 1)
    only_remark = remarks[0]
    self.assertIn("This input has been 'checked'", only_remark.text)
def append_pseudoelement(element, pseudo_node):
    '''
    Annotate `element` with example content for a pseudo-element.

    `pseudo_node` is the parse node for the pseudo-element; its third child
    (`children[2]`) holds either a functional pseudo context or the plain
    pseudo-element name.

    Returns the resulting selection:
    - for 'before' / 'after', a new PyQuery selection holding the generated
      <mark> content and `element`, in the matching order;
    - for every other name, `element` itself, after a <mark> describing the
      pseudo-element has been appended to it;
    - for a functional pseudo-element, whatever
      `annotate_functional_pseudoelement` returns, unless that is None.
    '''
    name_node = pseudo_node.children[2]

    # Check to see if this pseudoelement is a functional pseudo-element.
    # If it is, and we succeed at generating example behavior for it as a
    # functional pseudo-element, then use this augmentation.
    # Otherwise, proceed to explain it as a generic pseudo-element
    # (See the note in the functional pseudo-element routine about how functional
    # pseudo-elements don't exist in the actual selectors spec).
    if isinstance(name_node, CssParser.Functional_pseudoContext):
        selection = annotate_functional_pseudoelement(element, pseudo_node)
        if selection is not None:
            return selection

    pseudoelement_name = name_node.getText()

    # Generated content lives next to the element rather than inside it, so
    # these two cases build a new selection instead of appending a child.
    if pseudoelement_name == 'before':
        return P([
            "<mark>This content (generated before a specific element) will be selected</mark>",
            element,
        ])
    if pseudoelement_name == 'after':
        # The message previously said "before" here (copy-paste slip).
        return P([
            element,
            "<mark>This content (generated after a specific element) will be selected</mark>",
        ])

    # Every remaining pseudo-element is explained by appending to the element.
    if pseudoelement_name == 'first-letter':
        annotation = "<mark>The selector chooses the first letter of text in this element</mark>"
    elif pseudoelement_name == 'first-line':
        annotation = (
            "<mark>The first line of content displayed for this element gets chosen</mark>\n" +
            "but the second line doesn't.")
    elif pseudoelement_name == 'text':
        annotation = "<mark>The text content of this element will be selected</mark>"
    else:
        # Unknown pseudo-element: fall back to a generic comment inside a mark
        annotation = ("<mark><!--The selector chooses content from the '" +
                      pseudoelement_name +
                      "' pseudo-element of this element--></mark>")

    element.append(annotation)
    return element
def test_highlight_first_line_for_first_line_pseudoelement(self):
    first_line = parse_selector('::first-line', 'pseudo')
    annotated, _ = annotate_pseudo(P('<div></div>'), first_line)

    # The generated text should consist of:
    # - a highlighted line that is at least one character long
    # - followed by another, non-highlighted line of at least one character
    rendered = annotated.html()
    found = re.match('<mark>.+</mark>\n.', rendered, flags=re.MULTILINE)
    self.assertTrue(found)
def test_append_children_to_parents(self):
    parsed = parse_selector('.klazz > p', 'selector')
    parent = create_content_from_selector(parsed)
    self.assertEqual(parent.attr('class'), 'klazz')

    # The parent should have exactly one child: the <mark> element
    # (presumably the wrapper around the selected paragraph).
    offspring = parent.children()
    self.assertEqual(offspring.length, 1)
    only_child = P(parent.children()[0])
    self.assertTrue(_element_has_tag(only_child, 'mark'))
def test_render_single_marked_element(self):
    # A lone <mark> should render as a selection span around its contents
    rendered = self._render(P('<mark><p></p></mark>'))
    expected_lines = [
        "<span class='tutoron_selection'>",
        "<p><br>",
        "</p><br>",
        "</span>",
    ]
    self.assertEqual(rendered, '\n'.join(expected_lines))
def test_annotate_node_with_attribute_matching_dash_value(self):
    # Scenario borrowed from the example in the W3C selectors documentation:
    # https://www.w3.org/TR/css3-selectors/#attribute-representation
    # A 'dash match' is typically used for language subcodes.
    dash_match = parse_selector('[href|="en"]', 'attribute')
    annotated = annotate_attribute(P('<div></div>'), dash_match)
    href = annotated.attr('href')
    # The value must start with 'en' and be longer than 'en' alone
    self.assertGreater(len(href), 2)
    self.assertTrue(href.startswith('en'))
def test_annotate_node_with_attribute_with_substring(self):
    substring_match = parse_selector('[href*="url.com"]', 'attribute')
    annotated = annotate_attribute(P('<div></div>'), substring_match)
    href = annotated.attr('href')

    # The value must contain 'url.com' strictly in its interior: longer than
    # the substring itself, and neither starting nor ending with it.
    self.assertGreater(len(href), 7)
    self.assertTrue("url.com" in href)
    at_either_end = href.endswith("url.com") or href.startswith("url.com")
    self.assertFalse(at_either_end)
def test_comments_added_before_input_elements(self):
    # As far as the original author could tell, PyQuery only generates
    # <input/> elements that cannot hold any content, so a description cannot
    # be added as a child comment. This test expects the description in the
    # element's 'tip' attribute instead.
    unknown = parse_selector(':unknown', 'pseudo')
    annotated, _ = annotate_pseudo(P('<input></input>'), unknown)
    expected_tip = "This element has the pseudoclass 'unknown'"
    self.assertEqual(annotated.attr('tip'), expected_tip)
def test_add_generic_comment_to_node_about_unknown_functional_pseudoclass(self):
    # An unrecognized functional pseudo-class should produce a single generic
    # comment that quotes the function together with its parameter.
    unknown_function = parse_selector(':unknown-function(parameter)', 'pseudo')
    annotated, _ = annotate_pseudo(P('<div></div>'), unknown_function)
    remarks = _get_comment_children(annotated)
    self.assertEqual(len(remarks), 1)
    expected_fragment = (
        "This element satisfies the functional pseudoclass 'unknown-function(parameter)'")
    self.assertIn(expected_fragment, remarks[0].text)
def test_generate_generic_comment_to_describe_all_other_pseudoelements(self):
    # A pseudo-element without dedicated handling ('::cheese') should yield
    # one generic comment, placed inside a <mark>.
    cheese = parse_selector('::cheese', 'pseudo')
    annotated, _ = annotate_pseudo(P('<div></div>'), cheese)
    remarks = _get_marked_comment_children(annotated)
    self.assertEqual(len(remarks), 1)
    expected_fragment = (
        "The selector chooses content from the 'cheese' pseudo-element of this element")
    self.assertIn(expected_fragment, remarks[0].text)
def test_render_head_element(self):
    # Depending on the parser, BeautifulSoup may try to place HTML contents
    # into well-formed <body> or <head> tags. An earlier version of the
    # renderer dropped tags that showed up in an unexpected part of the
    # document (such as 'base' and 'title', which normally live in <head>).
    # This test guards against that: such elements must still be rendered.
    rendered = self._render(P('<base></base>'))
    expected = '\n'.join([
        "<base/><br>",
    ])
    self.assertEqual(rendered, expected)
def test_append_comment_and_attribute_describing_attribute_functional_pseudoelement(self):
    # '::attr(href)' should both leave a marked comment naming the attribute
    # and set that attribute on the node to a placeholder value.
    attr_pseudo = parse_selector('::attr(href)', 'pseudo')
    annotated, _ = annotate_pseudo(P('<div></div>'), attr_pseudo)
    remarks = _get_marked_comment_children(annotated)
    self.assertEqual(len(remarks), 1)
    expected_fragment = (
        "The selector chooses the value of this element's 'href' attribute")
    self.assertIn(expected_fragment, remarks[0].text)
    self.assertEqual(annotated.attr('href'), "<This value is selected>")
def crawl_nimadaili(self, page_count=5):
    '''
    Crawl the proxy table on the nimadaili.com 'gaoni' listing pages.

    :param page_count: number of listing pages to fetch, starting at page 1
    :return: generator yielding, one at a time, the text of the first column
             of every table row (the proxy entry)
    '''
    # range() excludes its stop value, so +1 is needed to actually visit
    # `page_count` pages (previously only page_count - 1 pages were fetched).
    for page in range(1, page_count + 1):
        url = 'http://www.nimadaili.com/gaoni/{}'.format(page)
        doc = P(requests.get(url, headers=headers).text)
        for tr in doc('.fl-table>tbody>tr').items():
            yield tr.find('td:nth-child(1)').text()
def test_render_marked_element(self):
    # A <mark> nested in another element should become a selection span,
    # with the span's contents rendered between its open and close lines.
    rendered = self._render(P('<div><mark><p></p></mark></div>'))
    expected_lines = [
        "<div><br>",
        "<span class='tutoron_selection'>",
        " <p><br>",
        " </p><br>",
        "</span>",
        "</div><br>",
    ]
    self.assertEqual(rendered, '\n'.join(expected_lines))
def test_dont_double_escape_chevrons(self):
    # One of the routines used for pretty-printing HTML already escapes the
    # chevrons ('<', '>'). Escaping them a second time would stop them from
    # showing up as chevrons in the rendered document, so this test checks
    # that the escaping happens only once.
    rendered = self._render(P('<a href="<pattern>"></a>'))
    expected_lines = [
        "<a href=\"<pattern>\"><br>",
        "</a><br>",
    ]
    self.assertEqual(rendered, '\n'.join(expected_lines))
def crawl_kuaidaili(self, page_count=5):
    '''
    Crawl the proxy table on the kuaidaili.com 'free/inha' listing pages.

    :param page_count: number of listing pages to fetch, starting at page 1
    :return: generator yielding proxies one at a time, each formatted as
             'ip:port' from the first and second columns of a table row
    '''
    # range() excludes its stop value, so +1 is needed to actually visit
    # `page_count` pages (previously only page_count - 1 pages were fetched).
    for page in range(1, page_count + 1):
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
        doc = P(requests.get(url, headers=headers).text)
        for tr in doc('tbody>tr').items():
            ip = tr.find('td:nth-child(1)').text()
            port = tr.find('td:nth-child(2)').text()
            yield ':'.join([ip, port])
def get_artists():
    """Return a list of (name, unique artist identifier) tuples.

    Walks the '/tabs/' menu of initials, then every pagination link of each
    initial, and collects the artist links found on each page. An artist that
    shows up more than once is only reported the first time.
    """
    artistas = []
    # Identifiers already collected; a set keeps the duplicate check O(1)
    slugs = set()

    # Fetch the menu holding the artists' initials
    pq = get_pq('/tabs/')
    for letter_link in pq('#a_menu td a'):
        letter_link = P(letter_link)
        url = letter_link.attr('href')
        letra = letter_link.text()
        if verbose:
            # %-formatting prints the same text under Python 2 and Python 3
            print('Explorando la letra %s' % letra)

        # Fetch the URL of every page listed for this letter
        pq_letra = get_pq(url)
        for page_link in pq_letra('.multipag a'):
            url_pag = P(page_link).attr('href')
            # Page links are resolved against the letter's own URL
            pq_pagina = get_pq(url_pag, urljoin(HOST, url))

            # Collect the artists listed on this page
            for artista in pq_pagina('#i_main li a'):
                artista = P(artista)
                # Drop the first two words of the link text ("Canciones de")
                nombre = ' '.join(artista.text().split()[2:])
                # Strip the leading and trailing '/' from the href
                id_artista = artista.attr('href')[1:-1]
                if id_artista not in slugs:
                    slugs.add(id_artista)
                    artistas.append((nombre, id_artista))
    return artistas
def _create_element_with_tag(tag_name, namespace=None):
    # Build an empty element for `tag_name`, optionally qualified with an XML
    # namespace prefix, and return it as a PyQuery object.
    if namespace is None:
        qualified_name = tag_name
        xmlns_attribute = ''
    else:
        qualified_name = namespace + ':' + tag_name
        # A namespaced tag needs some attribute pointing the XML parser to the
        # site where the namespace is defined. For the time being it is put on
        # the element itself (though a namespaced tag is expected to be very
        # unlikely in practice).
        xmlns_attribute = ' xmlns:' + namespace + "=\"https://namespace-site.com\""

    # Opening tag (with the optional xmlns attribute) directly followed by
    # the matching closing tag.
    markup = "<" + qualified_name + xmlns_attribute + "></" + qualified_name + ">"

    # One way to support a namespaced tag is to parse it with the 'xml' parser.
    # This may not be the only way; the trick was found in PyQuery's own tests:
    # https://github.com/gawel/pyquery/blob/master/tests/test_pyquery.py
    return P(markup, parser='xml')
from pyquery import PyQuery as P
from fake_useragent import UserAgent
import requests

# Every request is sent with one randomly picked User-Agent string.
headers = {'User-Agent': UserAgent().random}

# NOTE: an earlier variant of this script crawled
# https://www.kuaidaili.com/free/inha/<page>/ for pages 1-3, selecting rows
# with 'tbody>tr' and printing the first two columns (ip, port) of each row.

# Listing pages 1 through 4 of the nimadaili 'gaoni' table.
url_list = list(map('http://www.nimadaili.com/gaoni/{}'.format, range(1, 5)))

for url in url_list:
    doc = P(requests.get(url, headers=headers).text)
    items = doc('.fl-table>tbody>tr').items()
    for tr in items:
        # The first column of each row is printed as the proxy's ip
        ip = tr.find('td:nth-child(1)').text()
        print(ip)
def test_annotate_node_with_attribute_including_value(self):
    includes_match = parse_selector('[href~="url.com"]', 'attribute')
    annotated = annotate_attribute(P('<div></div>'), includes_match)
    href = annotated.attr('href')

    # The value must be longer than 'url.com' alone and contain it as a
    # stand-alone word (delimited by word boundaries or the string's ends).
    self.assertGreater(len(href), 7)
    found = re.search(r"(^|\b)url\.com(\b|$)", href)
    self.assertTrue(found)
def test_annotate_node_with_attribute_that_ends_with_value(self):
    suffix_match = parse_selector('[href$="url.com"]', 'attribute')
    annotated = annotate_attribute(P('<div></div>'), suffix_match)
    href = annotated.attr('href')

    # The value must end with 'url.com' and be longer than that suffix alone
    self.assertGreater(len(href), 7)
    self.assertTrue(href.endswith("url.com"))