Exemplo n.º 1
0
 def test_process_row(self):
     """process_row() should reject malformed rows and normalise the
     uploader of anonymous torrents to "Anonymous"."""
     fragment = html.fragment_fromstring("""
     <tr>
         <td>One</td>
         <td>Two</td>
         <td>Three</td>
     </tr>
     """)
     user_is_anonymous = html.fragment_fromstring("""
     <tr>
         <td class='vertTh'><center>some text</center></td>
         <td>
             <div class='detName'><a href='/torrent/some_torrent' class='detLink'>some torrent</a></div>
             <a href="..." title='Download this torrent'>...</a>
             <a href="..." title='Download this torrent using magnet'>...</a>
             <a href="/user/<some_user>" title="Browse <some_user>">some_user</a>
             <img src="..." />
             <font class='detDesc'>
                 Uploaded 08-10&nbsp;15:57, Size 1.03&nbsp;GiB, ULed by
             </font>
         </td>
         <td align='right'>123</td>
         <td align='right'>321</td>
     </tr>
     """)
     p1, p2 = self._create_parsers(fragment)
     # assertIsNone replaces the deprecated assert_(... == None) idiom.
     self.assertIsNone(p2.process_row(fragment))
     self.assertRaises(exceptions.InvalidRow, p1.process_row, fragment)
     self.assertEqual(p1.process_row(user_is_anonymous)["user"], "Anonymous")
     self.assertEqual(p2.process_row(user_is_anonymous)["user"], "Anonymous")
Exemplo n.º 2
0
    def transform_misused_divs_into_paragraphs(self):
        """Convert <div> elements that contain no block-level children into
        <p> elements, then wrap stray text and tails in explicit <p> nodes
        and drop <br> separators."""
        for elem in self.tags(self.html, 'div'):
            # transform <div>s that do not contain other block elements into
            # <p>s
            #FIXME: The current implementation ignores all descendants that
            # are not direct children of elem
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES['divToPElementsRe'].search(
                    str_(b''.join(map(tostring, list(elem))))):
                #log.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"
                #print "Fixed element "+describe(elem)

        for elem in self.tags(self.html, 'div'):
            # Leading text inside the div becomes its own leading <p>.
            if elem.text and elem.text.strip():
                p = fragment_fromstring('<p/>')
                p.text = elem.text
                elem.text = None
                elem.insert(0, p)
                #print "Appended "+tounicode(p)+" to "+describe(elem)

            # Walk children right-to-left so inserting at pos + 1 does not
            # shift positions that are still to be visited.
            for pos, child in reversed(list(enumerate(elem))):
                if child.tail and child.tail.strip():
                    p = fragment_fromstring('<p/>')
                    p.text = child.tail
                    child.tail = None
                    elem.insert(pos + 1, p)
                    #print "Inserted "+tounicode(p)+" to "+describe(elem)
                if child.tag == 'br':
                    #print 'Dropped <br> at '+describe(elem)
                    child.drop_tree()
Exemplo n.º 3
0
	def transform_misused_divs_into_paragraphs(self):
		"""Turn <div>s with no block-level children into <p>s, promote stray
		text/tails to explicit <p> nodes, and drop <br> separators."""
		for elem in self.tags(self.html, 'div'):
			# transform <div>s that do not contain other block elements into <p>s
			if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
				#self.debug("Altering %s to p" % (describe(elem)))
				elem.tag = "p"
				#print "Fixed element "+describe(elem)

		for elem in self.tags(self.html, 'div'):
			# Leading text becomes its own <p> child at position 0.
			if elem.text and elem.text.strip():
				p = fragment_fromstring('<p/>')
				p.text = elem.text
				elem.text = None
				elem.insert(0, p)
				#print "Appended "+tounicode(p)+" to "+describe(elem)

			# Reverse order keeps earlier positions valid while inserting.
			for pos, child in reversed(list(enumerate(elem))):
				if child.tail and child.tail.strip():
					p = fragment_fromstring('<p/>')
					p.text = child.tail
					child.tail = None
					elem.insert(pos + 1, p)
					#print "Inserted "+tounicode(p)+" to "+describe(elem)
				if child.tag == 'br':
					#print 'Dropped <br> at '+describe(elem) 
					child.drop_tree()
Exemplo n.º 4
0
    def get_lyrics(self):
        element = self.element

        # Replace <br> tags with \n (prepend it with \n and then remove all
        # occurrences of <br>)
        for br in element.cssselect('br'):
            br.tail = '\n' + br.tail if br.tail else '\n'
        etree.strip_elements(element, 'br', with_tail=False)

        # Remove unneeded tags
        bad_tags = element.cssselect('.rtMatcher') + \
            element.cssselect('.lyricsbreak')
        for tag in bad_tags:
            tag.drop_tree()

        # Remove HTML comments
        real_string = etree.tostring(element, encoding=unicode)
        cleaned_html = clean_html(real_string)

        # -KMS Modification-
        # Add try/except block to prevent script from crashing when
        # run from applescript
        try:
            print u'{0}'.format(
                html.fragment_fromstring(cleaned_html).text_content()
            ).encode('utf-8').strip()
        except UnicodeError:
            print u'{0}'.format(
                html.fragment_fromstring(cleaned_html).text_content()
            ).encode('utf-8').strip()
        return 0
Exemplo n.º 5
0
    def transform_misused_divs_into_paragraphs(self):
        """
        Transforms <div> without other block elements into <p>, merges near-standing <p> together.
        """
        for elem in self.tags(self._html, 'div'):
            # transform <div>s that do not contain other block elements into
            # <p>s
            # FIXME: The current implementation ignores all descendants that are not direct children of elem
            # This results in incorrect results in case there is an <img> buried within an <a> for example

            # Serialise only the children: tostring(elem) included the
            # element's own "<div" opening tag, which the block-element
            # regex matches, so no div was ever converted.  Sibling
            # implementations of this method join the children instead.
            if not REGEXES['divToPElementsRe'].search(
                    b''.join(map(tostring, list(elem))).decode()):
                elem.tag = "p"

        for elem in self.tags(self._html, 'div'):
            # Leading text becomes its own leading <p> child.
            if elem.text and elem.text.strip():
                p = fragment_fromstring('<p/>')
                p.text = elem.text
                elem.text = None
                elem.insert(0, p)

            # Reverse iteration keeps positions valid while inserting.
            for pos, child in reversed(list(enumerate(elem))):
                if child.tail and child.tail.strip():
                    p = fragment_fromstring('<p/>')
                    p.text = child.tail
                    child.tail = None
                    elem.insert(pos + 1, p)

                if child.tag == 'br':
                    child.drop_tree()
Exemplo n.º 6
0
def test_class_hits():
    """Class keywords on the hit lists adjust the node weight."""
    positive = fragment_fromstring('<div class="something post">Content</div>')
    assert get_class_weight(positive) == 25

    negative = fragment_fromstring('<div class="something comments">Content</div>')
    assert get_class_weight(negative) == -25
Exemplo n.º 7
0
    def test_is_unlikely(self):
        """Class/id keywords flag a node as unlikely content."""
        for markup in ('<div class="something comments">Content</div>',
                       '<div id="comments">Content</div>'):
            self.assertTrue(is_unlikely_node(fragment_fromstring(markup)))
Exemplo n.º 8
0
    def test_not_unlikely(self):
        """Nodes carrying positive keywords are not marked unlikely."""
        for markup in ('<div id="post">Content</div>',
                       '<div class="something post">Content</div>'):
            self.assertFalse(is_unlikely_node(fragment_fromstring(markup)))
Exemplo n.º 9
0
    def test_class_hits(self):
        """Class keywords on the hit lists adjust the node weight."""
        cases = (('<div class="something post">Content</div>', 25),
                 ('<div class="something comments">Content</div>', -25))
        for markup, expected in cases:
            self.assertEqual(
                get_class_weight(fragment_fromstring(markup)), expected)
Exemplo n.º 10
0
    def _summary(self, enclose_with_html_tag=True):
        """Build the Summary for the parsed document.

        With the "multipage" option set, stitches successive pages into one
        <div id="article"> container; otherwise delegates to get_article
        and re-parses the raw input as a fallback when the result looks
        broken (placeholder title or very short html).
        """
        # the first page parsed into a elementree element
        doc = self.html

        # the set of urls we've processed so far
        parsed_urls = set()
        url = self.options.get("url", None)
        if url is not None:
            parsed_urls.add(url)

        # check the current doc for a next page if requested
        if self.options.get("multipage", False):
            next_page_url = find_next_page_url(parsed_urls, url, doc)

            page_0 = get_article(doc, self.options)
            page_0_doc = fragment_fromstring(page_0.html)
            page_index = 0
            make_page_elem(page_index, page_0_doc)

            if enclose_with_html_tag:
                # document_fromstring wraps the div in <html><body>; the
                # inner div becomes the article container.
                output = document_fromstring("<div/>")
                output.getchildren()[0].attrib["id"] = "article"
                output.getchildren()[0].append(page_0_doc)
            else:
                output = fragment_fromstring("<div/>")
                output.attrib["id"] = "article"
                output.append(page_0_doc)

            if next_page_url is not None:
                append_next_page(parsed_urls, page_index + 1, next_page_url, output, self.options)
            return Summary(
                tostring(output),
                page_0.confidence,
                short_title=shorten_title(output),
                title=get_title(output),
                description=get_description(output),
                keywords=get_keywords(output),
            )

        summary = get_article(doc, self.options, enclose_with_html_tag=enclose_with_html_tag)
        # NOTE(review): debug print left in — consider removing.
        print(len(summary.html), "============================")
        if summary.title == "[something-wrong]" or len(summary.html) < 500:
            # Fallback: re-parse the raw input and strip unlikely nodes.
            output = parse(self.input_doc, self.options.get("url"))
            remove_unlikely_candidates(output)
            # NOTE(review): debug file written to CWD and the handle is
            # never closed — confirm this is intentional.
            o = open("something-wrong.txt", "w")
            print("[something-wrong]", tostring(output), file=o)
            return Summary(
                get_clean_html(output),
                0,
                short_title=shorten_title(output),
                title=get_title(output),
                description=get_description(output),
                keywords=get_keywords(output),
            )
        else:
            return summary
Exemplo n.º 11
0
    def test_equal_hashes(self):
        """Identical DOM trees — and two None inputs — hash identically."""
        markup = "<div>ľščťžýáí</div>"
        self.assertEqual(generate_hash_id(fragment_fromstring(markup)),
                         generate_hash_id(fragment_fromstring(markup)))

        self.assertEqual(generate_hash_id(None), generate_hash_id(None))
Exemplo n.º 12
0
def test_equal_hashes():
    """Identical DOM trees — and two None inputs — hash identically."""
    markup = "<div>ľščťžýáí</div>"
    assert generate_hash_id(fragment_fromstring(markup)) == \
        generate_hash_id(fragment_fromstring(markup))

    assert generate_hash_id(None) == generate_hash_id(None)
Exemplo n.º 13
0
def test_id_hits():
    """Id keywords on the hit lists adjust the node weight."""
    cases = (('<div id="post">Content</div>', 25),
             ('<div id="comments">Content</div>', -25))
    for markup, expected in cases:
        assert get_class_weight(fragment_fromstring(markup)) == expected
Exemplo n.º 14
0
def insert_into_last_element(html, element):
    """
    function to insert an html element into another html fragment
    example:
        html = '<p>paragraph1</p><p>paragraph2...</p>'
        element = '<a href="/read-more/">read more</a>'
        ---> '<p>paragraph1</p><p>paragraph2...<a href="/read-more/">read more</a></p>'
    """
    try:
        item = fragment_fromstring(element)
    except (ParserError, TypeError):
        # The original `except ParserError, TypeError:` bound TypeError as
        # the exception variable instead of catching it (and is a syntax
        # error on Python 3); a tuple catches both types.
        item = fragment_fromstring('<span></span>')
Exemplo n.º 15
0
    def test_scores_collide(self):
        """Positive and negative keyword scores combine.

        Scoring is done independently for positive and negative keyword
        hits, so a node matching both can have them cancel out.
        """
        colliding = fragment_fromstring(
            '<div id="post" class="something comment">Content</div>')
        self.assertEqual(get_class_weight(colliding), 0)

        double_positive = fragment_fromstring(
            '<div id="post" class="post comment">Content</div>')
        self.assertEqual(get_class_weight(double_positive), 25)
Exemplo n.º 16
0
def format_comments(comments=None, article_id=None):
	"""Render the HTML comment table for an article.

	Builds header/footer rows from the comment-table template, one row per
	comment (owners shown as [nickname], deleted comments use the special
	row), plus a trailing hidden empty comment row.
	"""
	if comments is None:
		# len(comments) and the loop below raise TypeError on the default
		# None; treat it as "no comments".
		comments = []
	template_data = {
		  'user_activity': '',
		  'article_id': article_id,}
#todo - build comment tree by replacing and adding.
#todo - add report abuse.
	path = os.path.join(os.path.dirname(__file__), 'comment-table-template.html' )
	all_comments = '<div class="below-video comments">Comments:<table>'
	template_data.update({'comment_id': len(comments)})
	tree = fragment_fromstring(template.render(path, template_data), create_parent=False)
	all_comments += tostring(tree.xpath('//tfoot')[0])#needs better element addressing
	all_comments += '<tbody id="comment-table-' + str(article_id) + '">'
	comment_id = 0
	user = oAuthUsers.get_current_user()
	for comment in comments:
		nickname = str(loads(str(comment))[1]).split('@',2)[0]
		dispNickname = nickname
		if user:
			#The display nickname will break the code to comment, leave as is
			#if the author actually matches up
			if user.isAuthor(comment=comment):
				dispNickname = nickname
			#Make it obvious who is the owner
			elif dispNickname != '':
				dispNickname = '['+nickname+']'
		template_data.update({
			'comment_id': str(comment_id),
			'comment_display': loads(str(comment))[0],
			'nickname': dispNickname,
			'comment_date': loads(str(comment))[2],
			'time_now': datetime.now(),
			'user_url': 'by-author?author='+urllib.quote(nickname),
			})
		tree = fragment_fromstring(template.render(path, template_data), create_parent=False)
		if nickname != '':
			all_comments += tostring(tree.xpath('//tr')[1])
		else:
			all_comments += tostring(tree.xpath('//tr')[2]) #deleted comment tr
		comment_id += 1

	#place an empty hidden comment last
	template_data.update({'comment_id': len(comments)})
	tree = fragment_fromstring(template.render(path, template_data), create_parent=False)
	all_comments += tostring(tree.xpath('//tr')[3]) #hidden comment tr
	all_comments += '</tbody></table></div>'
	return all_comments
Exemplo n.º 17
0
 def adjust_dom(cls, root):
     """ adjust paged dom.
         1. add id for navigationBar
         2. generate shadow node of navigationBar
     """
     for seq, bar in enumerate(root.find_class("dnavb"), start=1):
         bar.set("id", "%s_%d" % ("dnavb", seq))
         bar.set("class", bar.get("class", "") + " dnavh")
         # Shadow node holds the first three links plus an ellipsis anchor.
         shadow = p.fragment_fromstring('<div class="dnavg show"></div>')
         bar.insert(0, shadow)
         for anchor in bar.findall(".//a")[:3]:
             shadow.append(copy.deepcopy(anchor))
         shadow.append(p.fragment_fromstring("<a>...</a>"))
Exemplo n.º 18
0
def test_kwargs():
    """parent_tag/css_class kwargs: only the matching parent div is tagged."""
    template = '''
        {% activeurl parent_tag='div' css_class='current' %}
            <div>
                <div>
                    <a href="/other_page/">other_page</a>
                </div>
                <div>
                    <a href="/page/">page</a>
                </div>
            </div>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/page/')})
    divs = fragment_fromstring(rendered).xpath('//div')

    assert divs[-1].attrib.get('class', False)
    assert 'current' == divs[-1].attrib['class']

    for other in divs[:-1]:
        assert not other.attrib.get('class', False)
Exemplo n.º 19
0
    def asHTML(self):
        """Render the staff-calendar portlet as pretty-printed HTML."""
        # The network events portlet is different.  Everything is different.
        portlet = fragment_fromstring('<div class="generic-portlet"/>')
        heading = SubElement(portlet, 'h3')
        heading.text = "Staff Calendar"

        if self.entries:
            listing = SubElement(portlet, 'ul', id='calendar_portlet')
            for entry in self.entries:
                item = SubElement(listing, 'li')

                when = SubElement(item, 'span')
                when.text = entry['startDate'].strftime('%m/%d/%Y')
                what = SubElement(item, 'span')
                what.set('class', 'event_title')
                link = SubElement(what, 'a',
                                  href=entry['href'],
                                  style='text-decoration:none')
                link.text = entry['title']
        else:
            empty = SubElement(portlet, 'p')
            empty.text = "No entries found"

        # Trailing "more" link.
        more_para = SubElement(portlet, 'p')
        more_para.set('class', 'more')
        more_link = SubElement(more_para, 'a', href=self.href)
        more_link.text = 'MORE'

        return tostring(portlet, pretty_print=True)
Exemplo n.º 20
0
 def summary(self):
     """Return the article summary for the fetched document.

     NOTE(review): the early ``return page_0`` fires whenever the first
     page produced any html, so the multi-page stitching below only runs
     when page_0.html is empty — in which case fragment_fromstring('')
     would likely fail.  Confirm whether the early return is a temporary
     short-circuit (its comment suggests so).
     """
     doc = self._html(True)
     # URLs already processed, to avoid loops when following "next" links.
     parsed_urls = set()
     url = self.options['url']
     if url is not None:
         parsed_urls.add(url)
     page_0 = get_article(doc, self.options)
     if page_0.html:
         # we fetch page_0 only for now.
         return page_0
     next_page_url = find_next_page_url(parsed_urls, url, doc)
     page_0_doc = fragment_fromstring(page_0.html)
     page_index = 0
     make_page_elem(page_index, page_0_doc)
     # Wrap the first page in <div id="article"> for output.
     article_doc = B.DIV(page_0_doc)
     article_doc.attrib['id'] = 'article'
     if next_page_url is not None:
         append_next_page(
                 get_article,
                 parsed_urls,
                 page_index + 1,
                 next_page_url,
                 article_doc,
                 self.options
                 )
     return Summary(page_0.confidence, tostring(article_doc))
Exemplo n.º 21
0
def citation2latex(s):
    """Convert ``data-cite`` HTML tags in Markdown source to LaTeX cites.

    ``<cite data-cite="granger">(Granger, 2013)</cite>`` becomes
    ``\\cite{granger}``.  Any tag carrying a ``data-cite`` attribute is
    handled, so the citation may be formatted with arbitrary HTML.
    Returns *s* unchanged when lxml is unavailable.
    """
    try:
        from lxml import html
    except ImportError:
        return s

    tree = html.fragment_fromstring(s, create_parent='div')
    _process_node_cite(tree)
    result = html.tostring(tree)
    # Peel off the synthetic <div> wrapper added by create_parent.
    prefix, suffix = '<div>', '</div>'
    if result.endswith(suffix):
        result = result[:-len(suffix)]
    if result.startswith(prefix):
        result = result[len(prefix):]
    return result
Exemplo n.º 22
0
    def asHTML(self):
        """Use lxml to generate a customizable via adapter representation"""

        # Fixed: the original fragment contained a doubled quote
        # ('class="generic-portlet""'), producing malformed markup.
        portlet = fragment_fromstring('<div class="generic-portlet"/>')
        heading = SubElement(portlet, 'h3')
        heading.text = self.context.title

        # Now the entries
        entries = self.entries
        if entries:
            for entry in self.entries:
                item = SubElement(portlet, 'p')
                item_a = SubElement(item, 'a', href=entry['href'])
                item_a.text = entry['title']
        else:
            msg = SubElement(portlet, 'p')
            msg.text = "No entries found"

        # Close out with the more link
        more = SubElement(portlet, 'p')
        more.set('class', 'more')
        more_a = SubElement(more, 'a', href=self.href)
        more_a.text = 'MORE ' + self.title

        return tostring(portlet, pretty_print=True)
Exemplo n.º 23
0
def test_no_parent_submenu():
    """parent_tag='self': the <a> elements themselves receive the class."""
    template = '''
        {% activeurl parent_tag='self' %}
            <div>
                <a href="/menu/">menu</a>
                <hr>
                <a href="/menu/submenu/">submenu</a>
                <hr>
                <a href="/menu/other_submenu/">other_submenu</a>
            </div>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/menu/submenu/')})
    anchors = fragment_fromstring(rendered).xpath('//a')

    # Both the menu and the matching submenu anchors are active.
    for hit in anchors[:2]:
        assert hit.attrib.get('class', False)
        assert 'active' == hit.attrib['class']

    assert not anchors[2].attrib.get('class', False)
Exemplo n.º 24
0
def get_article(candidates, best_candidate):
    """Assemble the article <div> from the best candidate and siblings
    that look like related content.

    candidates -- mapping of element -> {'content_score': ...}
    best_candidate -- dict with 'elem' and 'content_score'
    Returns a new <div> element containing the selected siblings.
    """
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related.
    # Things like preambles, content split by ads that we removed, etc.
    sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
    # create a new html document with a html->body->div
    output = fragment_fromstring('<div/>')
    best_elem = best_candidate['elem']
    for sibling in best_elem.getparent().getchildren():
        # in lxml there no concept of simple text
        # if isinstance(sibling, NavigableString): continue
        append = False
        if sibling is best_elem:
            append = True
        sibling_key = sibling  # HashableElement(sibling)
        if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
            append = True

        if sibling.tag == "p":
            link_density = get_link_density(sibling)
            node_content = sibling.text or ""
            node_length = len(node_content)

            if node_length > 80 and link_density < 0.25:
                append = True
            # Raw string: '\.' in a plain literal triggers the
            # invalid-escape-sequence DeprecationWarning.
            elif node_length <= 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                append = True

        if append:
            # We don't want to append directly to output, but the div
            # in html->body->div
            output.append(sibling)
    return output
Exemplo n.º 25
0
def test_non_active_root():
    """The root url link must not be marked active for a sub-page request."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="/">root</a>
                </li>
                <li>
                    <a href="/page/">page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/page/')})
    items = fragment_fromstring(rendered).xpath('//li')

    assert not items[0].attrib.get('class', False)

    assert items[1].attrib['class'] == 'active'
Exemplo n.º 26
0
def test_submenu_no_menu():
    """menu='no': only the exact submenu match is activated."""
    template = '''
        {% activeurl menu='no' %}
            <ul>
                <li>
                    <a href="/menu/submenu/">submenu</a>
                </li>
                <li>
                    <a href="/menu/other_submenu/">other_submenu</a>
                </li>
                <li>
                    <a href="/menu/">menu</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/menu/submenu/')})
    items = fragment_fromstring(rendered).xpath('//li')

    assert items[0].attrib.get('class', False)
    assert items[0].attrib['class'] == 'active'

    for other in items[1:]:
        assert not other.attrib.get('class', False)
Exemplo n.º 27
0
def test_kwargs_multiple_urls():
    """A parent with several links is tagged when any of them matches."""
    template = '''
        {% activeurl parent_tag='p' css_class='highlight' %}
            <div>
                <p>
                    <a href="/other_page/">other_page</a>
                </p>
                <p>
                    <a href="/page/">page</a>
                    <br>
                    <a href="/other_page/">other_page</a>
                </p>
            </div>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/page/')})
    paragraphs = fragment_fromstring(rendered).xpath('//p')

    assert paragraphs[1].attrib.get('class', False)
    assert 'highlight' == paragraphs[1].attrib['class']

    assert not paragraphs[0].attrib.get('class', False)
Exemplo n.º 28
0
def test_basic_again_test_default_settings():
    """Default settings: the matching link's parent <li> becomes active."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="/page/">page</a>
                </li>
                <li>
                    <a href="/other_page/">other_page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/page/')})
    items = fragment_fromstring(rendered).xpath('//li')

    assert items[0].attrib.get('class', False)
    assert 'active' == items[0].attrib['class']

    assert not items[1].attrib.get('class', False)
Exemplo n.º 29
0
def test_disabled_menu_root_path():
    """menu='no' with a root-path request still activates the root link."""
    template = '''
        {% activeurl menu='no' %}
            <ul>
                <li>
                    <a href="/">root</a>
                </li>
                <li>
                    <a href="/other_page/">other_page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/')})
    items = fragment_fromstring(rendered).xpath('//li')

    assert items[0].attrib.get('class', False)
    assert 'active' == items[0].attrib['class']

    assert not items[1].attrib.get('class', False)
Exemplo n.º 30
0
def plain2(text):
    """Strip all markup from *text*, returning only the flattened text."""
    wrapper = fragment_fromstring(text, create_parent="div")

    # drop_tag removes each element but merges its text into the parent.
    for node in wrapper.iterdescendants():
        node.drop_tag()

    return wrapper.text
Exemplo n.º 31
0
def html_to_lxml(raw):
    """Parse an HTML fragment into a namespaced XML element.

    Wraps *raw* in an xhtml <div>; if strict XML parsing fails, retries
    after stripping colon-containing (namespace-like) attributes, and
    finally falls back to the HTML4 parser.
    """
    raw = '<div>%s</div>' % raw
    root = html.fragment_fromstring(raw)
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding=None)
    try:
        return safe_xml_fromstring(raw, recover=False)
    except Exception:
        # Bare except replaced: except Exception still catches every parse
        # failure without swallowing KeyboardInterrupt/SystemExit.
        # Attributes with ':' look like undeclared namespace prefixes to
        # the XML parser; drop them and retry.
        for x in root.iterdescendants():
            remove = []
            for attr in x.attrib:
                if ':' in attr:
                    remove.append(attr)
            for a in remove:
                del x.attrib[a]
        raw = etree.tostring(root, encoding=None)
        try:
            return safe_xml_fromstring(raw, recover=False)
        except Exception:
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
Exemplo n.º 32
0
def test_empty_css_class():
    """An existing empty class attribute still receives 'active'."""
    template = '''
        {% activeurl %}
            <ul>
                <li class="">
                    <a href="/page/">page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/page/')})
    item = fragment_fromstring(rendered).xpath('//li')[0]

    assert item.attrib.get('class', False)
    assert 'active' == item.attrib['class']
def read_homer_table(fn):
    """Load the HOMER results table from *fn* as an HTML string.

    Rewrites relative 'homerResults' links to absolute paths, drops the
    last two cells of every row, and blanks rows from the fifth onward.
    """
    parent_dir = os.path.dirname(os.path.realpath(fn))
    with open(fn, 'r') as handle:
        markup = ''.join(handle.readlines())
    soup = bs.BeautifulSoup(markup, 'lxml')
    table_markup = str(soup.find('table')).replace(
        'homerResults', os.path.join(parent_dir, 'homerResults'))

    table = html.fragment_fromstring(table_markup)
    for row_num, row in enumerate(table.iterchildren(), start=1):
        row.remove(row.getchildren()[-1])
        row.remove(row.getchildren()[-1])
        if row_num >= 5:
            row.clear()

    return str(html.tostring(table, encoding='unicode', with_tail=False))
Exemplo n.º 34
0
def test_nested_submenu():
    """Nested parent divs: menu and matching submenu active, rest not."""
    template = '''
        {% activeurl parent_tag="div" %}
            <div>
                <div>
                    <a href="/menu/">menu</a>
                    <div>
                        <a href="/menu/submenu/">submenu</a>
                    </div>
                    <div>
                        <a href="/menu/other_submenu/">other_submenu</a>
                    </div>
                </div>
            </div>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/menu/submenu/')})
    divs = fragment_fromstring(rendered).xpath('//div')

    # The menu wrapper (1) and the matching submenu wrapper (2) are active.
    for hit in (divs[1], divs[2]):
        assert hit.attrib.get('class', False)
        assert 'active' == hit.attrib['class']

    # The other submenu (3) and the outermost wrapper (0) are untouched.
    for miss in (divs[3], divs[0]):
        assert not miss.attrib.get('class', False)
Exemplo n.º 35
0
    def asHTML(self):
        """Render the network-events portlet as pretty-printed HTML."""
        # The network events portlet is different.  Everything is different.
        portlet = fragment_fromstring('<div class="generic-portlet"/>')
        heading = SubElement(portlet, 'h3')
        heading.text = self.context.title

        if self.entries:
            listing = SubElement(portlet, 'ul', id='events_portlet')
            for entry in self.entries:
                item = SubElement(listing, 'li')

                when = SubElement(item, 'span')
                when.text = entry['startDate'].strftime('%m/%d/%Y')
                when.set('class', 'globalize-short-date')
                what = SubElement(item, 'span')
                what.set('class', 'event_title')
                link = SubElement(what,
                                  'a',
                                  href=entry['href'],
                                  style='text-decoration:none')
                link.text = entry['title']
        else:
            empty = SubElement(portlet, 'p')
            empty.text = "No entries found"

        # Trailing "more" link.
        more_para = SubElement(portlet, 'p')
        more_para.set('class', 'more')
        more_link = SubElement(more_para, 'a', href=self.href)
        more_link.text = 'MORE'

        return tostring(portlet, pretty_print=True)
Exemplo n.º 36
0
 def __init__(self, id, title, url, author, summary, published, content):
     """Normalise feed-article fields: clean the title, decode author and
     summary, derive a plain-text summary, and compute UTC/local times."""
     from lxml import html
     self.downloaded = False
     self.id = id
     if not title or not isinstance(title, string_or_bytes):
         title = _('Unknown')
     title = force_unicode(title, 'utf-8')
     self._title = clean_xml_chars(title).strip()
     try:
         # Collapse HTML entities (&amp; etc.) into their characters.
         self._title = re.sub(r'&(\S+?);',
             entity_to_unicode, self._title)
     except:
         pass
     self._title = clean_ascii_chars(self._title)
     self.url = url
     self.author = author
     self.toc_thumbnail = None
     self.internal_toc_entries = ()
     if author and not isinstance(author, str):
         author = author.decode('utf-8', 'replace')
     if summary and not isinstance(summary, str):
         summary = summary.decode('utf-8', 'replace')
     summary = clean_xml_chars(summary) if summary else summary
     self.summary = summary
     if summary and '<' in summary:
         # Summary looks like HTML: strip the markup for text_summary.
         try:
             s = html.fragment_fromstring(summary, create_parent=True)
             summary = html.tostring(s, method='text', encoding='unicode')
         except:
             print('Failed to process article summary, deleting:')
             print(summary.encode('utf-8'))
             traceback.print_exc()
             summary = ''
     self.text_summary = clean_ascii_chars(summary)
     self.author = author
     self.content = content
     self.date = published
     self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
     self.localtime = self.utctime.astimezone(local_tz)
     self._formatted_date = None
Exemplo n.º 37
0
def test_ignore_href_only_hash():
    """A link whose href is just '#' must never be marked active."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="#">page</a>
                </li>
                <li>
                    <a href="/other_page/">other_page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''

    rendered = render(
        template, {'request': requests.get('/page/?foo=bar&bar=foo')})

    items = fragment_fromstring(rendered).xpath('//li')

    # Neither the '#' link nor the non-matching link gains a class.
    for item in items[:2]:
        assert not item.attrib.get('class', False)
Exemplo n.º 38
0
    def get_value(self, context):
        """Build a nested table-of-contents tree from headings in the HTML.

        Renders ``self.src(context)``, finds h1..h7 elements and nests them
        by heading level.

        :param context: rendering context passed to ``self.src``.
        :returns: list of ``{"level", "title", "children"}`` dicts (the
            children of a virtual level-0 root).
        """
        html = self.src(context)
        html_root = fragment_fromstring(html, create_parent=True)
        selector = CSSSelector("h1,h2,h3,h4,h5,h6,h7")

        # Virtual root so the nesting loop never runs off the list.
        root = [{"level": 0, "children": []}]

        for h in selector(html_root):
            if not h.text:
                continue
            # BUG FIX: element .tag is str on Python 3 (bytes only on
            # Python 2); calling .decode unconditionally raised
            # AttributeError on py3. Decode only when needed.
            tag = h.tag
            if not isinstance(tag, text_type):
                tag = tag.decode("utf-8")
            level = int(tag[1:])
            title = h.text
            if not isinstance(title, text_type):
                title = title.decode("utf-8")

            # Descend to the deepest open branch shallower than this level.
            depth = root
            while depth and level > depth[-1]["level"]:
                depth = depth[-1]["children"]

            depth.append({"level": level, "title": title, "children": []})

        return root[0]["children"]
Exemplo n.º 39
0
def build_base_document(html, fragment=True):
    """Return a base document with the body as root.

    :param html: Parsed Element object
    :param fragment: Should we return a <div> doc fragment or a full <html>
    doc.

    """
    # Locate the node that will act as the readability body.
    if html.tag == 'body':
        html.tag = 'div'
        body = html
    else:
        body = html.find('.//body')

    if body is None:
        # No <body> anywhere: wrap the whole tree in a fresh <div>.
        body = fragment_fromstring('<div/>')
        body.set('id', 'readabilityBody')
        body.append(html)
    else:
        body.tag = 'div'
        body.set('id', 'readabilityBody')

    if fragment:
        output = body
    else:
        # Graft the body div into a fresh full HTML document.
        output = fromstring(BASE_DOC)
        output.find('.//body').append(body)

    output.doctype = "<!DOCTYPE html>"
    return output
Exemplo n.º 40
0
def test_html_simplest():
    """A single-tag document renders one indented index entry with its link."""
    index_text = trim("""
            My Document!

            It contains a #[Tag] and a %[Tag].

            ` Part One
        """)
    part_one_text = trim("""
            Or an #[alias: Tag, subtag]?
        """)
    parts = {'index': index_text, 'part-one': part_one_text}

    index = Index(Outline(parts, default_counters()))
    index.tags = {'tag': {'subtag': {'1.1': ['LINK']}}}

    out = index.html()
    body = html.fragment_fromstring(out, create_parent='body')[0]

    assert len(body.cssselect('div.indent-first-line')) == 1
    assert 'LINK' in out
Exemplo n.º 41
0
    def fix_links(self, content, absolute_prefix):
        """
        Rewrite relative links to be absolute links based on certain URL.

        @param content: HTML snippet as a string
        @param absolute_prefix: base URL that relative src/href are joined to
        @return: rewritten HTML serialized as UTF-8 bytes
        """
        # Accept bytes or text. On Python 2 ``str`` is bytes, so this is
        # equivalent to the old ``type(content) == str`` check; on Python 3
        # it fixes an AttributeError (str has no .decode()).
        if isinstance(content, bytes):
            content = content.decode("utf-8")

        content = content.strip()

        tree = html.fragment_fromstring(content, create_parent=True)

        def join(base, url):
            """Join a relative URL against *base*; pass absolute URLs through."""
            if not (url.startswith("/") or "://" in url):
                return urlparse.urljoin(base, url)
            else:
                # Already absolute
                return url

        # Absolutize every src= and href= attribute in the fragment.
        for node in tree.xpath('//*[@src]'):
            node.set('src', join(absolute_prefix, node.get('src')))
        for node in tree.xpath('//*[@href]'):
            node.set('href', join(absolute_prefix, node.get('href')))

        return etree.tostring(tree, pretty_print=False, encoding="utf-8")
Exemplo n.º 42
0
def test_kwargs_multiple_urls_nested_tags():
    """With parent_tag='tr', only the row holding the matching link gets
    the configured css_class."""
    template = '''
        {% activeurl parent_tag='tr' css_class='active_row' %}
            <div>
                <table>
                    <tr>
                        <td>
                            <a href="/page/">page</a>
                        </td>
                        <td>
                            <a href="/other_page/">other_page</a>
                        </td>
                    </tr>
                    <tr>
                        <td>
                            <a href="/other_page/">other_page</a>
                        </td>
                    </tr>
                </table>
            </div>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/page/')})

    rows = fragment_fromstring(rendered).xpath('//tr')

    first_row, second_row = rows[0], rows[1]

    # Row containing the /page/ link is flagged with the custom class.
    assert first_row.attrib.get('class', False)
    assert 'active_row' == first_row.attrib['class']

    # Row without a matching link stays untouched.
    assert not second_row.attrib.get('class', False)
Exemplo n.º 43
0
 def exportSolution(self, parent, solution):
     """:returns: An XML node with the details
     of an :obj:`euphorie.content.solution`."""
     node = etree.SubElement(parent, "solution")
     if getattr(solution, "external_id", None):
         node.attrib["external-id"] = solution.external_id
     etree.SubElement(node, "description").text = StripUnwanted(
         solution.description)
     stripped_action = StripUnwanted(solution.action)
     # For eTranslate-compatible exports render the action markdown to
     # HTML and embed it as an <action> element tree; otherwise export
     # the plain stripped text.
     if ISolution.providedBy(solution) and self.is_etranslate_compatible:
         solution_view = api.content.get_view(context=solution,
                                              name="nuplone-view",
                                              request=self.request)
         # Markdown treats single newlines as soft breaks; make them
         # explicit <br/> so line structure survives rendering.
         action_with_br = stripped_action.replace("\n", "<br/>")
         action_html = solution_view.render_md(action_with_br)
         # Parse the rendered HTML under a synthetic <action> parent.
         fragment = html.fragment_fromstring(action_html, "action")
         node.append(fragment)
     else:
         etree.SubElement(node, "action").text = stripped_action
     if solution.requirements:
         etree.SubElement(node, "requirements").text = StripUnwanted(
             solution.requirements)
     return node
Exemplo n.º 44
0
def append_next_page(get_article_func, parsed_urls, page_index, page_url, doc,
                     options):
    """Fetch *page_url*, extract its article body and append it to *doc*.

    Recurses onto the following page (if one is found) until MAX_PAGES is
    reached. Fetch failures are logged and stop the pagination quietly.

    :param get_article_func: callable(doc, options) -> article object with
        an ``html`` attribute.
    :param parsed_urls: set of already-visited URLs (dedup guard).
    :param page_index: zero-based index of this page in the chain.
    :param doc: element the extracted page content is appended to.
    :param options: dict; ``options['urlfetch']`` does the fetching.
    """
    # Lazy %-style logging args instead of eager string interpolation.
    logging.debug('appending next page: %s', page_url)

    if page_index >= MAX_PAGES:
        return

    fetcher = options['urlfetch']
    try:
        html = fetcher.urlread(page_url)
    except Exception:
        # Best effort: log with traceback and stop paginating here.
        logging.warning('exception fetching %s', page_url, exc_info=True)
        return
    orig_page_doc = parse(html, page_url)
    next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
    page_article = get_article_func(orig_page_doc, options)
    page_doc = fragment_fromstring(page_article.html)
    make_page_elem(page_index, page_doc)
    # Skip pages whose content duplicates what we already collected.
    if not is_suspected_duplicate(doc, page_doc):
        doc.append(page_doc)
        if next_page_url is not None:
            append_next_page(get_article_func, parsed_urls, page_index + 1,
                             next_page_url, doc, options)
Exemplo n.º 45
0
    def get_article(self, candidates, best_candidate):
        """Assemble the article from the best candidate and related siblings.

        Siblings of the winning element (preambles, content split by
        removed ads, etc.) are kept when they score well enough or look
        like real paragraphs.
        """
        # A sibling must reach 20% of the winner's score (at least 10).
        sibling_score_threshold = max(10, best_candidate.score * 0.2)
        # Fresh <div> that will hold the assembled article.
        output = fragment_fromstring("<div/>")
        parent = best_candidate.elem.getparent()
        if parent is None:
            siblings = [best_candidate.elem]
        else:
            siblings = parent.getchildren()

        for sibling in siblings:
            # lxml has no simple-text nodes, so every sibling is an element.
            keep = sibling == best_candidate.elem
            if not keep and sibling in candidates:
                keep = candidates[sibling].score >= sibling_score_threshold
            if not keep and sibling.tag == "p":
                density = self.get_link_density(sibling)
                text = sibling.text or ""
                if len(text) > 80:
                    # Long paragraph: accept unless it is mostly links.
                    keep = density < 0.25
                elif density == 0 and re.search(r"\.( |$)", text):
                    # Short, link-free paragraph ending a sentence.
                    keep = True
            if keep:
                output.append(sibling)
        return output
Exemplo n.º 46
0
    def parse(self,
              source,
              classname=DEFAULT_CLASS_NAME,
              use_cache=True,
              language=''):
        """Parses input HTML code into word chunks and organized code.

    Args:
      source: HTML code to be processed (unicode).
      classname: A class name of each word chunk in the HTML code (string).
      use_cache: Whether to use cache (boolean).
      language: A language used to parse text (string).

    Returns:
      A dictionary with the list of word chunks and organized HTML code.
    """
        cache_key = None
        if use_cache:
            # Return a previously computed result when available.
            cache_key = self._get_cache_key(source, classname)
            cache_shelve = shelve.open(CACHE_FILE_NAME)
            cached = cache_shelve.get(cache_key, None)
            cache_shelve.close()
            if cached:
                return cached

        # Segment the text, then merge chunks in several passes.
        source = self._preprocess(source)
        dom = html.fragment_fromstring(source, create_parent='body')
        chunks = self._get_source_chunks(dom.text_content(), language)
        chunks = self._concatenate_punctuations(chunks)
        chunks = self._concatenate_by_label(chunks, True)
        chunks = self._concatenate_by_label(chunks, False)
        chunks = self._migrate_html(chunks, dom)
        result_value = {
            'chunks': chunks,
            'html_code': self._spanize(chunks, classname),
        }

        if use_cache:
            cache_shelve = shelve.open(CACHE_FILE_NAME)
            cache_shelve[cache_key] = result_value
            cache_shelve.close()
        return result_value
Exemplo n.º 47
0
def parse_angaben(engine, data):
    """Parse the 'angaben' HTML snippet of a member record and upsert one
    row per declared client/service into the 'angaben' table.

    Client lines may wrap across two elements; ``wrapped_name`` carries the
    partial name over to the next iteration.

    :param engine: database engine passed to the ``sl`` helpers.
    :param data: dict with 'angaben' (HTML string) and 'source_url'.
    """
    if not data.get('angaben'):
        return
    # Wrap in a synthetic root so the flat element run can be parsed.
    snippet = '<x>' + data['angaben'] + '</x>'
    doc = html.fragment_fromstring(snippet)
    table = sl.get_table(engine, 'angaben')
    data = {'source_url': data['source_url']}
    wrapped_name = False
    for el in doc:
        if el.tag == 'h3':
            wrapped_name = False
            data['section'] = el.text.split('. ', 1)[-1]
        elif el.tag == 'strong' or not el.text or not el.get('class'):
            continue
        elif 'voa_abstand' in el.get('class') or wrapped_name:
            client = el.text
            if wrapped_name:
                client = data['client'] + ' ' + client
            # BUG FIX: the stripped value was previously discarded
            # (``client.strip().strip(',')`` stood alone as a no-op);
            # assign it so the cleaned name is stored and split.
            client = client.strip().strip(',')
            data['client'] = client
            els = client.rsplit(',', 2)
            if len(els) == 3:
                wrapped_name = False
                data['client_name'] = els[0].strip()
                data['client_city'] = els[1].strip()
            else:
                # Name continues on the next element.
                wrapped_name = True
                continue
        else:
            data['service'] = el.text
            data['level'] = 'Stufe 0'
            for name in LEVELS:
                if name.lower() in data['service'].lower():
                    data['level'] = name
            sl.upsert(engine, table, data,
                ['source_url', 'section', 'client', 'service'])
Exemplo n.º 48
0
def data_from_table(table):
    """Extract stops, sorted start times and header metadata from a
    timetable element into one dict."""
    data_tables = table.xpath("table[@align='center']")
    stops_table, time_table = data_tables[0], data_tables[1]

    stops = [row[1].text_content() for row in stops_table.xpath("tr")]

    start_times = []
    for cell in time_table.xpath("tr/td"):
        digits = cell.text_content().replace(":", "")
        if digits != '':
            start_times.append(int(digits))

    # Turn the header markup into cleaned text lines: <br> becomes a
    # newline, then artifacts of wrapping ('To\n', '-\n') and dashes are
    # removed.
    header = table.xpath("div[@class='enTripGroupInfo']")[0]
    raw = html.tostring(header).replace("\n", " ").replace("<br>", "\n")
    header_text = html.fragment_fromstring(raw).text_content()
    header_text = header_text.replace('To\n', 'To')
    header_text = header_text.replace('-\n', '-').replace('-', '')
    data_lines = [h.strip() for h in header_text.split('\n')]

    data = {'stops': stops, 'start_times': sorted(start_times)}
    for line in data_lines:
        parts = line.split(":")
        data[parts[0].strip()] = parts[1].strip()
    return data
Exemplo n.º 49
0
def sanitize_richtext(text):
    """Sanitize rich-text HTML with lxml's Cleaner and/or bleach.

    Each sanitizer runs only when its config in ``defaults`` is truthy;
    a missing library is reported (in DEBUG) instead of raising.
    """
    lxml_config = defaults.DJANGOCMS_BASEPLUGINS_LXML_CLEANER_CONFIG
    if lxml_config:
        if lxml_clean:
            cleaner = lxml_clean.Cleaner(**lxml_config)
            wrapped = fragment_fromstring("<div>" + text + "</div>")
            wrapped = cleaner.clean_html(wrapped)
            text = tostring(wrapped, encoding='unicode')
            # clean_html keeps the wrapper <div>; strip it back off.
            if text.startswith('<div>'):
                text = text[len('<div>'):-len('</div>')]
        elif settings.DEBUG:
            print(
                "lxml is not installed, but should be, for sanitizing richtext content!"
            )
    bleach_config = defaults.DJANGOCMS_BASEPLUGINS_BLEACH_CONFIG
    if bleach_config:
        if bleach:
            text = bleach.clean(text, **bleach_config)
        elif settings.DEBUG:
            print(
                "bleach is not installed, but should be, for sanitizing richtext content!"
            )
    return text
Exemplo n.º 50
0
    def generate_markup_fragment(
        name_dict=None,
        langs=None,
        url='test-url',
        saveValueName='saveValue',
        postSave='postSave',
        containerClass='containerClass',
        iconClass='iconClass',
        readOnlyClass='readOnlyClass',
        disallow_edit='false'
    ):
        """Render inline_edit_trans markup with test defaults and parse it
        into an lxml fragment.

        :returns: parsed root element of the generated markup.
        """
        # Avoid mutable default arguments: build fresh defaults per call so
        # one test cannot leak mutations into another.
        if name_dict is None:
            name_dict = {'en': 'English Output'}
        if langs is None:
            langs = ['en']
        markup = inline_edit_trans(
            name_dict,
            langs,
            url,
            saveValueName,
            postSave,
            containerClass,
            iconClass,
            readOnlyClass,
            disallow_edit
        )

        return fragment_fromstring(markup)
Exemplo n.º 51
0
def test_submenu():
    """For request /menu/submenu/ both /menu/ and /menu/submenu/ are
    active; the sibling submenu is not."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="/menu/">menu</a>
                </li>
                <li>
                    <a href="/menu/submenu/">submenu</a>
                </li>
                <li>
                    <a href="/menu/other_submenu/">other_submenu</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''

    rendered = render(template, {'request': requests.get('/menu/submenu/')})

    items = fragment_fromstring(rendered).xpath('//li')

    # Parent menu entry and the exact submenu entry are both active.
    for li in items[:2]:
        assert li.attrib.get('class', False)
        assert 'active' == li.attrib['class']

    # The unrelated submenu stays inactive.
    assert not items[2].attrib.get('class', False)
Exemplo n.º 52
0
def response(resp):
    """Parse the engine's XML response into a list of image results."""
    xmldom = etree.fromstring(resp.content)
    xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0)
    # The result HTML is embedded as text inside the XML; reparse it.
    dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div')

    results = []
    for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'):
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)
        img = eval_xpath_getindex(link, './/img', 0)
        thumbnail_src = urljoin(gallery_url, img.attrib['src'])

        results.append({
            'url': url,
            'title': title,
            'img_src': thumbnail_src,
            'content': '',
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
Exemplo n.º 53
0
    def _build_datatable(self, response):
        """Group the first table's cells six at a time into rows of
        formatted strings, skipping bold header cells."""
        cells = response.xpath('//table[1]/tbody//td').extract()

        rows = []
        current_row = []

        for raw_cell in cells:
            cell = html.fragment_fromstring(raw_cell)
            text = cell.text_content()

            # Bold/strong cells are section headers, not data.
            if cell.xpath('//b/text()|//strong/text()'):
                continue

            links = cell.xpath('//a/@href')
            if links:
                current_row.append(
                    '{name}: {url}'.format(name=text, url=links[0]))
            else:
                current_row.append(
                    '{text}'.format(text=unicodedata.normalize("NFKD", text)))

            # Every six cells form one logical row.
            if len(current_row) == 6:
                rows.append(current_row)
                current_row = []

        return rows
Exemplo n.º 54
0
def load_generic_data():
    """Scrape the fundamentus.com.br screener into {ticker: row_dict}."""
    html_data = requests.get('http://www.fundamentus.com.br/resultado.php')

    # Cut the result table out of the page and parse it on its own.
    pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
    [table] = re.findall(pattern, html_data.text)
    page = fragment_fromstring(table)

    [thead] = page.xpath('thead')
    [header_row] = thead.xpath('tr')
    headers = [th.text_content().strip() for th in header_row.xpath('th')]

    [tbody] = page.xpath('tbody')

    stock_info = {}
    for row in tbody.xpath('tr'):
        values = [
            _convert_data(td.text_content().strip()) for td in row.xpath('td')
        ]
        record = dict(zip(headers, values))
        # 'Papel' is the ticker column.
        stock_info[record['Papel']] = record

    return stock_info
Exemplo n.º 55
0
def get_specific_data(stock):
    """Scrape the per-stock detail page on fundamentus.com.br.

    Walks the label/data cell pairs of the five indicator tables and
    returns a dict mapping label text to its value.

    :param stock: ticker symbol appended to the detail-page URL.
    :returns: dict of {indicator label: value string}.
    """
    url = "http://www.fundamentus.com.br/detalhes.php?papel=" + stock
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj))
    # The site blocks default urllib user agents; spoof a browser.
    opener.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
    ]

    # Get data from site
    link = opener.open(url, urllib.parse.urlencode({}).encode('UTF-8'))
    content = link.read().decode('ISO-8859-1')

    # Get all table instances
    pattern = re.compile('<table class="w728">.*</table>', re.DOTALL)
    reg = re.findall(pattern, content)[0]
    reg = "<div>" + reg + "</div>"
    page = fragment_fromstring(reg)
    all_data = {}

    # There is 5 tables with tr, I will get all trs
    all_trs = []
    all_tables = page.xpath("table")

    for i in range(0, len(all_tables)):
        all_trs = all_trs + all_tables[i].findall("tr")

    # Run through all the trs and get the label and the
    # data for each line
    # NOTE(review): td.get("class") returns None for class-less cells and
    # .find() would then raise — presumably every cell on this page
    # carries a class attribute; verify against the live markup.
    for tr_index in range(0, len(all_trs)):
        tr = all_trs[tr_index]
        # Get into td
        all_tds = tr.getchildren()
        for td_index in range(0, len(all_tds)):
            td = all_tds[td_index]

            label = ""
            data = ""

            # The page has tds with contents and some
            # other with not
            if (td.get("class").find("label") != -1):
                # We have a label
                for span in td.getchildren():
                    if (span.get("class").find("txt") != -1):
                        label = span.text

                # If we did find a label we have to look
                # for a value
                if (label and len(label) > 0):
                    # The value lives in the cell immediately after the label.
                    next_td = all_tds[td_index + 1]

                    if (next_td.get("class").find("data") != -1):
                        # We have a data
                        for span in next_td.getchildren():
                            if (span.get("class").find("txt") != -1):
                                if (span.text):
                                    data = span.text
                                else:
                                    # If it is a link
                                    span_children = span.getchildren()
                                    if (span_children
                                            and len(span_children) > 0):
                                        data = span_children[0].text

                                # Include into dict
                                all_data[label] = data

                                # Erase it
                                label = ""
                                data = ""

    return all_data
Exemplo n.º 56
0
def parse_html(text):
    """Parse an HTML fragment string with the module's shared parser."""
    fragment = html.fragment_fromstring(text, parser=_HTML_PARSER)
    return fragment
Exemplo n.º 57
0
    def build_candidates(length):
        """Return a one-element list holding a <p> node whose text is
        ``length`` 'c' characters."""
        markup = "<p>%s</p>" % ("c" * length)
        return [fragment_fromstring(markup)]
Exemplo n.º 58
0
def _markdown_fragment(target, image):
    """Render *target* markdown into an lxml fragment and post-process it.

    Rewrites pseudo-scheme links (user:/da:/ib:/fa:/sf:) to real profile
    URLs, converts ``user:`` images into linked avatars, and downgrades
    surplus external images to plain links.

    :param target: markdown source text.
    :param image: image budget — falsy allows none, an int allows that
        many, any other truthy value allows 5.
    :returns: sanitized lxml fragment (wrapped in a created parent).
    """
    if not image:
        images_left = 0
    elif type(image) is int:
        images_left = image
    else:
        images_left = 5

    rendered = _markdown(target)
    fragment = html.fragment_fromstring(rendered, create_parent=True)

    # Rewrite pseudo-scheme hrefs to concrete profile URLs.
    for link in fragment.findall(".//a"):
        href = link.attrib.get("href")

        if href:
            t, _, user = href.partition(":")

            if t == "user":
                link.attrib["href"] = u"/~{user}".format(user=login_name(user))
            elif t == "da":
                link.attrib["href"] = u"https://{user}.deviantart.com/".format(
                    user=_deviantart(user))
            elif t == "ib":
                link.attrib["href"] = u"https://inkbunny.net/{user}".format(
                    user=_inkbunny(user))
            elif t == "fa":
                link.attrib[
                    "href"] = u"https://www.furaffinity.net/user/{user}".format(
                        user=_furaffinity(user))
            elif t == "sf":
                link.attrib["href"] = u"https://{user}.sofurry.com/".format(
                    user=_sofurry(user))
            else:
                # Unknown scheme: leave the link untouched.
                continue

            # Default the link text to the username when empty or raw.
            if not link.text or link.text == href:
                link.text = user

    for parent in fragment.findall(".//*[img]"):
        for image in list(parent):
            if image.tag != "img":
                continue

            src = image.get("src")

            if src:
                t, _, user = src.partition(":")

                if t != "user":
                    # External image: inline while the budget lasts,
                    # otherwise replace it with a plain link in place.
                    if images_left:
                        images_left -= 1
                    else:
                        i = list(parent).index(image)
                        link = etree.Element(u"a")
                        link.tail = image.tail
                        src = image.get("src")

                        if src:
                            link.set(u"href", src)
                            link.text = image.attrib.get("alt", src)

                        parent[i] = link

                    continue

                # user: image -> avatar wrapped in a profile link.
                image.set(u"src",
                          u"/~{user}/avatar".format(user=login_name(user)))

                link = etree.Element(u"a")
                link.set(u"href", u"/~{user}".format(user=login_name(user)))
                link.set(u"class", u"user-icon")
                # Swap the image for the link, then move the image inside it.
                parent.insert(list(parent).index(image), link)
                parent.remove(image)
                link.append(image)
                link.tail = image.tail

                if "alt" in image.attrib and image.attrib["alt"]:
                    image.tail = u" "
                    label = etree.SubElement(link, u"span")
                    label.text = image.attrib["alt"]
                    del image.attrib["alt"]
                else:
                    image.tail = None
                    image.set(u"alt", user)

    add_user_links(fragment, None, True)

    # Neutralize unsafe markup before returning.
    defang(fragment)

    return fragment
Exemplo n.º 59
0
def get_data(*args, **kwargs):
    """Download the full fundamentus.com.br screener table.

    Posts an empty filter form (so every stock is returned) and parses the
    result table.

    :returns: OrderedDict mapping row index -> {ticker: {indicator: value}}.
    """
    url = 'http://www.fundamentus.com.br/resultado.php'
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj))
    # The site blocks default urllib user agents; spoof a browser.
    opener.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
    ]

    # Aqui estão os parâmetros de busca das ações
    # Estão em branco para que retorne todas as disponíveis
    data = {
        'pl_min': '',
        'pl_max': '',
        'pvp_min': '',
        'pvp_max': '',
        'psr_min': '',
        'psr_max': '',
        'divy_min': '',
        'divy_max': '',
        'pativos_min': '',
        'pativos_max': '',
        'pcapgiro_min': '',
        'pcapgiro_max': '',
        'pebit_min': '',
        'pebit_max': '',
        'fgrah_min': '',
        'fgrah_max': '',
        'firma_ebit_min': '',
        'firma_ebit_max': '',
        'margemebit_min': '',
        'margemebit_max': '',
        'margemliq_min': '',
        'margemliq_max': '',
        'liqcorr_min': '',
        'liqcorr_max': '',
        'roic_min': '',
        'roic_max': '',
        'roe_min': '',
        'roe_max': '',
        'liq_min': '',
        'liq_max': '',
        'patrim_min': '',
        'patrim_max': '',
        'divbruta_min': '',
        'divbruta_max': '',
        'tx_cresc_rec_min': '',
        'tx_cresc_rec_max': '',
        'setor': '',
        'negociada': 'ON',
        'ordem': '1',
        'x': '28',
        'y': '16'
    }

    with opener.open(url,
                     urllib.parse.urlencode(data).encode('UTF-8')) as link:
        content = link.read().decode('ISO-8859-1')

    pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
    reg = re.findall(pattern, content)[0]
    page = fragment_fromstring(reg)
    lista = OrderedDict()

    # Indicator names in the order the site renders the <td> cells
    # (cell 0 holds the ticker link, cells 1..20 the values below).
    columns = ('cotacao', 'P/L', 'P/VP', 'PSR', 'DY', 'P/Ativo',
               'P/Cap.Giro', 'P/EBIT', 'P/Ativ.Circ.Liq.', 'EV/EBIT',
               'EBITDA', 'Mrg. Ebit', 'Mrg.Liq.', 'Liq.Corr.', 'ROIC',
               'ROE', 'Liq.2m.', 'Pat.Liq', 'Div.Brut/Pat.', 'Cresc.5a')

    stocks = page.xpath('tbody')[0].findall("tr")

    for i, row in enumerate(stocks):
        cells = row.getchildren()
        # Ticker text lives inside a nested link in the first cell.
        ticker = cells[0][0].getchildren()[0].text
        lista[i] = {
            ticker: {
                name: cells[j + 1].text for j, name in enumerate(columns)
            }
        }

    return lista
Exemplo n.º 60
0
    def tree(self):
        """Parse ``self.html`` into an lxml element tree.

        Returns: the root element produced by ``html.fragment_fromstring``
        (an lxml HtmlElement, not a Scrapy Selector).
        """
        return html.fragment_fromstring(self.html)