Example #1
0
 def test_element(self):
     # Assert Element properties (test <body>).
     v = web.Document(self.html).body
     self.assertEqual(v.tag, "body")
     self.assertEqual(v.attributes["id"], "front")
     self.assertEqual(v.attributes["class"], "comments")
     self.assertTrue(v.content.startswith("\n<script"))
     # Assert Element.getElementsByTagname() (test navigation links).
     a = v.by_tag("a")
     self.assertEqual(len(a), 3)
     self.assertEqual(a[0].content, "nav1")
     self.assertEqual(a[1].content, "nav2")
     self.assertEqual(a[2].content, "nav3")
     # Assert Element.getElementsByClassname() (test <p class="comment">).
     a = v.by_class("comment")
     self.assertEqual(a[0].tag, "p")
     self.assertEqual(a[0].by_tag("span")[0].attributes["class"], "date")
     self.assertEqual(a[0].by_tag("span")[1].attributes["class"], "author")
     for selector in (".comment", "p.comment", "*.comment"):
         self.assertEqual(v.by_tag(selector)[0], a[0])
     # Assert Element.getElementById() (test <div id="content">).
     e = v.by_id("content")
     self.assertEqual(e.tag, "div")
     self.assertEqual(e, a[0].parent)
     for selector in ("#content", "div#content", "*#content"):
         self.assertEqual(v.by_tag(selector)[0], e)
     # Assert Element.getElementByAttribute() (test on <a href="">).
     a = v.by_attribute(href="nav1.html")
     self.assertEqual(a[0].content, "nav1")
     print "pattern.web.Node.Element"
     print "pattern.web.Node.Element.by_tag()"
     print "pattern.web.Node.Element.by_class()"
     print "pattern.web.Node.Element.by_id()"
     print "pattern.web.Node.Element.by_attribute()"
Example #2
0
    def test_node_traverse(self):
        # Assert Node.traverse() (must visit all child nodes recursively).
        self.b = False

        def visit(node):
            if node.type == web.ELEMENT and node.tag == "span":
                self.b = True

        v = web.Document(self.html)
        v.traverse(visit)
        self.assertEqual(self.b, True)
        print "pattern.web.Node.traverse()"
Example #3
0
 def test_node_document(self):
     # Assert Node properties.
     v1 = web.Document(self.html)
     self.assertEqual(v1.type, web.DOCUMENT)
     self.assertEqual(v1.source[:10], "<!doctype ") # Note: BeautifulSoup strips whitespace.
     self.assertEqual(v1.parent, None)
     # Assert Node traversal.
     v2 = v1.children[0].next
     self.assertEqual(v2.type, web.TEXT)
     self.assertEqual(v2.previous, v1.children[0])
     # Assert Document properties.
     v3 = v1.declaration
     self.assertEqual(v3, v1.children[0])
     self.assertEqual(v3.parent, v1)
     self.assertEqual(v3.source, "<!doctype html>")
     self.assertEqual(v1.head.type, web.ELEMENT)
     self.assertEqual(v1.body.type, web.ELEMENT)
     self.assertTrue(v1.head.source.startswith("<head"))
     self.assertTrue(v1.body.source.startswith("<body"))
     print "pattern.web.Node"
     print "pattern.web.DOM"
Example #4
0
def extract_data(package):
    (page, query) = package

    print "Checking %s" % page
    new_webpage = Webpage()
    new_webpage.url = page
    try:
        url = web.URL(page)
        mimetype = url.mimetype
        new_webpage.mimetype = mimetype
        print "Checking mimetype..."
        if mimetype == 'text/html':
            print "Mimetype ok (text/html)"  #only load Webpages!!!
            domain = url.domain  # u'domain.com'
            url_feed = ''
            redirected_page = url.redirect  # Actual URL after redirection, or None.
            path = url.path  # [u'path']
            # different options to open a webpage
            print "Opening %s" % page
            html = url.download(user_agent=choice(user_agents), cached=False)
            #html = urllib2.urlopen(page).read()
        else:
            print 'Wrong mimetype (not text/html)'
            new_webpage.successful_open = True
    except:
        print "Could not open page: %s" % page
        new_webpage.successful_open = False
    try:
        if check_query(query, str(
                html)):  #on s'assure d'abord que ça roule pour le full html
            new_webpage.successful_open = True

            dom = web.Document(html)
            try:
                title = dom.by_tag('title')[0]
                title = repr(web.plaintext(title.content))
                print "Setting page title to %s" % title
            except:
                print "No title found for %s" % page
                title = ''

            #two methods for charset detection:
            charset = None
            # option to detect page encoding from dom structure => does not seem to work utf-8 systematically retrieved...???
            # try:
            # 	metas=dom.by_tag('meta')
            # 	charset=looking4charset(metas)
            # 	print 'charset',charset, 'in page',page
            # except:
            # 	charset=None
            #

            # chardet library use
            # if charset==None:
            # 	encoding = chardet.detect(html)
            # 	html=html.decode(encoding['encoding'])
            # else:
            # 	html=html.decode(charset)

            query_result, text_summary, html_summary = check_page_against_query(
                html, title, query)
            # charset guess can be used to decode results
            # if charset==None:
            # 	encoding = chardet.detect(html)
            # 	html=html.decode(encoding['encoding'])
            # else:
            # 	html=html.decode(charset)

            #save in a repertory output textual summaries
            #fileout=open('temp/'+page[7:20]+'.htm','w')
            #print 'temp/'+page+'.htm'
            #fileout.write(html_summary)
            #fileout.close()

            #if query_result:
            # dom = web.Document(html_summary)
            # try:
            # 	date = dom.by_tag('date')[0]
            # 	date = repr(plaintext(date.content))
            # except:
            # 	date=''
            # print '######date',date
            dateregexp = re.compile(r'(\d{4})-|\\(\d{2})-|\\(\d{2})')

            date = ''
            if not redirected_page == None:
                print 'plus redirection: ', redirected_page
                try:
                    date = dateregexp.search(redirected_page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            else:
                try:
                    date = dateregexp.search(page).groups()
                    new_webpage.date = '-'.join(date)
                except:
                    pass
            #print '#############date',date

            if date == '':
                date_txt = pattern_date_fr.search(str(text_summary))
                if not date_txt == None:
                    date = date_txt.groups()
                    new_webpage.date = '-'.join(date)
            #date_txt=pattern_date_fr.search("Samedi 6 août 2011606/08/Août/201120:29")
            if query_result:
                try:
                    print 'page: ', new_webpage.url, ' with title: ', title, ' and date', new_webpage.date, 'was assessed as ', query_result
                except:
                    pass
            #print 'date_txt'
            #print 'date_txt:',str(date_txt)
            #feed webpage details with informations
            new_webpage.url_redirected = redirected_page
            new_webpage.html = html
            new_webpage.html_summary = html_summary
            new_webpage.text_summary = text_summary
            new_webpage.domain = domain
            new_webpage.query_result = query_result
            new_webpage.url_feed = url_feed
            new_webpage.path = path
            new_webpage.charset = charset
            new_webpage.title = title
            new_webpage.opened = new_webpage.opened + 1
            new_webpage.md5 = hashlib.sha224(text_summary).hexdigest()
            new_webpage.text_html = web.plaintext(html,
                                                  keep=[],
                                                  replace=web.blocks,
                                                  linebreaks=2,
                                                  indentation=False)
            #new_webpage.display_page()
            #new_webpage.links=None
        else:
            #the query is not even in the raw html
            new_webpage.successful_open = True
            new_webpage.query_result = False
    except:
        #print "*** Could not extract data from %s" % page
        pass
    return new_webpage