Python parse示例，libxml2dom.parse Python示例

示例#1

0

显示文件

def parseFile(filename, html=True):
    """
    Parse the file named 'filename' with libxml2dom and return the document.

    A progress message is emitted through flush_print before parsing.
    The 'html' parameter is not used by this function: the file is always
    handed to libxml2dom.parse with its default options.
    """
    flush_print("Parsing: %s" % filename)
    fileHandle = open(filename)
    try:
        # Close the handle even when parsing raises, so that a malformed
        # input file does not leak an open descriptor.
        return libxml2dom.parse(fileHandle)
    finally:
        fileHandle.close()

示例#2

0

显示文件

def parse(stream_or_string,
          html=0,
          htmlencoding=None,
          unfinished=0,
          impl=None):
    """
    Parse 'stream_or_string' by delegating to libxml2dom.parse.

    The 'html', 'htmlencoding' and 'unfinished' options are forwarded
    unchanged. When 'impl' is false (it defaults to None), the module's
    'default_impl' is passed as the implementation instead.
    """
    if not impl:
        impl = default_impl
    options = dict(html=html,
                   htmlencoding=htmlencoding,
                   unfinished=unfinished,
                   impl=impl)
    return libxml2dom.parse(stream_or_string, **options)

示例#3

0

显示文件

def parse(stream_or_string,
          html=0,
          htmlencoding=None,
          unfinished=0,
          impl=None):
    """
    Parse 'stream_or_string' with libxml2dom.parse and return the document
    after passing it to initialiseEvents.

    'html', 'htmlencoding' and 'unfinished' are forwarded unchanged; a false
    'impl' (the default is None) is replaced by the module's 'default_impl'.
    """
    implementation = impl or default_impl
    document = libxml2dom.parse(
        stream_or_string,
        html=html,
        htmlencoding=htmlencoding,
        unfinished=unfinished,
        impl=implementation)
    # initialiseEvents is called on the new document before it is returned.
    initialiseEvents(document)
    return document

示例#4

0

显示文件

文件： createproof.py 项目： iamjabour/smurf

def create_proof(url, parse=None, out=None):
    """
    Create a HTML document with new attributes to provide a proof to
    benchmarks and some other functions executed by this framework.
    The basic annotations are: 'proof_productlist' and 'proof_product'.

    url   -- location of the HTML document; handed to libxml2dom.parse.
    parse -- object providing plist(doc), products(doc) and last(doc);
             a new Path() is created when none is given.
    out   -- optional file-like object; when given, the annotated document
             is written to it followed by a newline.

    Every node returned by parse.plist() gets proof_productlist="true";
    every node returned by parse.products() and parse.last() gets
    proof_product="true".

    Returns None. When no product list is found, an error message is
    printed and nothing is annotated or written.
    """
    if not parse:
        parse = Path()

    # use libxml2 to parse the HTML document
    doc = libxml2dom.parse('%s' % url, html=1, unfinished=1, htmlencoding='latin1')

    # check if the parse can find a list of products
    productList = parse.plist(doc)

    if not productList:
        print('\nERROR: Cannot found a list of products using corrent xpath')
        return

    # Finding several lists is not an error: all of them are annotated below.
    # Previously this case was only accepted when the debug flag was set, and
    # the debug message then referenced an unbound variable.
    if debug:
        if len(productList) == 1:
            print('found a product list! %s' % productList[0].localName)
        else:
            print('found more then one product list!!! %s'
                  % productList[0].localName)

    products = parse.products(doc)
    lastline = parse.last(doc)

    # Nodes reported by parse.last() are annotated as products as well.
    if lastline is not None:
        for p in lastline:
            products.append(p)

    if debug:
        print(len(products))

    for pl in productList:
        pl.createAttribute('proof_productlist')
        pl.setAttribute('proof_productlist', 'true')

    for product in products:
        product.createAttribute('proof_product')
        product.setAttribute('proof_product', 'true')

    if out is not None:
        out.write(doc.toString() + '\n')

示例#5

0

显示文件

def getEd2kLinks(url):
    """
    Fetch the page at 'url' and return the ed2k links extracted from it.

    The response is parsed by libxml2dom as HTML with the 'utf-8' encoding
    and is closed whether or not parsing succeeds.

    Returns a two-element list:
    [getEd2kLinkFromDownloadBtn(doc), getEd2kLinkFromSubtitle(doc)].
    """
    # NOTE(review): this function used to build a urllib2.Request for the
    # VeryCD sign-in URL with a Firefox User-Agent header, but the request
    # was never sent. That dead code is removed; the page is still fetched
    # with urllib2's default headers, exactly as before.
    webPage = urllib2.urlopen(url)
    try:
        doc = libxml2dom.parse(webPage, html=1, htmlencoding='utf-8')
    finally:
        webPage.close()
    return [getEd2kLinkFromDownloadBtn(doc), getEd2kLinkFromSubtitle(doc)]

示例#6

0

显示文件

文件： ed2k.py 项目： sundayu/tools

def getEd2kLinks( url ):
    """
    Download the page at 'url' and return the ed2k links found in it.

    The page is parsed by libxml2dom as HTML using the 'utf-8' encoding; the
    HTTP response is always closed afterwards.

    Returns [getEd2kLinkFromDownloadBtn(doc), getEd2kLinkFromSubtitle(doc)].
    """
    # NOTE(review): a urllib2.Request for the VeryCD sign-in URL (with a
    # Firefox User-Agent header) was constructed here but never used. It has
    # been dropped; the fetch below is unchanged and uses default headers.
    webPage = urllib2.urlopen(url)
    try:
        doc = libxml2dom.parse(webPage, html=1, htmlencoding='utf-8')
    finally:
        webPage.close()
    return [ getEd2kLinkFromDownloadBtn(doc), getEd2kLinkFromSubtitle(doc)]

示例#7

0

显示文件

文件： views.py 项目： m-khl/djadja

def handleAsk(request):
    """
    Run the posted flat search against bn.ru and collect the results.

    Reads the first value of 'roomsFr', 'roomsTo', 'priceFr' and 'priceTo'
    and every value of 'stations' from request.POST, builds the search URL,
    fetches it and extracts the text of each <td> in the result rows.

    Returns a dict with the keys "roomsFr", "roomsTo", "priceFr", "priceTo",
    "metros" (the metro[]=... query fragment), "url", "trs" (one list of
    cell texts per result row) and "stations" (a [code, text, selected]
    list for every entry of settings.SUBWAYS).

    An empty getlist() result for one of the four scalar fields raises
    IndexError.
    """
    post = request.POST
    # Fetch the selected stations once instead of once per subway entry.
    selectedStations = post.getlist('stations')

    formData = {
        "roomsFr": post.getlist('roomsFr')[0],
        "roomsTo": post.getlist('roomsTo')[0],
        "priceFr": post.getlist('priceFr')[0],
        "priceTo": post.getlist('priceTo')[0],
        "metros": "&".join(["metro%5B%5D={0}".format(code) for code in selectedStations]),
    }

    # NOTE(review): the posted values are interpolated without URL-quoting;
    # confirm the form restricts them to plain numeric codes.
    url = 'http://www.bn.ru/zap_fl.phtml?kkv1={roomsFr}&kkv2={roomsTo}&price1={priceFr}&price2={priceTo}&so1=&so2=&sk1=&sk2=&type%5B%5D=1&type%5B%5D=3&sorttype=0&sort_ord=0&{metros}&text='.format(**formData)
    formData["url"] = url

    req = urllib2.Request(url, headers={'User-Agent' : "Mozilla Firefox"})
    f = urllib2.urlopen(req)
    try:
        doc = libxml2dom.parse(f, html=1)
    finally:
        # Release the HTTP connection even if parsing fails.
        f.close()

    # The first three rows of the results table are skipped.
    trs = doc.xpath('//table[@class="results"]/tr')[3:]

    formData["trs"] = [[td.textContent for td in tr.getElementsByTagName('td')]
                       for tr in trs]
    formData["stations"] = [[code, text, code in selectedStations]
                            for code, text in settings.SUBWAYS]

    return formData

示例#8

0

显示文件

文件： Blog2Wikiloc.py 项目： kokomero/blogtools

def main(argv):
  """
  Print the content of a blog post as HTML/text lines.

  'argv' is parsed with getopt: -h/--help calls print_help() and exits,
  -u/--url gives the page to load. Invalid or missing options call usage()
  and exit with status 2.

  The page is parsed as HTML; a link back to the post is printed first, then
  for each <span> inside the 'post-body entry-content' div either its first
  <img> element followed by "<br />", or its text content. The function
  always finishes by calling sys.exit().
  """
  try:
    opts, args = getopt.getopt(argv, "hu:", ["help", "url="])
  except getopt.GetoptError:
    usage()
    sys.exit(2)

  # At least one option is required
  if not opts:
    usage()
    sys.exit(2)

  for opt, arg in opts:
    if opt in ("-u", "--url"):
      url = arg
    elif opt in ("-h", "--help"):
      print_help()
      sys.exit()

  # Load the blog page and select the spans of the entry content
  document = libxml2dom.parse(url, html=1)
  nodes = document.xpath("//div[@class='post-body entry-content']//span")

  # A link back to the blog post comes first
  print('<a href="' + url + '"> Link to the blog post - Enlace a la pagina en el blog</a>')

  for span in nodes:
    # Spans without an image contribute their plain text
    if len(span.getElementsByTagName("img")) == 0:
      print(span.textContent)
      continue
    print(span.getElementsByTagName("img")[0].toString())
    print("<br />")

  sys.exit()

示例#9

0

显示文件

def main(argv):
    """
    Convert the body of a blog post into printed HTML/text lines.

    Options are read from 'argv' with getopt. -h/--help calls print_help()
    and exits; -u/--url names the page to load. A getopt error or an empty
    option list calls usage() and exits with status 2.

    After printing a link to the post, every <span> below the
    'post-body entry-content' div is printed: its first <img> element plus
    "<br />" when it contains an image, its text content otherwise.
    sys.exit() is called at the end.
    """
    try:
        opts, args = getopt.getopt(argv, "hu:", ["help", "url="])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Nothing to do without options
    if len(opts) < 1:
        usage()
        sys.exit(2)

    for flag, value in opts:
        if flag == "-h" or flag == "--help":
            print_help()
            sys.exit()
        elif flag == "-u" or flag == "--url":
            url = value

    # Parse the blogger page as HTML
    document = libxml2dom.parse(url, html=1)

    # Spans inside the entry content hold the post's text and images
    xpression = "//div[@class='post-body entry-content']//span"

    # Link to the original post
    link = '<a href="' + url + '"> Link to the blog post - Enlace a la pagina en el blog</a>'
    print(link)

    for node in document.xpath(xpression):
        has_image = len(node.getElementsByTagName("img")) > 0
        if not has_image:
            print(node.textContent)
        else:
            print(node.getElementsByTagName("img")[0].toString())
            print("<br />")

    sys.exit()

示例#10

0

显示文件

文件： xmlrpc.py 项目： juanchitot/jaimeboot

def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, impl=None):
    """
    Delegate to libxml2dom.parse, substituting the module's 'default_impl'
    when 'impl' is false. All other options are forwarded unchanged.
    """
    chosen_impl = impl or default_impl
    return libxml2dom.parse(
        stream_or_string,
        html=html,
        htmlencoding=htmlencoding,
        unfinished=unfinished,
        impl=chosen_impl,
    )

示例#11

0

显示文件

文件： svg.py 项目： kp7/plan

def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, impl=None):
    """
    Parse 'stream_or_string' with libxml2dom.parse, call initialiseEvents on
    the resulting document and return it.

    A false 'impl' is replaced by the module's 'default_impl'; the remaining
    options are forwarded unchanged.
    """
    if impl:
        implementation = impl
    else:
        implementation = default_impl
    parsed = libxml2dom.parse(stream_or_string,
                              html=html,
                              htmlencoding=htmlencoding,
                              unfinished=unfinished,
                              impl=implementation)
    initialiseEvents(parsed)
    return parsed

示例#12

0

显示文件

文件： begat.py 项目： juanchitot/jaimeboot

    # Benchmark driver: time repeated runs of the "begat" test with either
    # libxml2macro or libxml2dom. ot_locations and full_xpath are not defined
    # in this excerpt - presumably they are set up earlier in the file.

    # True when the literal argument "libxml2macro" is on the command line.
    use_libxml2macro = "libxml2macro" in sys.argv
    # Collect N from every argument that ends with "-times" (e.g. "10-times").
    iterations = [int(arg.split("-")[0]) for arg in sys.argv if arg.endswith("-times")]

    if len(ot_locations) == 0:
        print "Please specify the location of the ot.xml file."
        sys.exit(1)

    # Default to a single iteration; otherwise use the first "-times" value.
    if len(iterations) == 0:
        iterations = 1
    else:
        iterations = iterations[0]

    # Wait for the user before starting the clock; the prompt shows a ps
    # command for this process id.
    raw_input("Start your engines with ps -p %s -fv" % os.getpid())
    t = time.time()

    # Each iteration parses the first location again and runs the test on it.
    # NOTE(review): if iterations is 0 the loop body never runs and 'l' stays
    # unbound, so the final "print l" would raise NameError.
    for i in range(0, iterations):
        if use_libxml2macro:
            n_doc = parseFile(ot_locations[0])
            l = test_begat_libxml2macro(n_doc, full_xpath)
        else: # use_libxml2dom:
            import libxml2dom
            doc = libxml2dom.parse(ot_locations[0])
            l = test_begat_libxml2dom(doc, full_xpath)

    print "Time taken", time.time() - t
    raw_input("Stop your engines!")

    # Show the result of the last iteration.
    print l

# vim: tabstop=4 expandtab shiftwidth=4

示例#13

0

显示文件

文件： namespaces.py 项目： juanchitot/jaimeboot

document = libxml2dom.createDocument(None, "doc", None)
top = document.xpath("*")[0]
elem1 = document.createElementNS("DAV:", "href")
print "Namespace is", repr(elem1.namespaceURI)
document.replaceChild(elem1, top)
elem2 = document.createElementNS(None, "no_ns")
print "Namespace is", repr(elem2.namespaceURI)
document.xpath("*")[0].appendChild(elem2)
print "Find href", len(document.xpath("href")) != 0
print "Find x:href", len(document.xpath("x:href", namespaces={"x": "DAV:"})) != 0
print "Find //no_ns", len(document.xpath("//no_ns")) != 0
print "Find x:href/no_ns", len(document.xpath("x:href/no_ns", namespaces={"x": "DAV:"})) != 0
print document.toString()
document.toFile(open("test_ns.xml", "wb"))

document = libxml2dom.parse("test_ns.xml")
print "Namespace is", repr(document.xpath("*")[0].namespaceURI)
print "Namespace is", repr(document.xpath("*/*")[0].namespaceURI)
print "Find href", len(document.xpath("href")) != 0
print "Find x:href", len(document.xpath("x:href", namespaces={"x": "DAV:"})) != 0
print "Find //no_ns", len(document.xpath("//no_ns")) != 0
print "Find x:href/no_ns", len(document.xpath("x:href/no_ns", namespaces={"x": "DAV:"})) != 0
print document.toString()
print "--------"

print
print "This is minidom's behaviour for default namespaces:"
print
document = xml.dom.minidom.Document()
elem1 = document.createElementNS("DAV:", "href")
print "Namespace is", repr(elem1.namespaceURI)

示例#14

0

显示文件

文件： settings.py 项目： m-khl/djadja

#     'django.template.loaders.eggs.load_template_source',
)

# Middleware classes enabled for this project.
MIDDLEWARE_CLASSES = (
    'django.middleware.common.CommonMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
)

ROOT_URLCONF = 'mysite.urls'

# Template search directories. The trailing comma is required: without it
# the parentheses are plain grouping and TEMPLATE_DIRS would be a string
# (iterated character by character) instead of a one-element tuple.
TEMPLATE_DIRS = (
    "/home/mike/django/mysite",
    # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates".
    # Always use forward slashes, even on Windows.
    # Don't forget to use absolute paths, not relative paths.
)

# Django applications enabled for this project.
INSTALLED_APPS = (
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.sites',
)

def _load_subways():
    """
    Fetch the bn.ru search form and return a [value, text] pair for every
    <option> of its <select id="metro"> element.
    """
    request = urllib2.Request('http://www.bn.ru/zap_fl_w.phtml',
                              headers={'User-Agent' : "Mozilla Firefox"})
    response = urllib2.urlopen(request)
    try:
        doc = libxml2dom.parse(response, html=1)
    finally:
        # The HTTP response used to be left open; close it once parsed.
        response.close()
    return [[opt.getAttribute('value'), opt.textContent]
            for opt in doc.xpath('//select[@id="metro"]/option')]

# NOTE: this performs a network request whenever the settings module is
# imported, as the original expression did.
SUBWAYS = _load_subways()

示例#15

0

显示文件

文件： test_valid_relaxng.py 项目： juanchitot/jaimeboot

#!/usr/bin/env python

import libxml2dom

# Run the same three reports for each (schema, document) fixture pair:
# validate(), validateDocument() and the "error-handler" parameter.
for schema_path, document_path in (
    ("tests/test_valid_relaxng.xml", "tests/test_valid.xml"),
    ("tests/test_invalid_relaxng.xml", "tests/test_invalid.xml"),
):
    schema = libxml2dom.parse(schema_path)
    d = libxml2dom.parse(document_path)
    print(d.validate(schema))
    print(d.validateDocument(schema))
    print(d.getParameter("error-handler"))

# vim: tabstop=4 expandtab shiftwidth=4

示例#16

0

显示文件

文件： performance.py 项目： juanchitot/jaimeboot

        sys.exit(1)

    if sys.argv[2] == "libxml2macro":

        x2_d = parseFile(sys.argv[1])

        t = time.time()
        x2_d1, x2_d2 = test_import_libxml2macro(x2_d)
        toFile(x2_d2, "/tmp/xxx_libxml2macro.xml")
        print "Time", time.time() - t, "seconds"

    elif sys.argv[2] == "minidom":
        import xml.dom.minidom
        d = xml.dom.minidom.parse(sys.argv[1])

        t = time.time()
        d1, d2 = test_import_minidom(d)
        open("/tmp/xxx_minidom.xml", "wb").write(d2.toxml("utf-8"))
        print "Time", time.time() - t, "seconds"

    elif sys.argv[2] == "libxml2dom":
        import libxml2dom
        d = libxml2dom.parse(sys.argv[1])

        t = time.time()
        d1, d2 = test_import_libxml2dom(d)
        libxml2dom.toStream(d2, open("/tmp/xxx_libxml2dom.xml", "wb"))
        print "Time", time.time() - t, "seconds"

# vim: tabstop=4 expandtab shiftwidth=4

示例#17

0

显示文件

文件： test_valid_schematron.py 项目： juanchitot/jaimeboot

#!/usr/bin/env python

import libxml2dom

# For the "valid" and then the "invalid" fixtures: parse the Schematron
# schema and the document, then print validate(), validateDocument() and
# the document's "error-handler" parameter.
fixtures = [
    ("tests/test_valid_schematron.xml", "tests/test_valid.xml"),
    ("tests/test_invalid_schematron.xml", "tests/test_invalid.xml"),
]

for fixture in fixtures:
    schema = libxml2dom.parse(fixture[0])
    d = libxml2dom.parse(fixture[1])
    print(d.validate(schema))
    print(d.validateDocument(schema))
    print(d.getParameter("error-handler"))

# vim: tabstop=4 expandtab shiftwidth=4