def parsegeneral(self, path):
    """Parse the XML file at *path* and return (tokenised sentences, raw data).

    Uses a recovering parser so malformed XML does not abort the run,
    serialises every <context> element, tokenises the text with NLTK and
    filters out stop words and non-alphabetic tokens.

    :param path: path of the XML file to parse.
    :returns: tuple (list of token lists, list of raw serialised contexts).
    """
    # Recovering parser keeps going past malformed markup.
    lenient_parser = etree.XMLParser(recover=True)
    document = etree.parse(path, parser=lenient_parser)
    context_nodes = document.getroot().findall('.//context')

    # Serialise each <context> node; tostringlist may yield several
    # strings per node, so extend the list rather than append.
    data = []
    for node in context_nodes:
        data += etree.tostringlist(node, encoding="us-ascii", method="xml")

    # Strip the serialised <context> wrapper with the same fixed-width
    # slice the original used, then tokenise into words.
    tokenised = []
    for raw in data:
        text = str(raw)
        text = text[13:len(text) - 16]
        tokenised.append(nltk.word_tokenize(text))

    # English stop words extended with corpus-specific noise tokens.
    stop = set(stopwords.words('english'))
    stop.update(['head', 'Mr', 'I', 'In', 'Mrs', 'A', 'So', 'To',
                 'us', 'He', 'And', 'Yes'])

    # Drop stop words and non-alphabetic tokens in a single pass.
    for pos, words in enumerate(tokenised):
        tokenised[pos] = [w for w in words
                          if w not in stop and self.isalfa(w)]

    return tokenised, data
def process_xpaths(forest: dict, xpaths: dict):
    """Collect the plain text of every <h3> under each configured xpath.

    For every key in *xpaths*, evaluates ``xpaths[key] + '//h3'`` against
    the parsed tree in ``forest[key]``.

    :param forest: mapping of source name -> parsed lxml tree.
    :param xpaths: mapping of source name -> base xpath expression.
    :returns: dict mapping each source name to its list of heading strings.
    """
    headlines = {}
    for source in xpaths.keys():
        matched = forest[source].xpath(xpaths[source] + '//h3')
        headlines[source] = [
            etree.tostringlist(node, encoding='unicode', method='text')[0]
            for node in matched
        ]
    return headlines
def setup_project(product):
    """Create an Android project skeleton for *product*.

    Runs the SDK ``android create project`` tool, removes the generated
    source package tree, copies template config files into the project,
    then rewrites the project name in ``.project`` and the package name
    in ``AndroidManifest.xml``.

    Side effect: stores ``{'product': product}`` in the module-global
    ``confs`` (other functions in this module read it).
    """
    global confs
    confs = {'product': product}
    log.info("Creating project to %(product)s product." % confs)
    android = configspl.sdkdir + "/tools/android "
    # NOTE: activity and package parameters are rewritten later.
    # SECURITY: `product` is interpolated into shell commands below;
    # callers must not pass untrusted input here.
    params = "create project --target 1 --name %(product)s \
--path ../%(product)s --activity MainActivity \
--package br.ufrn.dimap.%(product)s" % confs
    log.info(android + params)
    os.system(android + params)
    shutil.rmtree("../%(product)s/src/br/ufrn" % confs)

    # Copy template configuration files into the new project directory.
    for f in ('.classpath', '.project', 'AndroidManifest.xml'):
        copiar = 'cp ./%(file)s ../%(product)s/' % {
            'file': f, 'product': confs['product']}
        os.system(copiar)

    # Point the Eclipse .project name at the product.
    tree = etree.parse("../%(product)s/.project" % confs)
    tree.find('name').text = confs['product']
    _write_project_xml("../%(product)s/.project" % confs, tree)

    # Qualify the manifest package with the product name.
    tree = etree.parse("../%(product)s/AndroidManifest.xml" % confs)
    tree.getroot().attrib['package'] += '.' + confs['product']
    _write_project_xml("../%(product)s/AndroidManifest.xml" % confs, tree)


def _write_project_xml(path, tree):
    """Serialize *tree* to *path* with an explicit XML declaration.

    Uses a context manager so the handle is closed even if serialization
    fails (the original open()/close() pair leaked the handle on error).
    """
    xml = [b'<?xml version="1.0" encoding="UTF-8"?>\n'] + etree.tostringlist(tree)
    with open(path, 'wb') as fh:
        fh.writelines(xml)
def extract_mail(path, suffix):
    """Read a mailbox file and build an XML tree of its messages.

    :param path: filesystem path to the mailbox file.
    :param suffix: mailbox-format selector; only '' (mbox) is supported.
    :returns: serialized XML as produced by
        ``etree.tostringlist(..., pretty_print=True)``.
    :raises ValueError: if *suffix* names an unsupported format (the
        original code fell through and crashed with a NameError instead).
    """
    tree_root = etree.Element("meta")
    if suffix == "":
        # Renamed from `object`, which shadowed the builtin.
        mailbox_obj = mailbox.mbox(path)
        mbox = etree.SubElement(tree_root, "mbox")
    # elif suffix == ".mbs":
    #     mailbox_obj = mailbox.MaildirMessage(path)
    #     mbox = etree.SubElement(tree_root, "maildir")
    else:
        raise ValueError("unsupported mailbox suffix: %r" % (suffix,))
    for message in mailbox_obj:
        subject = message['subject']
        sender = message['from']
        receiver = message['to']
        date = message['date']
        text = message['message']
        msg = etree.SubElement(mbox, "msg")
        etree.SubElement(msg, "to").text = receiver
        etree.SubElement(msg, "from").text = sender
        etree.SubElement(msg, "subject").text = subject
        etree.SubElement(msg, "date").text = date
        etree.SubElement(msg, "message").text = text
    tree = etree.tostringlist(tree_root, pretty_print=True)
    return tree
<li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0"><a href="link5.html">a属性</a> </ul> </div> ''' # #初始化xpath对象 # html = etree.HTML(text) # #解析对象输出代码 # result = etree.tostring(html,encoding='utf-8') #etree.parse是一个解析器,修复HTML中缺少的元素 html = etree.parse(r'C:\Users\dell\Desktop\xml.html', etree.HTMLParser()) #将html中的元素解析成字节 result = etree.tostring(html) #将html中的元素解析成列表 result1 = etree.tostringlist(html) #打印文本中所有的节点元素 element = html.xpath('//*') print(type(html)) print(type(result)) #打印经过处理后的代码 #etree会自动修复缺少的的文本节点 print(result.decode('utf-8')) print(result1) print(element) print('-' * 20 + '这是分隔符' + '-' * 20) #!/usr/bin/env python #coding :utf-8
from lxml import etree

# Build a small <table> document by hand (Python 2 script: note the
# print statement below).
page = etree.Element('table')
doc = etree.ElementTree(page)
headElt = etree.SubElement(page, 'tbody')

# Header row: two <th> cells.
tr = etree.SubElement(headElt, 'tr')
th = etree.SubElement(tr, 'th')
th.text = 'Type'
th = etree.SubElement(tr, 'th')
th.text = 'Server'

# Body row: a single <td> cell.
tr = etree.SubElement(headElt, 'tr')
td = etree.SubElement(tr, 'td')
td.text = 'Element1'

outFile = open('homemade2.xml', 'w')
#doc.write(outFile)
#doc.write(outFile,pretty_print=True, xml_declaration=True, encoding="utf-8")
# etree.dump() prints the tree itself and returns None, so this prints
# the XML followed by "None".
print etree.dump(page)
#outFile.seek(10)
# NOTE(review): `list` shadows the builtin, and outFile is never closed
# explicitly — the interpreter closes it at exit.
list = etree.tostringlist(page)
for item in list:
    outFile.write("%s\n" % item)
#outFile.write("\n".join(list).join("\n"))
from lxml import etree
import csv
from lxml.builder import E


def TYPE(*args):
    """Build a ``type`` attribute dict for the lxml E builder."""
    return {"type": ' '.join(args)}


def STATUT(*args):
    """Build a ``statut`` attribute dict for the lxml E builder."""
    return {"statut": ' '.join(args)}


with open('sanisettesparis.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';', quotechar='|')
    xmlFile = 'toilettes-paris-librairieLXML.xml'
    # Context manager guarantees the output file is closed even on error
    # (the original used a bare open()/close() pair).
    with open(xmlFile, 'w') as xmlData:
        xmlData.write('<?xml version="1.0"?>' + "\n")
        xmlData.write('<!DOCTYPE toilettes SYSTEM "wc.dtd">' + "\n")
        xmlData.write('<toilettes>' + "\n")
        for row in spamreader:
            page = (E.toilette(
                TYPE(row[0]),
                STATUT(row[1]),
                E.adresse(E.libelle(row[2]), E.arrondissement(row[3])),
                E.horaire(row[4]),
                E.services(E.acces_pmr(row[5]), E.relais_bebe(row[6])),
                E.equipement(row[7])))
            # BUG FIX: str(etree.tostringlist(...)) wrote the Python repr
            # of a list of bytes (e.g. "[b'<toilette.../>']") into the
            # file, producing invalid XML. Serialize to real text instead.
            xmlData.write(etree.tostring(page, encoding='unicode'))
        xmlData.write('</toilettes>' + "\n")
def get_entity_list_xml(
    entity_dict,
    nsmap,
    root_element_tag,
    sub_element_tag,
    root_prefix='None',
):
    """Serialize a dictionary of entities to an XML string.

    :param entity_dict: dict with the entity name as root key mapping to a
        list of entity dicts (attribute key/value pairs plus an optional
        'links' list of hyperlink dicts). An optional second key ending in
        "_links" carries page-level links, e.g.::

            {'vmhosts': [{'id': 'host-1234',
                          'name': 'newhost',
                          'links': [
                              {'rel': 'self',
                               'href': 'http://localhost:8774/v2/admin/vmhosts'},
                              {'rel': 'bookmark',
                               'href': 'http://localhost:8774/admin/vmhosts'}]}],
             'vmhosts_links': [
                 {'rel': 'next',
                  'href': 'http://localhost:8774/v2/admin/vmhosts&marker=4'}]}

    :param nsmap: namespace map to be used for the generated xml.
    :param root_element_tag: element tag of the root element.
    :param sub_element_tag: element tag for each entity sub element.
    :param root_prefix: prefix identifying the document namespace in nsmap.
    :returns: entities serialized as xml ('' for an empty entity_dict).
    :raises LookupError: if there is more than one root (key) element in
        the entity_dict.
    """
    if not entity_dict:
        return ''
    # TODO(siva): add check for entities_links
    # Materialize the keys: on Python 3 dict.keys() is a view and cannot
    # be indexed (the original code indexed it directly, which crashes).
    keys = list(entity_dict.keys())
    root_key = ''
    if len(keys) > 2:
        raise LookupError('More than one root element in entity')
    page_links = []
    if len(keys) == 2:
        # One key holds the entity list, the other the page-level links.
        if keys[0].endswith("_links"):
            page_links = entity_dict[keys[0]]
            root_key = keys[1]
        elif keys[1].endswith("_links"):
            root_key = keys[0]
            page_links = entity_dict[keys[1]]
        else:
            raise LookupError('More than one root element in entity')
    else:
        root_key = keys[0]
    root_namespace = ''
    if nsmap is not None and root_prefix in nsmap:
        root_namespace = '{%s}' % nsmap[root_prefix]
    root = Element(root_namespace + root_element_tag, nsmap=nsmap)
    for ent in entity_dict[root_key]:
        if not ent:
            continue
        link_list = []
        if 'links' in ent:
            link_list = ent['links']
            del ent['links']
        # Attribute values must be strings; None becomes ''.
        attrib = {}
        for (key, val) in ent.items():
            if key is not None:
                attrib[key] = val if val is not None else ''
        entity_sub = SubElement(root, root_namespace + sub_element_tag, attrib)
        for link in link_list:
            SubElement(entity_sub, constants.ATOM + 'link', link)
    for link in page_links:
        SubElement(root, constants.ATOM + 'link', link)
    return etree.tostringlist(root)[0]
logger = logging.getLogger('session_test')
logger.critical("Started")

# Device under test; port 8443 presumably is the switch's HTTPS API port
# — TODO confirm against the Switch class.
switch1 = Switch("172.16.1.166", port="8443")
switch1.set_variable('uid', 'cisco')
switch1.set_variable('pwd', 'cisco')
print(switch1)

# JSON-formatted command run.
resp = switch1.run_commands(data=['show version'], format='json')
time.sleep(1)
pprint(resp)

# XML-formatted run; resp is an element tree here (findall/dump work).
resp = switch1.run_commands(data=['show version'], format='xml')
time.sleep(1)
ET.dump(resp)
pprint(ET.tostringlist(resp))
pprint(resp.findall(".//kickstart_ver_str")[0].text)

# Multiple commands in one XML-formatted request.
resp = switch1.run_commands(data=['show vrf', 'show ip arp', 'show mac address-table'], format='xml')
time.sleep(1)
ET.dump(resp)
print(ET.tostringlist(resp))

print()
print("=" * 80)
# Same commands over the 'xss' transport, pretty-printed to stdout.
print("Printing vrf")
print(ET.tostring(switch1.run_commands(['show vrf'], transport='xss', port=22), encoding='unicode', pretty_print=True))
print("Printing arp")
print(ET.tostring(switch1.run_commands(['show ip arp'], transport='xss', port=22), encoding='unicode', pretty_print=True))
print("Print macs")
print(ET.tostring(switch1.run_commands(['show mac address-table'], transport='xss'), encoding='unicode', pretty_print=True))
w = 1 if int(item.tag[1:])==lasthai: discard2 = ET.SubElement(game, "discard2") discard2.text=th2txt(int(item.tag[1:])) else: discard1 = ET.SubElement(game, "discard1") discard1.text=th2txt(int(item.tag[1:])) lasthai = int(item.tag[1:]) elif item.tag[0] == 'F': w = 2 if int(item.tag[1:])==lasthai: discard2 = ET.SubElement(game, "discard2") discard2.text=th2txt(int(item.tag[1:])) else: discard1 = ET.SubElement(game, "discard1") discard1.text=th2txt(int(item.tag[1:])) lasthai = int(item.tag[1:]) elif item.tag[0] == 'G': w = 3 if int(item.tag[1:])==lasthai: discard2 = ET.SubElement(game, "discard2") discard2.text=th2txt(int(item.tag[1:])) else: discard1 = ET.SubElement(game, "discard1") discard1.text=th2txt(int(item.tag[1:])) lasthai = int(item.tag[1:]) continue mjr.writelines(ET.tostringlist(root,encoding="utf-8",pretty_print=True)) mjr.close
def save(dbcon, filename):
    '''Write contents of plexos sqlite database to xml filename

    Args:
        dbcon - sqlite database connection
        filename - Location to save plexos XML file. The file will be
            overwritten if it exists

    No Return
    '''
    # TODO: Check for overwrite existing xml
    # Get list of objects with objname
    dbcon.row_factory = sql.Row
    cur = dbcon.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = [t[0] for t in cur.fetchall()]
    # utf-8-sig writes a BOM, which plexos apparently expects.
    with codecs.open(filename, "w", "utf-8-sig") as fout:
        # file writing in Python3 is different than 2, have to convert
        # strings to bytes or open the file with an encoding. There is no
        # easy write for all data types
        # Metadata (namespace + root element) comes from META_TABLE when
        # present; otherwise fall back to the MasterDataSet defaults.
        plexos_meta = {}
        try:
            cur.execute("SELECT name, value FROM '%s'" % (META_TABLE))
        except sql.Error:
            LOGGER.warning("No metadata found in table %s", META_TABLE)
            plexos_meta['namespace'] = "http://tempuri.org/MasterDataSet.xsd"
            plexos_meta['root_element'] = "MasterDataSet"
        else:
            for row in cur.fetchall():
                plexos_meta[row[0]] = row[1]
        fout.write('<%s xmlns="%s">\r\n' % (plexos_meta['root_element'], plexos_meta['namespace']))
        # Emit one t_<table> element per row of every table (except the
        # metadata table itself).
        for table_name in sorted(tables):
            if table_name == META_TABLE:
                continue
            try:
                cur.execute("SELECT * FROM '%s'" % (table_name))
            except sql.Error:
                LOGGER.warning("Bad table %s", table_name)
                continue
            row_keys = [k[0] for k in cur.description]
            #cElementTree has no pretty print, so some convolution is needed
            row = cur.fetchone()
            while row is not None:
                fout.write(' ')
                ele = etree.Element('t_' + table_name)
                for (sube, val) in zip(row_keys, row):
                    # Uncommenting the following will ignore subelements with no values
                    # Sometimes missing subelements with no values were crashing plexos.
                    # See issue #54
                    if val is None:
                        continue
                    attr_ele = etree.SubElement(ele, sube)
                    if isinstance(val, int):
                        val = str(val)
                    attr_ele.text = val
                ele_slist = etree.tostringlist(ele)
                # This is done because in python2, to_string prepends the
                # string with an xml declaration. Also in python2, the base
                # class of 'bytes' is basestring
                # TODO: Will this ever process an element with no data?
                if isinstance(ele_slist[0], str):
                    ele_s = "".join(ele_slist)
                else:
                    # Python3 bytes object
                    ele_s = ""
                    for byte_list in ele_slist:
                        ele_s += byte_list.decode('UTF-8')
                # Hand-rolled pretty printing: break between adjacent tags,
                # then pull the closing tag back onto the last line.
                fout.write(
                    ele_s.replace('><', '>\r\n <').replace(' </t_', '</t_'))
                fout.write('\r\n')
                row = cur.fetchone()
        fout.write('</%s>\r\n' % plexos_meta['root_element'])
def get_entity_list_xml(
    entity_dict,
    nsmap,
    root_element_tag,
    sub_element_tag,
    root_prefix='None',
):
    """Serialize a dictionary of entities to an XML string.

    :param entity_dict: dict with the entity name as root key mapping to a
        list of entity dicts. Each entity dict holds attribute key/value
        pairs plus an optional 'links' list of hyperlink dicts. A second
        key ending in "_links" may carry page-level links, e.g.::

            {'vmhosts': [{'id': 'host-1234',
                          'name': 'newhost',
                          'links': [
                              {'rel': 'self',
                               'href': 'http://localhost:8774/v2/admin/vmhosts'},
                              {'rel': 'bookmark',
                               'href': 'http://localhost:8774/admin/vmhosts'}]}],
             'vmhosts_links': [
                 {'rel': 'next',
                  'href': 'http://localhost:8774/v2/admin/vmhosts&marker=4'}]}

    :param nsmap: namespace map to be used for the generated xml.
    :param root_element_tag: element tag of the root element.
    :param sub_element_tag: element tag for each entity sub element.
    :param root_prefix: prefix identifying the document namespace in nsmap.
    :returns: entities serialized as xml ('' for an empty entity_dict).
    :raises LookupError: if there is more than one root (key) element in
        the entity_dict.
    """
    if not entity_dict:
        return ''
    # TODO(siva): add check for entities_links
    # Python 3 compatibility fix: dict.keys() returns a non-indexable
    # view, so materialize it once up front.
    keys = list(entity_dict.keys())
    root_key = ''
    if len(keys) > 2:
        raise LookupError('More than one root element in entity')
    page_links = []
    if len(keys) == 2:
        # Exactly one of the two keys must be the "_links" page links.
        if keys[0].endswith("_links"):
            page_links = entity_dict[keys[0]]
            root_key = keys[1]
        elif keys[1].endswith("_links"):
            root_key = keys[0]
            page_links = entity_dict[keys[1]]
        else:
            raise LookupError('More than one root element in entity')
    else:
        root_key = keys[0]
    root_namespace = ''
    if nsmap is not None and root_prefix in nsmap:
        root_namespace = '{%s}' % nsmap[root_prefix]
    root = Element(root_namespace + root_element_tag, nsmap=nsmap)
    for ent in entity_dict[root_key]:
        if not ent:
            continue
        link_list = []
        if 'links' in ent:
            link_list = ent['links']
            del ent['links']
        # Attribute values must be strings; map None to ''.
        attrib = {}
        for (key, val) in ent.items():
            if key is not None:
                attrib[key] = val if val is not None else ''
        entity_sub = SubElement(root, root_namespace + sub_element_tag, attrib)
        for link in link_list:
            SubElement(entity_sub, constants.ATOM + 'link', link)
    for link in page_links:
        SubElement(root, constants.ATOM + 'link', link)
    return etree.tostringlist(root)[0]
element.getparent().replace(element, replace_element) for (element, replace_element, out_dict) in \ elements_to_be_replaced: LOG.debug( _('Replaced element path: %s' % replace_element.getroottree().getpath( replace_element))) replace_dict_out.update( {tree.getpath(replace_element): out_dict}) except (KeyError, IndexError, ValueError), err: LOG.error( _('Lookup Error while finding tag healthnmon api... %s ' % str(err)), exc_info=1) return etree.tostringlist(tree.getroot())[0] def dump_resource_xml(resource_obj, tag): """Serialize object using resource model """ LOG.debug(_('Exporting tag: %s as xml...' % tag)) xml_out_file = StringIO.StringIO() resource_obj.export(xml_out_file, 0, name_=tag) return xml_out_file.getvalue() def get_project_context(req): """ Get project context from request :param req: request object from which context would be fetched. :returns: project context tuple (context, project_id)
def print_tree(self):
    """Pretty-print this object's XML tree to stdout."""
    serialized = etree.tostringlist(self.tree, pretty_print=True)
    print(serialized)
tree._setroot(replace_element) else: element.getparent().replace(element, replace_element) for (element, replace_element, out_dict) in \ elements_to_be_replaced: LOG.debug(_('Replaced element path: %s' % replace_element.getroottree().getpath( replace_element))) replace_dict_out.update( {tree.getpath(replace_element): out_dict}) except (KeyError, IndexError, ValueError), err: LOG.error(_('Lookup Error while finding tag \ healthnmon api... %s ' % str(err)), exc_info=1) return etree.tostringlist(tree.getroot())[0] def dump_resource_xml(resource_obj, tag): """Serialize object using resource model """ LOG.debug(_('Exporting tag: %s as xml...' % tag)) xml_out_file = StringIO.StringIO() resource_obj.export(xml_out_file, 0, name_=tag) return xml_out_file.getvalue() def get_project_context(req): """ Get project context from request :param req: request object from which context would be fetched. :returns: project context tuple (context, project_id)
elif item.tag[0] == 'E': w = 1 if int(item.tag[1:]) == lasthai: discard2 = ET.SubElement(game, "discard2") discard2.text = th2txt(int(item.tag[1:])) else: discard1 = ET.SubElement(game, "discard1") discard1.text = th2txt(int(item.tag[1:])) lasthai = int(item.tag[1:]) elif item.tag[0] == 'F': w = 2 if int(item.tag[1:]) == lasthai: discard2 = ET.SubElement(game, "discard2") discard2.text = th2txt(int(item.tag[1:])) else: discard1 = ET.SubElement(game, "discard1") discard1.text = th2txt(int(item.tag[1:])) lasthai = int(item.tag[1:]) elif item.tag[0] == 'G': w = 3 if int(item.tag[1:]) == lasthai: discard2 = ET.SubElement(game, "discard2") discard2.text = th2txt(int(item.tag[1:])) else: discard1 = ET.SubElement(game, "discard1") discard1.text = th2txt(int(item.tag[1:])) lasthai = int(item.tag[1:]) continue mjr.writelines(ET.tostringlist(root, encoding="utf-8", pretty_print=True)) mjr.close
    # Tail of process_xpaths (its `def` line is above this chunk):
    # collect the text of every <h3> under each source's base xpath.
    for new in xpaths.keys():
        pages = []
        book = forest[new].xpath(xpaths[new] + '//h3')
        for page in book:
            pages.append(
                etree.tostringlist(page, encoding='unicode', method='text')[0])
        news[new] = pages
    return news


def get_news(urls={}, xpaths={}):
    """Fetch the configured pages and return their scraped headlines.

    Falls back to DEFAULT_URLS / DEFAULT_NEWS_XPATHS when arguments are
    empty.
    """
    # NOTE(review): mutable default arguments — safe only because they
    # are read, never mutated, here.
    if not urls:
        urls = DEFAULT_URLS
    if not xpaths:
        xpaths = DEFAULT_NEWS_XPATHS
    garden = forest(urls)
    return process_xpaths(garden, xpaths)


if __name__ == '__main__':
    # news = get_news()
    # Ad-hoc scrape of the Cointelegraph slideshow headlines.
    tree = tree_from_html('https://cointelegraph.com/')
    data = tree.xpath('//*[@id="js-main-slideshow-pager"]//h3')
    news = []
    for new in data:
        news.append(
            etree.tostringlist(new, encoding='unicode', method='text')[0])
    breaking_news = get_news()