def xlsx(path):
    """Return the contents of sheet1 of an .xlsx file as a list of rows,
    where each row is a list of column values (strings).

    Rows whose cells are all empty are skipped; every returned row is
    padded so that all rows cover the same set of columns, ordered
    A..Z, AA.. (length first, then alphabetical).
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    rows = []       # completed rows, as {column letter: value}
    row = {}        # row currently being filled
    value = ""
    with zipfile.ZipFile(path) as z:   # was left open before
        # shared-string pool; cells with t="s" store an index into it
        shared = [el.text for _, el in iterparse(z.open("xl/sharedStrings.xml"))
                  if el.tag.endswith("}t")]
        for _, el in iterparse(z.open("xl/worksheets/sheet1.xml")):
            if el.tag.endswith("}v"):   # <v>84</v>
                value = el.text
            if el.tag.endswith("}c"):   # <c r="A3" t="s"><v>84</v></c>
                # only t="s" is a shared-string reference; the old code
                # dereferenced the pool for ANY t attribute
                if el.attrib.get("t") == "s":
                    value = shared[int(value)]
                column = el.attrib["r"].rstrip("0123456789")  # 'AZ22' -> 'AZ'
                row[column], value = value, ""
            if el.tag.endswith("}row"):
                if any(row.values()):   # skip empty rows
                    rows.append(row)
                row = {}
    # fill empty cells: every row gets every column seen in any row.
    # (The old code used an undefined CELLS constant and plain string
    # sorting, which mis-orders multi-letter columns like 'AA'; it also
    # crashed on an empty sheet via max() on an empty sequence.)
    columns = sorted({c for r in rows for c in r}, key=lambda c: (len(c), c))
    return [[r.get(c, "") for c in columns] for r in rows]
def xlsx(fname):
    """Read sheet1 of an .xlsx file.

    Returns a list of row dicts mapping column letters ('A', 'B', ...)
    to cell values (strings); empty rows are included.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    with zipfile.ZipFile(fname) as zippy:   # was never closed
        try:
            # shared-string pool; cells with t="s" store an index into it
            words = [el.text for _, el in iterparse(zippy.open("xl/sharedStrings.xml"))
                     if el.tag.endswith("}t")]
        except KeyError:
            # workbook has no shared-strings part. The old bare `except:`
            # fell back to `words = {}` (a dict), which would raise
            # KeyError on the first t="s" cell.
            words = []
        rows = []
        row = {}
        val = ""
        for _, el in iterparse(zippy.open("xl/worksheets/sheet1.xml")):
            if el.tag.endswith("}v"):   # <v>84</v>
                val = el.text
            if el.tag.endswith("}c"):   # <c r="A3" t="s"><v>84</v></c>
                if el.attrib.get("t") == "s":
                    val = words[int(val)]
                charac = el.attrib["r"].rstrip("0123456789")  # 'AZ22' -> 'AZ'
                row[charac] = val
                val = ""
            if el.tag.endswith("}row"):
                rows.append(row)
                row = {}
    return rows
def readXlsx(fileName, **args):
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    """Parse worksheet ``sheet<n>`` of an .xlsx file.

    Keyword args:
        sheet (int): 1-based worksheet number (default 1).
        header (bool): if True, key each row dict by the first row's
            (stripped) cell values instead of by column letter.

    Returns:
        [header, rows]: header maps column letter -> first-row value;
        rows is a list of dicts, one per sheet row (the first entry is
        the empty dict produced while the header row is consumed).
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get("sheet", 1)          # was an if/else chain
    isHeader = args.get("header", False)

    rows = []
    row = {}
    header = {}
    value = ""
    # context manager guarantees the archive is closed even on error
    with zipfile.ZipFile(fileName) as z:
        # shared-string pool; cells with t="s" store an index into it
        strings = [el.text for _, el in iterparse(z.open("xl/sharedStrings.xml"))
                   if el.tag.endswith("}t")]
        for _, el in iterparse(z.open("xl/worksheets/sheet%d.xml" % sheet)):
            if el.tag.endswith("}v"):   # <v>84</v>
                value = el.text
            if el.tag.endswith("}c"):   # <c r="A3" t="s"><v>84</v></c>
                if el.attrib.get("t") == "s":
                    value = strings[int(value)]
                letter = el.attrib["r"].rstrip("0123456789")  # 'AZ22' -> 'AZ'
                if not rows:            # first sheet row builds the header
                    header[letter] = value.strip()
                elif value != "":
                    if isHeader and letter in header:
                        row[header[letter]] = value
                    else:
                        row[letter] = value
                value = ""
            if el.tag.endswith("}row"):
                rows.append(row)
                row = {}
    return [header, rows]
def read_xlsx(file, **args):
    # type: (typing.Any, **typing.Any) -> typing.Tuple[typing.Dict[typing.Any, str], typing.List[typing.Dict[str, str]]]
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    """Parse one worksheet of an .xlsx archive.

    Keyword arguments: ``sheet`` (1-based worksheet number, default 1)
    and ``header`` (if True, key rows by the header row's names).
    Returns the pair ``(header, rows)``.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet_no = args.get('sheet', 1)
    use_header = args.get('header', False)

    header = {}        # column letter -> stripped first-row value
    parsed = []        # one dict per sheet row
    current = {}       # cells of the row being built
    cell_value = ''

    archive = zipfile.ZipFile(file)
    # shared-string pool: cells marked t="s" store an index into this list
    shared = [node.text
              for _, node in iterparse(archive.open('xl/sharedStrings.xml'))
              if node.tag.endswith('}t')]

    for _, node in iterparse(archive.open('xl/worksheets/sheet%d.xml' % sheet_no)):
        tag = node.tag
        if tag.endswith('}v'):                  # <v>84</v>
            cell_value = node.text
        if tag.endswith('}c'):                  # <c r="A3" t="s"><v>84</v></c>
            if node.attrib.get('t') == 's':
                cell_value = shared[int(cell_value)]
            column = node.attrib['r']           # e.g. AZ22
            while column[-1].isdigit():
                column = column[:-1]
            if not parsed:
                # first row: collect candidate column names
                header[column] = cell_value.strip()
            elif cell_value != '':
                if use_header is True and column in header:
                    current[header[column]] = cell_value
                else:
                    current[column] = cell_value
            cell_value = ''
        if tag.endswith('}row'):
            parsed.append(current)
            current = {}

    archive.close()
    return header, parsed
def readXlsx(fileName, **args):
    """Parse worksheet ``sheet<n>`` of an .xlsx file and return its rows.

    Keyword args:
        sheet (int): 1-based worksheet number (default 1).
        header (bool): if True, key row dicts by the first row's values.

    Returns:
        list of dicts, one per sheet row; the first entry is the empty
        dict produced while the header row is consumed.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get('sheet', 1)           # was an if/else chain
    isHeader = args.get('header', False)

    rows = []
    row = {}
    header = {}
    value = ''
    with zipfile.ZipFile(fileName) as z:   # was not closed on error
        # shared-string pool; cells with t="s" store an index into it
        strings = [el.text for _, el in iterparse(z.open('xl/sharedStrings.xml'))
                   if el.tag.endswith('}t')]
        for _, el in iterparse(z.open('xl/worksheets/sheet%d.xml' % sheet)):
            if el.tag.endswith('}v'):   # <v>84</v>
                value = el.text
            if el.tag.endswith('}c'):   # <c r="A3" t="s"><v>84</v></c>
                if el.attrib.get('t') == 's':
                    value = strings[int(value)]
                letter = el.attrib['r'].rstrip('0123456789')  # 'AZ22' -> 'AZ'
                if not rows:            # first row: remember header names
                    header[letter] = value
                elif value != '':
                    if isHeader and letter in header:
                        row[header[letter]] = value
                    else:
                        row[letter] = value
                value = ''
            if el.tag.endswith('}row'):
                rows.append(row)
                row = {}
    return rows
def main():
    """Load Posts.xml and Users.xml into the 'bigdata.db' SQLite database.

    Builds table SO from posts tagged jquery/javascript/python and table
    USERS from users that carry a Location attribute.

    NOTE(review): `lite` and `iterparse` are bound at module level outside
    this chunk (presumably `import sqlite3 as lite`); `context.next()` is
    Python 2 only — confirm the target interpreter.
    """
    limited_tags = ['jquery','javascript','python']
    con = lite.connect('bigdata.db')
##    tree = ET.parse('Posts.xml')
##    root = tree.getroot()
    # get an iterable
    context = iterparse('Posts.xml', events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element
    event, root = context.next()
    with con:
        # Commented sections below create a separate table for tags
        #tags_dict = {}
        cur = con.cursor()
        cur.execute("CREATE TABLE SO(Id INTEGER PRIMARY KEY ASC, Tags TEXT, CreationDate TEXT, UserID INTEGER)")
        #cur.execute("CREATE TABLE TAGS(Id INTEGER PRIMARY KEY ASC, Tag TEXT)")
        #tag_id = 0
        for event, child in context:
            # keep only completed (<end>) question rows that carry one of
            # the three tags of interest
            if event == "end" and 'Title' in child.attrib and 'OwnerUserId' in child.attrib and (limited_tags[0] in child.attrib['Tags'] or limited_tags[1] in child.attrib['Tags'] or limited_tags[2] in child.attrib['Tags']):
                sqlQuery = "INSERT INTO SO VALUES(?,?,?,?)"
                cur.execute(sqlQuery,(child.attrib['Id'],child.attrib['Tags'],child.attrib['CreationDate'],child.attrib['OwnerUserId']))
            # tags = child.attrib['Tags'].replace('<','').split('>')[:-1]
            # for tag in tags:
            #     if not tag in tags_dict:
            #         tags_dict[tag] = tag_id
            #         tag_id+=1
            # free memory held by already-processed elements while streaming
            root.clear()
        # sqlQuery = "INSERT INTO TAGS VALUES(?,?)"
        # for tag in tags_dict:
        #     cur.execute(sqlQuery,(tags_dict[tag],tag))
    # get an iterable
    context = iterparse('Users.xml', events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element
    event, root = context.next()
    with con:
        cur = con.cursor()
        cur.execute("CREATE TABLE USERS(UserID INTEGER PRIMARY KEY ASC, Location TEXT)")
        for event, child in context:
            if event == "end" and 'Location' in child.attrib:
                sqlQuery = "INSERT INTO USERS VALUES(?,?)"
                cur.execute(sqlQuery,(child.attrib['Id'],child.attrib['Location']))
            root.clear()
def wait_for_new_job(sasl_token):
    # https://developers.google.com/cloud-print/docs/rawxmpp
    """Open a raw XMPP-over-SSL connection to Google Talk, authenticate
    with *sasl_token*, subscribe to the cloudprint push channel, and block
    until the next stanza (a new-job notification) arrives.

    NOTE(review): ssl.wrap_socket() is removed in modern Python, and
    xmpp.write() relies on the SSLSocket API — confirm the target runtime.
    """
    import ssl, socket
    from xml.etree.ElementTree import iterparse, tostring
    xmpp = ssl.wrap_socket(socket.socket())
    xmpp.connect(("talk.google.com", 5223))
    parser = iterparse(xmpp, ("start", "end"))

    def msg(msg=" "):
        # Send one stanza, then read events until the reply element is
        # balanced again (stack back to 0); stream-level tags are skipped.
        xmpp.write(msg)
        stack = 0
        for event, el in parser:
            if event == "start" and el.tag.endswith("stream"):
                continue
            stack += 1 if event == "start" else -1
            if stack == 0:
                # any failure/error stanza aborts with the raw XML attached
                assert (
                    not el.tag.endswith("failure")
                    and not el.tag.endswith("error")
                    and not el.get("type") == "error"
                ), tostring(el)
                return el

    msg('<stream to="gmail.com" version="1.0" xmlns="http://etherx.jabber.org/streams">')
    msg('<auth xmlns="urn:ietf:params:xml:ns:xmpp-sasl" mechanism="X-GOOGLE-TOKEN">%s</auth>' % sasl_token)
    msg('<s:stream to="gmail.com" version="1.0" xmlns:s="http://etherx.jabber.org/streams" xmlns="jabber:client">')
    iq = msg('<iq type="set"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind"><resource>Armooo</resource></bind></iq>')
    # bound JID without the resource suffix
    bare_jid = iq[0][0].text.split("/")[0]
    msg(
        '<iq type="set" to="%s"><subscribe xmlns="google:push"><item channel="cloudprint.google.com" from="cloudprint.google.com"/></subscribe></iq>'
        % bare_jid
    )
    # final call sends a keep-alive space and blocks for the next push
    return msg()
def upgrade_nrml(directory, dry_run):
    """
    Upgrade all the NRML files contained in the given directory to the latest
    NRML version. Works by walking all subdirectories.
    WARNING: there is no downgrade!

    :param directory: root directory to walk
    :param dry_run: if true, only report what would be upgraded
    """
    for cwd, dirs, files in os.walk(directory):
        for f in files:
            path = os.path.join(cwd, f)
            if not f.endswith('.xml'):
                continue
            ip = iterparse(path)
            try:
                # tag of the first parsed element, e.g. '{xmlns}tag'
                # (was `ip.next()`, which is Python 2 only)
                fulltag = next(ip)[1].tag
                xmlns, tag = fulltag.split('}')
            except Exception:
                # not a (well-formed) NRML file. The old bare
                # `except: pass` left xmlns undefined or stale from the
                # previous file, causing a NameError / wrong match below.
                xmlns, tag = '', ''
            if xmlns[1:] == NRML05:  # already upgraded
                pass
            elif 'nrml/0.4' in xmlns and 'vulnerability' in f:
                if not dry_run:
                    print('Upgrading', path)
                    try:
                        upgrade_file(path)
                    except Exception as exc:
                        print(exc)
                else:
                    print('Not upgrading', path)
            ip._file.close()
def parse_and_remove(filename, path):
    """Incrementally yield elements whose ancestor tag chain matches the
    '/'-separated *path* (e.g. 'row/row'), pruning each yielded element
    from its grandparent to keep memory bounded (python-cookbook recipe,
    with debug prints left in).

    NOTE(review): because the root element is skipped, elem_stack[-2]
    assumes the match is at least two levels below the root — an
    IndexError otherwise. Confirm callers always pass a nested path.
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # skip the root element so the stacks track only elements below it
    next(doc)
    tag_stack = []    # tags of the currently open elements
    elem_stack = []   # the corresponding element objects
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
            print("start.\n")
            print("tag_stack:", tag_stack, "\n")
            print("elem_stack", elem_stack, "\n")
        elif event == 'end':
            if tag_stack == path_parts:
                print("end.\n")
                print("elem:", elem)
                yield elem
                print("elem_stack[-2]", elem_stack[-2])
                # prune the consumed subtree from its grandparent
                elem_stack[-2].remove(elem)
            try:
                tag_stack.pop()
                elem_stack.pop()
            except IndexError:
                pass
def loadScheme(self):
    """Load ``schemas/<scheme>/schema.xml`` for this feed and populate
    self.schema (leaf descriptors), self.schema_root and
    self.schema_container.

    Each descriptor is a `struct` (defined elsewhere) with .xpath, .tag
    and, for leaves, .desc taken from the element text.

    NOTE(review): at a 'start' event the element has no children parsed
    yet, so `list(node)` is always empty here and every element takes the
    leaf branch — confirm whether 'end' events were intended.
    """
    que = []  # stack of open tags; joined to build xpath strings
    scheme = self.feed.output_scheme
    # map_file is computed but not used in this block — TODO confirm
    map_file = self.feed.map_rules if self.feed.map_rules else ''
    if scheme == None:
        return
    filepath = os.path.join("schemas", scheme, "schema.xml")
    for (event, node) in iterparse(filepath, ['start', 'end']):
        if event == 'end':
            que.pop()
        if event == 'start':
            que.append(node.tag)
            if not list(node):
                # leaf element: record it as a schema field
                o = struct()
                o.xpath = "/".join(que[1:])  # xpath relative to the root
                o.tag = node.tag
                o.desc = node.text
                self.schema.append(o)
            else:
                if len(que) == 1:
                    # document root
                    o = struct()
                    o.xpath = "/".join(que)
                    o.tag = node.tag
                    self.schema_root = o
                elif len(que) == 2:
                    # first-level container element
                    o = struct()
                    o.xpath = "/".join(que)
                    o.tag = node.tag
                    self.schema_container = o
def xml2sqlite(bron):
    """Flatten the XML document *bron* into two SQLite tables.

    Table ``elems`` gets one row per element: (id, id_parent, tag,
    stripped text, repr of the attribute dict). Table ``attrib`` gets one
    row per attribute: (running id, owning element id, name, value).
    Element ids encode depth and a per-depth running count as
    ``depth * 10_000_000 + count`` (counters are never reset, matching
    the original numbering). Uses the module-level connection ``conn``.

    The original built variable names with Python 2 ``exec`` and spliced
    values into SQL with %s (breaking on quotes / enabling injection);
    this version uses a per-depth counter dict and bound parameters.
    """
    cur = conn.cursor()
    cur.execute("drop table if exists elems")
    cur.execute("drop table if exists attrib")
    cur.execute("Create table elems (id integer,id_parent integer,tag varchar(100),text varchar(100), attrib text)")
    cur.execute("Create table attrib (id integer,id_elem integer,name varchar(100),value varchar(100))")
    niveau = 0        # current nesting depth
    attrib_id = 0     # running id across all attributes of all elements
    num = {0: 0}      # per-depth element counters (replaces exec'd num_<d>)
    for event, node in iterparse(bron, ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            niveau = niveau - 1
        if event == 'start':
            # parent id = id of the most recent element at the current depth
            elemvorig = niveau * 10000000 + num[niveau]
            niveau = niveau + 1
            num[niveau] = num.get(niveau, 0) + 1
            elem = niveau * 10000000 + num[niveau]
            text = node.text.rstrip() if node.text else ''
            if node.keys():
                dict1 = {}
                for name in node.keys():
                    attrib_id = attrib_id + 1
                    value = node.attrib.get(name)
                    cur.execute("insert into attrib values (?,?,?,?)",
                                (attrib_id, elem, name, value))
                    dict1[name] = value
                attrib2 = str(dict1)
            else:
                attrib2 = ''
            cur.execute("insert into elems values (?,?,?,?,?)",
                        (elem, elemvorig, node.tag, text, attrib2))
def parse_and_remove(filename, out):
    """Group <message> elements of *filename* by their 'CATEGORY:' marker
    and write them, sorted by category, to the file *out*.

    Only messages whose text contains all of the QID/TITLE/BODY/CATEGORY
    markers are kept. Per-category counts are printed as a side effect.
    """
    doc = iterparse(filename, ('start', 'end'))
    categories = {}   # category -> count
    questions = {}    # category -> [message texts]
    for event, elem in doc:
        if event != 'end' or elem.tag != 'message':
            continue
        # guard: an empty <message/> has text None, which crashed the
        # original `'QID' in elem.text` membership test
        text = elem.text or ''
        if 'QID' in text and 'TITLE' in text and 'BODY' in text and 'CATEGORY' in text:
            start_ind = text.rfind('CATEGORY:')
            if start_ind != -1:
                cat = text[start_ind + len('CATEGORY:'):].strip()
                if not categories.get(cat):
                    categories[cat] = 1
                    questions[cat] = [text]
                else:
                    categories[cat] += 1
                    questions[cat].append(text)
    print(categories)
    with open(out, 'w') as outfile:
        for cat, texts in sorted(questions.items(), key=lambda x: x[0]):
            outfile.write('***%s***\n' % cat)
            for q in texts:
                outfile.write('%s\n' % q)
def is_attrib_unique(filename, attrib):
    """Return True if the value of *attrib* is unique across all <node>
    and <way> elements of the OSM-style file *filename*.

    Also prints the total number of attribute occurrences and the number
    of unique / duplicated values.
    """
    counts = defaultdict(int)   # attrib value -> occurrences
    id_count = 0                # total occurrences of the attribute
    for _, node in iterparse(filename, ['start']):
        if node.tag in ('way', 'node'):
            # direct membership test replaces the loop over every
            # attribute name comparing each one to `attrib`
            if attrib in node.attrib:
                id_count += 1
                counts[node.attrib[attrib]] += 1
            node.clear()  # keep memory bounded while streaming
    dupe_id = sum(1 for v in counts.values() if v > 1)
    unique_id = len(counts) - dupe_id
    # print(...) with a single argument is valid Python 2 and 3,
    # unlike the original `print 'x'` statements
    print('Uid found: ' + str(id_count))
    print('Unique uids: ' + str(unique_id))
    print('Duplicate uids: ' + str(dupe_id))
    return dupe_id == 0
def wait_for_new_job(sasl_token):
    # https://developers.google.com/cloud-print/docs/rawxmpp
    """Variant of the raw-XMPP cloudprint listener: connect to Google
    Talk over SSL, authenticate with *sasl_token*, bind a resource,
    open a session, subscribe to the cloudprint push channel, then block
    until the next stanza arrives and return it.

    NOTE(review): ssl.wrap_socket() is removed in modern Python and
    xmpp.write() relies on the SSLSocket API — confirm target runtime.
    """
    import ssl, socket
    from xml.etree.ElementTree import iterparse, tostring
    xmpp = ssl.wrap_socket(socket.socket())
    xmpp.connect(('talk.google.com', 5223))
    parser = iterparse(xmpp, ('start', 'end'))

    def msg(msg=' '):
        # Send one stanza, then consume events until the reply element is
        # balanced (stack back to 0); stream-level tags are ignored.
        xmpp.write(msg)
        stack = 0
        for event, el in parser:
            if event == 'start' and el.tag.endswith('stream'):
                continue
            stack += 1 if event == 'start' else -1
            if stack == 0:
                # failure/error stanzas abort with the raw XML attached
                assert not el.tag.endswith('failure') and not el.tag.endswith('error') and not el.get('type') == 'error', tostring(el)
                return el

    msg('<stream:stream to="gmail.com" xml:lang="en" version="1.0" xmlns:stream="http://etherx.jabber.org/streams" xmlns="jabber:client">')
    msg('<auth xmlns="urn:ietf:params:xml:ns:xmpp-sasl" mechanism="X-GOOGLE-TOKEN" auth:allow-generated-jid="true" auth:client-uses-full-bind-result="true" xmlns:auth="http://www.google.com/talk/protocol/auth">%s</auth>' % sasl_token)
    msg('<stream:stream to="gmail.com" xml:lang="en" version="1.0" xmlns:stream="http://etherx.jabber.org/streams" xmlns="jabber:client">')
    iq = msg('<iq type="set" id="0"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind"><resource>Armooo</resource></bind></iq>')
    # bound JID without the resource suffix
    bare_jid = iq[0][0].text.split('/')[0]
    msg('<iq type="set" id="2"><session xmlns="urn:ietf:params:xml:ns:xmpp-session"/></iq>')
    msg('<iq type="set" id="3" to="%s"><subscribe xmlns="google:push"><item channel="cloudprint.google.com" from="cloudprint.google.com"/></subscribe></iq>' % bare_jid)
    # final call sends a keep-alive space and blocks for the next push
    return msg()
def __getitem__(self, identifier):
    """
    Access the item with id 'identifier' in the file by iterating
    the xml-tree.

    Arguments:
        identifier (str): native id of the item to access

    Returns:
        data (str): text associated with the given identifier

    NOTE(review): spectra are matched on the integer scan number, so an
    int identifier is needed for them while chromatograms compare the
    full id string; if nothing matches, next() eventually raises
    StopIteration rather than KeyError — confirm intended behavior.
    """
    # remember the caller's position so the lookup does not disturb any
    # in-progress streaming over the same handle
    old_pos = self.file_handler.tell()
    self.file_handler.seek(0, 0)
    mzml_iter = iter(iterparse(self.file_handler, events=['end']))
    while True:
        event, element = next(mzml_iter)
        if event == 'end':
            if element.tag.endswith('}spectrum'):
                # match by the scan number embedded in the id attribute
                if int(regex_patterns.SPECTRUM_ID_PATTERN.search(
                        element.get('id')).group(1)) == identifier:
                    self.file_handler.seek(old_pos, 0)
                    return spec.Spectrum(element, measured_precision=5e-6)
            elif element.tag.endswith('}chromatogram'):
                # match by the full id string
                if element.get('id') == identifier:
                    self.file_handler.seek(old_pos, 0)
                    return spec.Chromatogram(
                        element, measured_precision=5e-6
                    )
def testCHelper(self):
    """Exercise the metadata.HELPER callbacks: tag extraction/splitting,
    attribute splitting, text extraction and parent-tag suppression.

    NOTE(review): `iterator.next()` at the end is Python 2 syntax.
    """
    log.setLevel(llC)
    _log.info("------- Helper test --------")
    a = "{kuku}lala"
    # GET_TAG strips the '{namespace}' prefix
    helper = metadata.HELPER[metadata.GET_TAG]
    self.assertEqual(helper(a), "lala")
    # SPLIT_TAG returns [namespace, localname]
    helper = metadata.HELPER[metadata.SPLIT_TAG]
    self.assertEqual(helper(a), ["kuku", "lala"])
    # attribute keys carry assorted (even hostile) namespace prefixes
    e = Element('test', {'{a}b':'c','{d12}d':'{e}[]','{d?"?E#}f':'{}g'})
    helper = metadata.HELPER[metadata.SPLIT_ATTRIBS]
    res = helper(e)
    for k, v in e.attrib.items():
        t = k.split("}")[1]
        self.assertEqual(v, res[t])
    e.text = " test\nlala "
    helper = metadata.HELPER[metadata.GET_TEXT]
    self.assertEqual(helper(e), " test\nlala ")
    # whitespace-only text collapses to the empty string
    e.text = " \n "
    self.assertEqual(helper(e), "")
    e.text = ""
    c = SubElement(e, metadata.TAG_NAME)
    c.text = "This text describes the parent"
    # TAG_SUPPRESS pairs the child's text with the parent's tag
    helper = metadata.HELPER[metadata.TAG_SUPPRESS]
    self.assertEqual(helper(c, e), ('test', 'This text describes the parent'))
    inputData = StringIO("<root><Welookfor><Name>data</Name></Welookfor></root>")
    events = ("start", "end")
    iterator = iterparse(inputData, events=events)
    for event, elem in iterator:
        if event == "end":
            if elem.tag == "Name":
                # the next event closes the parent <Welookfor> element
                self.assertEqual(helper(elem, iterator.next()[1]), ('Welookfor', 'data'))
def parseXML(self, file_xml, folder_conteudo, export_version):
    """Parse XML.

    Parse a MediaWiki XML export and create Plone documents from its
    pages inside *folder_conteudo*.

    https://github.com/zikzakmedia/python-mediawiki

    NOTE(review): Python 2 only (uses `unicode`); aq_inner, getToolByName
    and wiki2html are imported elsewhere in this module.
    """
    context = aq_inner(self.context)
    utils = getToolByName(context, 'plone_utils')
    # the dump namespace depends on the export version, e.g. '.../export-0.10/'
    NS = '{http://www.mediawiki.org/xml/export-' + export_version + '/}'
    conteudo = []
    with open(file_xml.name) as f:
        for event, elem in iterparse(f):
            if elem.tag == '{0}page'.format(NS):
                title = elem.find("{0}title".format(NS))
                contr = elem.find(".//{0}username".format(NS))
                text = elem.find(".//{0}text".format(NS))
                # only pages with title, contributor and body are imported
                if (title is not None) and (contr is not None) and (text is not None):
                    text = unicode(text.text).encode('utf-8')
                    text = wiki2html(text, True)
                    conteudo.append(dict(title=title.text, contr=contr.text, text=text))
                elem.clear()  # keep memory bounded while streaming
    self.createDocument(conteudo, folder_conteudo)
    msg = 'Procedimento executado.'
    utils.addPortalMessage(msg, type='info')
def show_all_event(filename='podcasts.opml'):
    """Event-based parsing demo: print every iterparse event of *filename*
    with indentation proportional to the element depth (PyMOTW example).

    :param filename: XML/OPML file to parse (new, backward-compatible
        parameter; defaults to the previously hard-coded 'podcasts.opml')
    """
    from xml.etree.ElementTree import iterparse
    depth = 0
    prefix_width = 8
    prefix_dots = '.' * prefix_width
    line_template = '{prefix:<0.{prefix_len}}{event:<8}{suffix:<{suffix_len}} {node.tag:<12} {node_id}'
    for (event, node) in iterparse(filename, ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            depth -= 1
        prefix_len = depth * 2
        # suffix pads the rest of the prefix column so tags line up; the
        # original passed `prefix_len - prefix_len` (always 0), a
        # transcription error for prefix_width - prefix_len
        print(line_template.format(prefix=prefix_dots,
                                   prefix_len=prefix_len,
                                   suffix='',
                                   suffix_len=(prefix_width - prefix_len),
                                   node=node,
                                   node_id=id(node),
                                   event=event))
        if event == 'start':
            depth += 1
def upgrade_nrml(directory, dry_run):
    """
    Upgrade all the NRML files contained in the given directory to the latest
    NRML version. Works by walking all subdirectories.
    WARNING: there is no downgrade!

    :param directory: root directory to walk
    :param dry_run: if true, only report what would be upgraded
    """
    for cwd, dirs, files in os.walk(directory):
        for f in files:
            path = os.path.join(cwd, f)
            if not f.endswith('.xml'):
                continue
            ip = iterparse(path, events=('start',))
            try:
                # both next() calls are inside the try: a non-XML file
                # previously crashed on the unguarded first next(ip)
                next(ip)                    # read node zero (the root)
                fulltag = next(ip)[1].tag   # tag of the first node
                xmlns, tag = fulltag.split('}')
            except Exception:  # not a NRML file
                xmlns, tag = '', ''
            if xmlns[1:] == NRML05:  # already upgraded
                pass
            elif 'nrml/0.4' in xmlns and (
                    'vulnerability' in tag or 'fragility' in tag or
                    'sourceModel' in tag):
                if not dry_run:
                    print('Upgrading', path)
                    try:
                        upgrade_file(path)
                    except Exception as exc:
                        # report the failure before propagating; the
                        # original raised first, making the print
                        # unreachable dead code
                        print(exc)
                        raise
                else:
                    print('Not upgrading', path)
            ip._file.close()
def read_xml_file(file_name, base_trace_num=0):
    """Read the xml file and return (root_element, out_dict), where
    out_dict maps each element to a (line_num, trace_id) tuple.

    trace_id is a dotted path of per-level counters (e.g. '1.2.1'),
    with the top-level counter starting at *base_trace_num*.

    Returns (None, {}) for a missing/empty file name.
    """
    if file_name is None or file_name == '':
        return (None, {})
    # keep a handle on the raw file so it can be closed; the original
    # opened it inline and leaked the descriptor. (`file` also shadowed
    # the Python 2 builtin of that name.)
    raw_file = open(file_name)
    try:
        source = FileWithLineNum(raw_file)
        out_dict = {}
        root_element = None
        trace_nums = [0] * 102  # Can't conceive of having > 100 levels
        trace_nums[0] = base_trace_num
        trace_idx = -1
        for event, element in iterparse(source, events=["start", "end"]):
            if root_element is None:
                root_element = element
            if event == "start":
                trace_idx += 1
                trace_nums[trace_idx] += 1
                trace_id = '.'.join(str(x) for x in trace_nums[:trace_idx + 1])
                out_dict[element] = (source.line_num, trace_id)
            else:
                trace_nums[trace_idx + 1] = 0  # Restart one level up
                trace_idx -= 1
    finally:
        raw_file.close()
    return (root_element, out_dict)
def parse_and_remove(self, filename, path):
    """Incrementally yield elements whose ancestor tag chain equals the
    '/'-separated *path*, removing each yielded element from its
    grandparent to bound memory (python-cookbook recipe, debug prints
    left in place).

    NOTE(review): since the root is skipped, elem_stack[-2] assumes the
    match sits at least two levels below the root — IndexError otherwise.
    """
    print('********')
    from xml.etree.ElementTree import iterparse
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # Skip the root element
    print(path_parts)
    next(doc)
    tag_stack = []    # tags of currently open elements (root excluded)
    elem_stack = []   # the corresponding element objects
    for event, elem in doc:
        print(event)
        print(elem)
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                # prune the consumed subtree from its grandparent
                elem_stack[-2].remove(elem)
            try:
                tag_stack.pop()
                elem_stack.pop()
            except IndexError as e:
                print(e)
                pass
def importXML(path):
    """Parse a MediaWiki-style XML dump.

    Returns (base_url, info) where info is a list of
    (page title, {lowercased token: count}, raw page text) tuples.
    Pages without a title or text body are skipped.
    """
    # Sniff the default namespace from the first line instead of
    # hard-coding it; the original left this file handle open.
    with open(path) as header_file:
        header = header_file.readline()
    start = header.find('xmlns=') + 7
    NS = "{%s}" % header[start: header.find('"', start)]
    allInfo = []   # concise per-page info
    myBase = ''    # the wiki's base URL
    # hoisted out of the loop: the original rebuilt the tokenizer for
    # every page (`\w+` drops punctuation)
    tokenizer = RegexpTokenizer(r'\w+')
    with open(path) as f:
        for event, elem in iterparse(f):
            if elem.tag == '{0}base'.format(NS):
                myBase = str(elem.text)
            if elem.tag == '{0}page'.format(NS):
                title = elem.find("{0}title".format(NS))
                content = elem.find(".//{0}text".format(NS))
                # guard both title and text: tokenize(None) and
                # title.text on None crashed the original
                if content is not None and content.text is not None and title is not None:
                    token_dic = {}
                    for eachword in tokenizer.tokenize(content.text):
                        word = eachword.lower()
                        # dict.get replaces the bare try/except counter
                        token_dic[word] = token_dic.get(word, 0) + 1
                    allInfo.append((title.text, token_dic, content.text))
                elem.clear()  # keep memory bounded while streaming
    return myBase, allInfo
def read_corpus(corpus_file_path, sections=None):
    """Yield (values, label) pairs for every <item> of a corpus file.

    *sections* lists the child tags whose text to collect (default
    ['text']); items where any section is missing/empty are skipped.
    The label comes from <rating> (0 if < 3 else 1), else from
    <polarity> ('N' -> 0, other -> 1), else -1 (unlabeled).
    """
    # None sentinel replaces the mutable default argument ['text']
    sections = ['text'] if sections is None else sections
    for event, elem in iterparse(corpus_file_path):
        if elem.tag != 'item':
            continue
        values = [elem.find(section).text for section in sections]
        if not all(values):
            continue
        rating_el = elem.find('rating')
        if rating_el is not None:
            rating = float(rating_el.text.strip())
            label = 0 if rating < 3 else 1
        else:
            polarity_el = elem.find('polarity')
            if polarity_el is None:
                label = -1
            elif polarity_el.text.strip() == 'N':
                label = 0
            else:
                label = 1
        yield values, label
def unpack( xml ):
    """Recreate the directory/file tree described by *xml* (Python 2).

    FOLDER/FILE/ROOT/NAME/SIZE and contentLength/content are module-level
    constants defined outside this chunk. Files are filled with repeated
    `content` blocks plus 'X' padding up to the declared SIZE; the
    working directory is changed as folders open and close.
    """
    for (event, elem) in iterparse(xml, ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            if elem.tag == FOLDER:
                # leaving a folder: pop back to its parent directory
                os.chdir(os.pardir)
        if event == 'start':
            print "working for ...", elem.attrib[NAME]
            if elem.tag == FILE:
                size = int(elem.attrib[SIZE])
                block = size / contentLength   # whole content blocks
                remdr = size % contentLength   # leftover bytes, padded 'X'
                file = open(elem.attrib[NAME], 'a')
                for blockIndex in range(0, block):
                    file.write(content)
                for remdrIndex in range(0, remdr):
                    file.write("X")
                file.close()
            if elem.tag == FOLDER:
                os.mkdir(elem.attrib[NAME])
                os.chdir(elem.attrib[NAME])
            if elem.tag == ROOT:
                # start from a clean output root
                shutil.rmtree(elem.attrib[NAME], ignore_errors=True)
                os.mkdir(elem.attrib[NAME])
                os.chdir(elem.attrib[NAME])
    return 0;
def scan_eix_xml(self, query, category=None):
    """Run ``eix --xml`` and yield one dict per Gentoo package.

    Each dict has 'package', 'category', 'versions' (a list of
    (cpv, slot, overlay, overlay_path) tuples) and, when present,
    'description'/'homepage'.

    NOTE(review): `parser.next()` is Python 2 syntax, and mutating
    os.environ here changes the whole process environment, not a copy.
    """
    cmd = ['eix', '--xml']
    env = os.environ
    env['XML_OVERLAY'] = 'true'
    if query:
        cmd.extend(['--exact', query])
    if category:
        cmd.extend(['-C', category])
    sub = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE)
    output = sub.stdout
    try:
        parser = iterparse(output, ["start", "end"])
        parser.next()  # read root tag just for testing output
    except ParseError:
        # eix produced no XML: report and bail out of the generator
        if query:
            msg = "Unknown package '%s'" % query
        else:
            msg = "No packages."
        self.logger.error(self.style.ERROR(msg))
        return
    package = {'versions': []}
    category = ""
    for event, elem in parser:
        if event == "start":  # on tag opening
            if elem.tag == "category":
                category = elem.attrib["name"]
            elif elem.tag == "package":
                package["package"] = elem.attrib["name"]
                package["category"] = category
            elif elem.tag in ["description", "homepage"]:
                package[elem.tag] = elem.text or ""
            elif elem.tag == "version":
                # append version data to versions
                cpv = "%s/%s-%s" % (
                    package["category"],
                    package["package"],
                    elem.attrib["id"]
                )
                slot = elem.attrib.get("slot", "0")
                overlay = elem.attrib.get("repository", "gentoo")
                overlay_path = elem.attrib.get("overlay", None)
                package["versions"].append(
                    (cpv, slot, overlay, overlay_path)
                )
        elif event == "end":  # on tag closing
            if elem.tag == "package":
                # clean old data
                yield package
                package = {"versions": []}
            if elem.tag == "category":
                # clean old data
                category = ""
        elem.clear()  # keep memory bounded while streaming
def get_concept_treatment_info(cid):
    """Return the diseases treated by the concept *cid* as a list of
    (disease name, disease concept id, flag) tuples, where flag is 1 for
    'treats' and 0.5 for 'may_treat' relations.

    NOTE(review): everything after the first `return diseases` is an
    unreachable alternative ElementTree implementation kept as dead code;
    also `flag`/`disease` may be referenced before assignment if the XML
    orders children unexpectedly — confirm the service's response shape.
    """
    response = get_data("get", cid)
    diseases = []
    # Beatiful Soup version
    soup = BeautifulSoup(response, "xml")
    for tag in soup.find_all('role'): #, recursive=False):
        for tag_inner in tag.children:
            #print tag_inner.name
            if tag_inner.name == "roleName":
                # relation strength: treats > may_treat > anything else
                word = tag_inner.string
                if word.startswith("may_treat"):
                    flag = 0.5
                elif word.startswith("treats"):
                    flag = 1
                else:
                    flag = 0
            elif tag_inner.name == "concept":
                for child in tag_inner.children:
                    #print child.name
                    if child.name == "conceptName":
                        disease = child.string
                    if child.name == "conceptKind":
                        # only DISEASE_KIND concepts count as treatments
                        if child.string.strip() != "DISEASE_KIND":
                            flag = 0
                    if child.name == "conceptNui":
                        disease_cid = child.string
                if flag > 0:
                    diseases.append((disease, disease_cid, flag))
    return diseases
    # ElemTree version (dead code: never reached, returns names only)
    context = iterparse(response, ["start", "end"])
    context = iter(context)
    event, root = context.next()
    NS = ""
    state_stack = [ root.tag ]
    flag = False
    for (event, elem) in context:
        if event == "start":
            state_stack.append(elem.tag)
            if elem.tag == NS+"role":
                flag = False
        elif event == "end":
            if elem.tag == NS+"roleName":
                if state_stack[-2] == NS+"role":
                    word = elem.text
                    if word.startswith("may_treat") or word.startswith("treats"):
                        flag = True
                    else:
                        flag = False
            if elem.tag == NS+"conceptName":
                if state_stack[-2] == NS+"concept" and state_stack[-3] == NS+"role":
                    if flag:
                        diseases.append(elem.text)
            elem.clear()
            state_stack.pop()
    root.clear()
    return diseases
def get_items(self):
    """Yield one dict per <item> element of ``self.input_file``.

    Each dict is the parsed post enriched with its 'comments' and
    'categories'; processed elements are cleared to bound memory.
    """
    self.input_file.seek(0)
    for _, node in iterparse(self.input_file):
        if node.tag != 'item':
            continue
        record = parse_post(node)
        record['comments'] = get_comments(node)
        record['categories'] = get_categories(node)
        yield record
        node.clear()
def sax_parse(self, filename):
    """Stream-parse an SVG file, flattening drawable elements into
    self.tree: one dict per shape carrying inherited attributes, the
    cumulative transform 'matrix', the tag 'name' and a path string 'd'.
    The root <svg> attributes are saved in self.root_values.
    """
    self.root_values = {}
    self.tree = []
    stack = []      # saved (values, matrix) per open element
    values = {}     # attribute/style values inherited down the tree
    matrix = None   # cumulative transform; None until a transform appears
    for event, elem in iterparse(filename, events=('start', 'end')):
        if event == 'start':
            # push the parent's state so 'end' can restore it
            stack.append((values, matrix))
            if matrix is not None:
                matrix = matrix.copy()  # copy of matrix
            current_values = values
            values = {}
            values.update(current_values)  # copy of dictionary
            attrs = elem.attrib
            values.update(attrs)
            # strip '{http://www.w3.org/2000/svg}' (28 chars); assumes the
            # file uses the standard SVG default namespace — TODO confirm
            name = elem.tag[28:]
            if "style" in attrs:
                # inline style declarations override plain attributes
                for equate in attrs["style"].split(";"):
                    equal_item = equate.split(":")
                    values[equal_item[0]] = equal_item[1]
            if "transform" in attrs:
                # compose this element's transform onto the inherited one
                transform_matrix = parse_transform(attrs["transform"])
                if matrix is None:
                    matrix = np.identity(3)
                matrix = transform_matrix.dot(matrix)
            if "svg" == name:
                # root element: remember its attributes, emit nothing
                current_values = values
                values = {}
                values.update(current_values)
                self.root_values = current_values
                continue
            elif "g" == name:
                # groups only contribute inherited values/transforms
                continue
            elif 'path' == name:
                values['d'] = path2pathd(values)
            elif 'circle' == name:
                values["d"] = ellipse2pathd(values)
            elif 'ellipse' == name:
                values["d"] = ellipse2pathd(values)
            elif 'line' == name:
                values["d"] = line2pathd(values)
            elif 'polyline' == name:
                values["d"] = polyline2pathd(values['points'])
            elif 'polygon' == name:
                values["d"] = polygon2pathd(values['points'])
            elif 'rect' == name:
                values["d"] = rect2pathd(values)
            else:
                # non-drawable element: record nothing
                continue
            values["matrix"] = matrix
            values["name"] = name
            self.tree.append(values)
        else:
            # 'end': restore the parent's inherited values and transform
            v = stack.pop()
            values = v[0]
            matrix = v[1]
def parse():
    """Stream dblp.xml, writing co-author name lists to CoAuthor.csv and
    appending Paper/AuthorOf/PublishAt records to module-level
    collections (papers, authorof, publishat, dict_platform,
    dict_researcher).

    NOTE(review): `parser`, `paper_tags`, the valid_*/strip_comma helpers
    and the record classes are all defined outside this chunk; the
    trailing f.close() is redundant inside the `with` block.
    """
    paper_id = 0
    with open("CoAuthor.csv", "w") as f:
        w = csv.writer(f)
        w.writerow(["names"])
        for event, elem in iterparse('dblp.xml', events=['start'], parser=parser):
            if elem.tag in paper_tags():
                title = ""
                year = -1
                platform = ""
                # print "****Paper****"
                # first acceptable title wins
                for t in elem.findall('title'):
                    if valid_title(t.text):
                        title = strip_comma(t.text)
                        break
                for y in elem.findall('year'):
                    if valid_text(y.text):
                        year = y.text
                        break
                # venue comes from journal, falling back to booktitle
                for p in elem.findall('journal') or elem.findall('booktitle'):
                    if valid_platform(p.text):
                        platform = strip_comma(p.text)
                        if platform not in dict_platform:
                            dict_platform.add(platform)
                        if valid_title(title):
                            publishat.append(PublishAt(paper_id, platform, "PublishAt"))
                            # publishes.append(PublishAt(platform, paper_id, "Publishes"))
                        break
                authors = []
                for a in elem.findall('author'):
                    if valid_name(a.text):
                        author = a.text
                        if author not in dict_researcher:
                            dict_researcher.add(author)
                        authors.append(author)
                        if valid_title(title):
                            authorof.append(AuthorOf(author, paper_id, "AuthorOf"))
                            # writtenby.append(WrittenBy(paper_id, author, "WrittenBy"))
                w.writerow(authors)
                if valid_title(title):
                    papers.append(Paper(paper_id, title, year, 0.0, "Paper"))
                    paper_id += 1
                # free memory from the processed record
                elem.clear()
        f.close()
def parse(self, xml_stream):
    """Drive iterparse over *xml_stream* and yield parsed instances.

    'start' events are forwarded to startElement(); every other event is
    handed to yieldInstances(), whose results are yielded one by one.
    A struct.error from a truncated/corrupt gzip stream is logged
    instead of propagated.
    """
    try:
        for ev, node in iterparse(xml_stream, self.events):
            if ev == 'start':
                self.startElement(node)
                continue
            for item in self.yieldInstances(node):
                yield item
    except struct.error:
        logger.exception('compressed gzip file is corrupt')
def readXlsx(file, **args):
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    """Read one worksheet of an .xlsx file.

    Recognised keyword arguments: ``sheet`` (1-based worksheet number,
    default 1) and ``header`` (key rows by the header row's names).
    Returns ``[header, rows]``.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args["sheet"] if "sheet" in args else 1
    isHeader = args["header"] if "header" in args else False

    z = zipfile.ZipFile(file)
    # shared-string pool: cells marked t="s" store an index into this list
    strings = [node.text
               for _, node in iterparse(z.open('xl/sharedStrings.xml'))
               if node.tag.endswith('}t')]

    header, rows, row = {}, [], {}
    value = ''
    for _, node in iterparse(z.open('xl/worksheets/sheet%d.xml' % (sheet))):
        if node.tag.endswith('}v'):         # <v>84</v>
            value = node.text
        if node.tag.endswith('}c'):         # <c r="A3" t="s"><v>84</v></c>
            if node.attrib.get('t') == 's':
                value = strings[int(value)]
            letter = node.attrib['r']       # e.g. AZ22
            while letter[-1].isdigit():
                letter = letter[:-1]
            if rows == []:
                # first sheet row: remember candidate column names
                header[letter] = value.strip()
            elif value != '':
                if isHeader == True and letter in header:
                    row[header[letter]] = value
                else:
                    row[letter] = value
            value = ''
        if node.tag.endswith('}row'):
            rows.append(row)
            row = {}
    z.close()
    return [header, rows]
#!/home/jinho93/miniconda3/bin/python
"""Mark converged force components of selective-dynamics atoms.

Combines the per-atom force report from the external `checkforce` script
with the selective-dynamics flags read from vasprun.xml, replacing force
components of frozen (F) coordinates with 'conv'.
"""
from xml.etree.ElementTree import iterparse
import sh
import numpy as np
import sys

out = str(sh.perl('/home/jinho93/bin/checkforce', '-v'))
# last four lines of checkforce output are summary text, not atom rows
arr = [r.split() for r in out.split('\n')[:-4]]

selective = []
for event, elem in iterparse('vasprun.xml', events=('end',)):
    if elem.attrib.get('name') == 'selective':
        for entry in elem:
            # one [x, y, z] bool triple per atom; was `r is 'T'`, an
            # identity comparison on strings (fragile, SyntaxWarning on
            # Python 3.8+) — equality is the correct test
            selective.append([flag == 'T' for flag in str(entry.text).split()])
        break

for flags, fields in zip(selective, arr):
    if fields:
        for axis in range(3):
            if not flags[axis]:
                # frozen coordinate: its force is irrelevant, mark it
                fields[3 + axis] = 'conv'
        print(' '.join(fields))
"LastJobStatus", "DiskUsage" # KB ] if len(sys.argv) != 3: print("Se requieren dos argumentos, el archivo que tiene el historico en XML y el archivo de salida en CSV") sys.exit(-1) filenameXML = sys.argv[1] if not os.path.isfile(filenameXML): print("No se encontro archivo [%s]"%(filenameXML)) sys.exit(-1) filenameCSV = sys.argv[2] doc=iterparse(filenameXML,('start','end')) csvfile = open(filenameCSV,"w") writer = csv.DictWriter(csvfile, fieldnames = tags) writer.writeheader() # evita el primer encabezado next(doc) # Totales numTasks = 0 totalBytesSent = 0 totalBytesRecv = 0 totalElapsedTime = 0 diskUsed = 0 # Parciales por tarea user="" completionDate = 0 jobCurrentStartDate = 0
# NOTE(review): this chunk reads like a REPL/scratch transcript. The first
# loop uses potholes_by_zip before it is (re)defined below — it only works
# if an earlier part of the file already created it.
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)

from collections import Counter

# count potholes per ZIP code via the streaming parse_and_remove helper
potholes_by_zip = Counter()
data = parse_and_remove('potholes.xml', 'row/row')
for pothole in data:
    potholes_by_zip[pothole.findtext('zip')] += 1
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)

# manual stepping through iterparse events (debug exploration)
data = iterparse('potholes.xml', ('start','end'))
next(data)
next(data)
next(data)
next(data)
next(data)
next(data)
next(data)
# NOTE(review): `elem_stack` and `elem` are not defined in this chunk —
# this line looks like a leftover from the recipe above and would raise
# NameError if executed here.
elem_stack[-2].remove(elem)
def convert_to_mm(self, inputfile,outputfile): """write output .mm file""" # Create the tree self.mm and add the map element self.mm = ET.Element("map",version="1.5.9") depth = 0 # Iterate through the opml file looking for 'outline' tags for (event, node) in iterparse(inputfile, ['start', 'end', 'start-ns', 'end-ns']): # end of outline tag encountered if event == 'end': if node.tag=='outline': # drop back a level depth -= 1 # start of outline tag encountered if event == 'start' and node.tag=='outline': #bump the depth depth += 1 # get the outline tags text # may be in the node.text field or the text attribute if node.text==None or node.text.strip()=='': try: nodetext=node.attrib['text'].strip() except: nodetext='' else: nodetext=node.text.strip() # log where we're at print depth*' ',depth,'Added',node.tag,'text => '+nodetext+'' # if at new level create a node element if depth > self.previous_level: self.nodetree.append("") attributes={} attributes['TEXT']=nodetext self.nodetree[depth] = ET.SubElement(self.nodetree[depth-1], "node",attrib=attributes) # if theres a note in the 'outline' tag ie attribute with tag '_node' # create the note element try: # obtain note node_note=node.attrib['_note'] # remove any non ascii characters to avoid unicode problems # node_note=self.removeNonAscii(node_note) except: # couldn't get a note for this node so set blank node_note='' #if we have a note then add the richcontent element Freemind and Freeplane expect if node_note<>'': try: # create richnote tag with note details embedded attributes={} attributes['TYPE']='DETAILS' note_element='<html><head></head><body>'+ \ node_note + \ '</body></html>' self.nodetree[depth] = ET.SubElement(self.nodetree[depth], "richcontent",attrib=attributes) # inserting the note into node tag # if note contains html and it is valid note is added as html # # however if ElementTree rejects note due to parsing errors # such as badly formed html then the exception below will be # triggered and note is added 
with raw 'escaped' text # for example <b> is <:b> self.nodetree[depth].append(ET.fromstring(note_element)) # log result print depth*' ','++ Added Note',node_note except: # ElementTree could not parse the opml note in the current outline tag # so no note is added # note data is invalid xml so add the note data as xml CDATA tag print '!!Warning: Invalid data. Note added as raw character data\nNote data=',node_note # unescape html characters to avoid clashes with Freemind/Freeplane parsers node_note=HTMLParser.unescape.__func__(HTMLParser, node_note) # escape utf-8 characters node_note=escape(node_note).encode('ascii', 'xmlcharrefreplace') # remove any non ASCII characters from note # node_note=self.removeNonAscii(node_note) # wrap escaped note in CDATA tag # note_element='<html><head></head><body>'+ \ # '<![CDATA['+ \ # node_note + \ # ']]>' + \ # '</body></html>' # note_element='<html><head></head><body>'+ \ node_note + \ '</body></html>' self.nodetree[depth].append(ET.fromstring(note_element)) else: # finished at current level so jump back a level self.previous_level = depth-1 if event == 'start' and node.tag=='title': # log title found print 'Added tag ',node.tag,'==>',node.text # add title tag as the first node self.nodetree[0]=ET.SubElement(self.mm, "node", attrib={'TEXT':node.text}) # get the output data tree = ET.ElementTree(self.mm) root=tree.getroot() outputdata=ET.tostring(root) # print outputdata # create the output .mm file f=open(outputfile,'w') f.write(self.removeNonAscii(outputdata)) f.close() return
def convert_xml2csv(self, csv_file, xmlfile): csvfile = open(csv_file, 'wb') spamwriter = csv.writer(csvfile, dialect='excel', delimiter=';') spamwriter.writerow([ 'TCID', 'CASE_NAME', 'IMPORTANCE', 'STATUS', 'SUMMARY', 'STEP', 'Result' ]) for (event, node) in iterparse(xmlfile, events=['end']): if node.tag == "testcase": case_list = ['', '', '', '', '', '', ''] steps_list = ['', '', '', '', '', '', ''] case_list[1] = node.attrib['name'] for child in node: if child.tag == "externalid": text = re.sub('\n|<p>|</p>|\t', '', str(child.text)) # print self.strip_tags(text) TCID = self.strip_tags(text) elif child.tag == "summary": text = re.sub('\n|<p>|</p>|\t', '', str(child.text)) # print self.strip_tags(text) case_list[4] = self.strip_tags(text) elif child.tag == "importance": # text = re.sub('\n|<p>|</p>|\t', '', str(child.text)) # print self.strip_tags(text) case_list[2] = importance_map[ int(self.strip_tags(child.text)) - 1] elif child.tag == "status": # text = re.sub('\n|<p>|</p>|\t', '', str(child.text)) # print self.strip_tags(text) case_list[3] = status_map[ int(self.strip_tags(child.text)) - 1] if "steps" not in [item.tag for item in node]: case_list[0] = TCID spamwriter.writerow(case_list) break elif child.tag == "steps": if len(child) > 0: for i in range(len(child)): if i == 0: for n in range(len( child.getchildren()[i])): if child.getchildren()[i].getchildren( )[n].text is not None: text = self.strip_tags( child.getchildren() [i].getchildren() [n].text).encode('UTF-8') else: text = '' # print text if child.getchildren()[i].getchildren( )[n].tag == 'actions': case_list[5] = text elif child.getchildren( )[i].getchildren( )[n].tag == 'expectedresults': case_list[6] = text case_list[0] = TCID spamwriter.writerow(case_list) else: for n in range(len( child.getchildren()[i])): if child.getchildren()[i].getchildren( )[n].text is not None: text = self.strip_tags( child.getchildren() [i].getchildren() [n].text).encode('UTF-8') else: text = '' # print text if 
child.getchildren()[i].getchildren( )[n].tag == 'actions': steps_list[5] = text elif child.getchildren( )[i].getchildren( )[n].tag == 'expectedresults': steps_list[6] = text steps_list[0] = TCID spamwriter.writerow(steps_list) csvfile.close()
def skip_exceptions(it): while True: try: yield next(it) except StopIteration: raise except Exception as e: logging.info( 'Skipping iteration because of exception {}'.format(e)) try: count = 0 for evt, elem in skip_exceptions( iterparse('pmc_result_sm.xml')): # , events=('start', 'end')): if elem.tag == 'article': try: output = extract_text2(elem) if (len(output[0]) < 50) | (len(output[1]) < 1): print("too short or abstract error") else: count += 1 print('Article found. Count = ' + str(count)) with open("text//title_to_text.txt", 'a+') as text_file: text_file.write(output[0].lower().lstrip() + '\n') with open("text//abstract_to_text.txt", 'a+') as text_file: text_file.write(output[1].lower() + '\n')
def generate_page(self, document="Q", page="2r.json"): """Generate a single display page.""" self.past_first_chapter_div = False self.past_first_ab = False self.column_number = 0 self.current_main_column = 'a' self.current_subcolumn = None self.subcolumn = False self.waiting_for_column = [] self.in_rubric = False self.app_pos = 0 self.app_open = False self.choice_pos = 0 self.choice_open = False self.choice_hovers = [] self.ex_open = False self.ex_text = [] self.am_open = False self.am_text = [] self.amex_pos = 0 self.abbr_open = False self.expan_open = False filename = os.path.join(self.page_location, document, page) with open(filename, encoding="utf-8") as file_p: data = json.load(file_p) self.column_structure = self.count_columns(data, document, page) if data['text']: cleaned = self.process_app(data['text'].replace('\n', ''), document, page) cleaned = self.process_choice(cleaned, document, page) else: cleaned = data['text'] datastream = io.StringIO(cleaned) # iterparse is not deprecated # https://github.com/PyCQA/pylint/issues/947 # pylint: disable=deprecated-method parser = iterparse(datastream, events=("start", "end")) output_text = [] for event, element in parser: try: new_text = getattr(self, "process_%s_%s" % (event, element.tag))(element) except AttributeError: if self.debug: print("Skipping %s." % element.tag) if element.text: self.update_text(output_text, element.text) #output_text += element.text new_text = None else: if new_text == NO_TAIL: pass elif new_text: self.update_text(output_text, new_text) #output_text += new_text if element.tail and event == 'end': if new_text != NO_TAIL: self.update_text(output_text, element.tail) #output_text += element.tail if self.expanded: data['html'] = ''.join(output_text) else: data['html_abbrev'] = ''.join(output_text) with open(filename, 'w', encoding="utf-8") as file_p: json.dump(data, file_p, ensure_ascii=False, indent=4)
# # Connect to DB # conn_string = 'host=localhost dbname=' + database + ' user='******' password='******' port=' + dbPort #print conn_string conn = psycopg2.connect(conn_string) cursor = conn.cursor() print "\n\nConnected to database", database, "on localhost" depth = 0 for (event, node) in iterparse(cloudhistoryxmlpath, ['start', 'end']): # # Start Event in iterparse <some tag> if event == 'end': print "\n End tag", node.tag, " Previous tag: ",previous_endtag if node.tag == "{http://ec2.amazonaws.com/doc/"+xmlschemaversion+"/}item" and previous_endtag == "{http://ec2.amazonaws.com/doc/"+xmlschemaversion+"/}iamInstanceProfile": print "\n\n\n\n END instance previous end event tag",previous_endtag print "All instance data",reservationId \ ,ownerId,groupId \ ,instanceId,imageId,name \ ,privateDnsName,dnsName \ ,keyName,amiLaunchIndex,instanceType \ ,launchTime, availabilityZone \ ,kernelId,ramdiskId \ ,privateIpAddress,ipAddress,groupName \ ,rootDeviceType,rootDeviceName,eucanodeip,virtualizationType
def extract_pages(f, filter_namespaces=False, filter_articles=None):
    """Extract pages from a MediaWiki database dump.

    Parameters
    ----------
    f : file
        File-like object.
    filter_namespaces : list of str or bool
        Namespaces that will be extracted.

    Yields
    ------
    tuple of (str or None, str, str)
        Title, text and page id.

    """
    elem_stream = (node for _, node in iterparse(f, events=("end", )))

    # The namespace of a dump shifts whenever the format is tweaked, so it
    # cannot be hard-coded.  Derive it from the first element seen (part of
    # the metadata) and build all element paths from it.
    first = next(elem_stream)
    namespace = get_namespace(first.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    pageid_path = "./{%(ns)s}id" % ns_mapping

    for node in elem_stream:
        if node.tag != page_tag:
            continue

        title = node.find(title_path).text
        text = node.find(text_path).text

        # Drop the text of pages outside the requested namespaces.
        if filter_namespaces and node.find(ns_path).text not in filter_namespaces:
            text = None

        # Give a caller-supplied predicate a chance to reject the article.
        if filter_articles is not None and not filter_articles(
                node, namespace=namespace, title=title,
                text=text, page_tag=page_tag,
                text_path=text_path, title_path=title_path,
                ns_path=ns_path, pageid_path=pageid_path):
            text = None

        pageid = node.find(pageid_path).text
        yield title, text or "", pageid  # filtered/empty page yields ""

        # Prune the element tree, as per
        # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
        # except that backlinks need no pruning from the parent because
        # this is ElementTree, not LXML.  Only <page> elements are pruned
        # (we must inspect ./revision/text first), but they comprise the
        # bulk of the file, so in practice enough is reclaimed.
        node.clear()
#!/usr/bin/env python """ Code for etsy job application. Requires python 2.5 (or ElementTree http://effbot.org/zone/element-index.htm) Simply execute this file to count the number of users in each city and print out a running total. Bill Mill 12/13/07 http://billmill.org """ from urllib import urlopen from xml.etree.ElementTree import iterparse ids = [42346, 77290, 729] prefix = "http://api.etsy.com/feeds/xml_user_details.php?id=" docs = [urlopen(prefix + str(id)) for id in ids] totals = {} for doc in docs: for event, elem in iterparse(doc): if elem.tag == "city": totals[elem.text] = totals.get(elem.text, 0) + 1 print totals
def parse(self): # get an iterable context = iterparse(self.file_name, ["start", "end"]) # turn it into an iterator context = iter(context) # get the root element event, root = context.next() state_stack = [root.tag] drug_id = None drug_type = None drug_id_partner = None current_target = None resource = None current_property = None target_types = set( map(lambda x: self.NS + x, ["target", "enzyme", "carrier", "transporter"])) target_types_plural = set(map(lambda x: x + "s", target_types)) for (event, elem) in context: if event == "start": state_stack.append(elem.tag) if len(state_stack) <= 2 and elem.tag == self.NS + "drug": if "type" in elem.attrib: drug_type = elem.attrib["type"] else: drug_type = None elif elem.tag == self.NS + "drugbank-id": if "primary" in elem.attrib and state_stack[ -3] == self.NS + "drugbank" and state_stack[ -2] == self.NS + "drug": drug_id = None elif len(state_stack) > 3 and state_stack[ -3] == self.NS + "drug-interactions" and state_stack[ -2] == self.NS + "drug-interaction": drug_id_partner = None elif elem.tag == self.NS + "resource": resource = None elif elem.tag == self.NS + "property": current_property = None elif elem.tag in target_types: if state_stack[-2] in target_types_plural: current_target = None if event == "end": if len(state_stack) <= 2 and elem.tag == self.NS + "drug": if "type" in elem.attrib: drug_type = elem.attrib["type"] else: drug_type = None if elem.tag == self.NS + "drugbank-id": if state_stack[-2] == self.NS + "drug": if "primary" in elem.attrib: drug_id = elem.text self.drugs.add(drug_id.upper()) if drug_type is not None: self.drug_to_type[drug_id] = drug_type #print drug_id, drug_type elif len(state_stack) > 3 and state_stack[ -3] == self.NS + "drug-interactions" and state_stack[ -2] == self.NS + "drug-interaction": self.drug_to_interactions.setdefault(drug_id, {}) drug_id_partner = elem.text if drug_id_partner not in self.drug_to_interactions[ drug_id]: self.drug_to_interactions[drug_id][ drug_id_partner] = 
[] elif elem.tag == self.NS + "name": if len(state_stack ) <= 3 and state_stack[-2] == self.NS + "drug": self.drug_to_name[drug_id] = elem.text.strip() elif state_stack[-2] == self.NS + "product" and state_stack[ -3] == self.NS + "products": product = elem.text product = product.strip().encode('ascii', 'ignore') if product != "": self.drug_to_products.setdefault( drug_id, set()).add(product) elif state_stack[ -2] == self.NS + "international-brand" and state_stack[ -3] == self.NS + "international-brands": brand = elem.text #idx = brand.find(" [") #if idx != -1: # brand = brand[:idx] brand = brand.strip().encode('ascii', 'ignore') if brand != "": self.drug_to_brands.setdefault(drug_id, set()).add(brand) #elif state_stack[-3] == self.NS+"targets" and state_stack[-2] == self.NS+"target": elif state_stack[-2] == self.NS + "mixture" and state_stack[ -3] == self.NS + "mixtures": mixture = elem.text mixture = mixture.strip().encode('ascii', 'ignore') if mixture != "": self.drug_to_mixtures.setdefault( drug_id, set()).add(mixture) elif state_stack[-3] in target_types_plural and state_stack[ -2] in target_types: self.target_to_name[current_target] = elem.text elif elem.tag == self.NS + "ingredients": if state_stack[-3] == self.NS + "mixtures" and state_stack[ -2] == self.NS + "mixture": ingredients = elem.text ingredients = ingredients.strip().encode( 'ascii', 'ignore') if ingredients != "" and mixture != "": self.mixture_to_ingredients[mixture] = ingredients elif elem.tag == self.NS + "description": if state_stack[-2] == self.NS + "drug": self.drug_to_description[drug_id] = elem.text if len(state_stack) > 3 and state_stack[ -3] == self.NS + "drug-interactions" and state_stack[ -2] == self.NS + "drug-interaction": self.drug_to_interactions[drug_id][ drug_id_partner].append(elem.text) elif elem.tag == self.NS + "group": if state_stack[-2] == self.NS + "groups": self.drug_to_groups.setdefault(drug_id, set()).add(elem.text) elif elem.tag == self.NS + "indication": if 
state_stack[-2] == self.NS + "drug": self.drug_to_indication.setdefault(drug_id, []) self.drug_to_indication[drug_id].append(elem.text) elif elem.tag == self.NS + "pharmacodynamics": if state_stack[-2] == self.NS + "drug": self.drug_to_pharmacodynamics[drug_id] = elem.text elif elem.tag == self.NS + "mechanism-of-action": if state_stack[-2] == self.NS + "drug": self.drug_to_moa[drug_id] = elem.text elif elem.tag == self.NS + "toxicity": if state_stack[-2] == self.NS + "drug": self.drug_to_toxicity[drug_id] = elem.text elif elem.tag == self.NS + "synonym": if state_stack[-2] == self.NS + "synonyms" and state_stack[ -3] == self.NS + "drug": synonym = elem.text idx = synonym.find(" [") if idx != -1: synonym = synonym[:idx] synonym = synonym.strip().encode('ascii', 'ignore') if synonym != "": self.drug_to_synonyms.setdefault( drug_id, set()).add(synonym) elif elem.tag == self.NS + "category": if state_stack[-2] == self.NS + "categories": self.drug_to_categories.setdefault(drug_id, set()).add( elem.text) elif elem.tag == self.NS + "atc-code": if state_stack[-2] == self.NS + "atc-codes": self.drug_to_atc_codes.setdefault(drug_id, set()).add( elem.attrib["code"]) elif elem.tag == self.NS + "id": if state_stack[-3] in target_types_plural and state_stack[ -2] in target_types: current_target = elem.text self.drug_to_target_to_values.setdefault(drug_id, {}) self.drug_to_target_to_values[drug_id][ current_target] = [state_stack[-2], False, []] #print current_target elif elem.tag == self.NS + "action": if state_stack[-3] in target_types and state_stack[ -2] == self.NS + "actions": self.drug_to_target_to_values[drug_id][current_target][ 2].append(elem.text) elif elem.tag == self.NS + "known-action": if state_stack[-2] in target_types: if elem.text == "yes": self.drug_to_target_to_values[drug_id][ current_target][1] = True if len(self.drug_to_target_to_values[drug_id] [current_target][2]) == 0: #print "Inconsistency with target action: {} {}".format(drug_id, current_target) pass 
elif elem.tag == self.NS + "gene-name": if state_stack[-3] in target_types and state_stack[ -2] == self.NS + "polypeptide": self.target_to_gene[current_target] = elem.text elif elem.tag == self.NS + "kind": if state_stack[ -3] == self.NS + "calculated-properties" and state_stack[ -2] == self.NS + "property": current_property = elem.text # InChIKey or SMILES elif elem.tag == self.NS + "value": if state_stack[ -3] == self.NS + "calculated-properties" and state_stack[ -2] == self.NS + "property": if current_property == "InChIKey": inchi_key = elem.text # strip InChIKey= if inchi_key.startswith("InChIKey="): inchi_key = inchi_key[len("InChIKey="):] self.drug_to_inchi_key[drug_id] = inchi_key if current_property == "SMILES": self.drug_to_smiles[drug_id] = elem.text elif elem.tag == self.NS + "resource": if state_stack[ -3] == self.NS + "external-identifiers" and state_stack[ -2] == self.NS + "external-identifier": resource = elem.text elif elem.tag == self.NS + "identifier": if state_stack[ -3] == self.NS + "external-identifiers" and state_stack[ -2] == self.NS + "external-identifier": if state_stack[-5] in target_types and state_stack[ -4] == self.NS + "polypeptide": if resource == "UniProtKB": self.target_to_uniprot[ current_target] = elem.text if resource == "UniProt Accession": self.target_to_uniprotentry[ current_target] = elem.text elif state_stack[-4] == self.NS + "drug": if resource == "PubChem Compound": self.drug_to_pubchem[drug_id] = elem.text elif resource == "PubChem Substance": self.drug_to_pubchem_substance[ drug_id] = elem.text elif resource == "ChEBI": self.drug_to_chebi[drug_id] = elem.text elif resource == "ChEMBL": self.drug_to_chembl[drug_id] = elem.text elif resource == "KEGG Drug": self.drug_to_kegg[drug_id] = elem.text elif resource == "KEGG Compound": self.drug_to_kegg_compound[drug_id] = elem.text elif resource == "UniProtKB": self.drug_to_uniprot[drug_id] = elem.text elif resource == "PharmGKB": self.drug_to_pharmgkb[drug_id] = elem.text 
elem.clear() state_stack.pop() root.clear() return
#! /usr/bin/env/python
# -*- coding:utf-8 -*-
"""Stream podcasts.opml and emit one CSV line per podcast feed on stdout."""
from xml.etree.ElementTree import iterparse
import csv
import sys

writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)
group_name = ''
for event, node in iterparse('podcasts.opml', events=['start']):
    # Only <outline> elements carry group / podcast information.
    if node.tag == 'outline':
        feed_url = node.attrib.get('xmlUrl')
        if feed_url:
            # Podcast entry: group, title, feed URL, site URL, blank column.
            writer.writerow((group_name,
                             node.attrib['text'],
                             feed_url,
                             node.attrib.get('htmlUrl'),
                             ''))
        else:
            # Group heading: remember it for the entries that follow.
            group_name = node.attrib['text']
def __getitem__(self, identifier): """ Access the item with id 'identifier'. Either use linear, binary or interpolated search. Arguments: identifier (str): native id of the item to access Returns: data (str): text associated with the given identifier """ ############################################################################# # DOES NOT HOLD IF NUMBERS DONT START WITH ONE AND/OR DONT INCREASE BY ONE # # TODO FIXME # ############################################################################# self.file_handler.seek(0) spectrum = None if str(identifier).upper() == 'TIC': # print(str(identifier).upper()) found = False mzmliter = iter(iterparse(self.file_handler, events=['end'])) while found is False: event, element = next(mzmliter, ('STOP', 'STOP')) if event == 'end': if element.tag.endswith('}chromatogram'): if element.get('id') == 'TIC': found = True spectrum = spec.Chromatogram( element, measured_precision = 5e-6 ) elif event == 'STOP': raise StopIteration elif identifier in self.offset_dict: start = self.offset_dict[identifier] with open(self.path, 'rb') as seeker: seeker.seek(start[0]) start, end = self._read_to_spec_end(seeker) self.file_handler.seek(start, 0) data = self.file_handler.read(end - start) if data.startswith('<spectrum'): spectrum = spec.Spectrum( XML(data), measured_precision = 5e-6 ) elif data.startswith('<chromatogram'): spectrum = spec.Chromatogram( XML(data) ) elif type(identifier) == str: return self._search_string_identifier( identifier ) else: spectrum = self._interpol_search(identifier) return spectrum
def parse_monsters(file, out): m = None parent_tag = '' monster_type = ['', '', ''] monster_passive_perception = 0 monster_entity_name = '' monster_entity_text = [] for event, elem in iterparse(file, ('start', 'end')): tag = elem.tag value = elem.text if elem.text is not None else '' if event == 'start': if tag == 'monster': parent_tag = tag m = make_monster() elif tag == 'trait' or tag == 'action' or tag == 'reaction' or tag == 'legendary': parent_tag = tag monster_entity_text = [] else: if tag == 'monster': m['Type'] = monster_type[SIZE] + ' ' + monster_type[ TYPE] + ', ' + monster_type[ALIGNMENT] m['Senses'].append('passive Perception {0}'.format( monster_passive_perception)) out.append(m) elif tag == 'name': if parent_tag == 'monster': m['Name'] = value else: monster_entity_name = value elif tag == 'size': monster_type[SIZE] = size_convert[value] elif tag == 'type': monster_type[TYPE], m['Source'] = value.rsplit(', ', 1) elif tag == 'alignment': monster_type[ALIGNMENT] = value elif tag == 'ac': m['AC'] = parse_value_notes(value) elif tag == 'hp': m['HP'] = parse_value_notes(value) elif tag == 'speed': m['Speed'] = parse_array(value) elif tag == 'str': m['Abilities']['Str'] = int(value) elif tag == 'dex': m['Abilities']['Dex'] = int(value) elif tag == 'con': m['Abilities']['Con'] = int(value) elif tag == 'int': m['Abilities']['Int'] = int(value) elif tag == 'wis': m['Abilities']['Wis'] = int(value) elif tag == 'cha': m['Abilities']['Cha'] = int(value) elif tag == 'save': m['Saves'] = parse_name_modifier(value) elif tag == 'skill': m['Skills'] = parse_name_modifier(value) elif tag == 'resist': m['DamageResistances'] = parse_array(value, True) elif tag == 'vulnerable': m['DamageVulnerabilities'] = parse_array(value) elif tag == 'immune': m['DamageImmunities'] = parse_array(value, True) elif tag == 'conditionImmune': m['ConditionImmunities'] = parse_array(value) elif tag == 'senses': m['Senses'] = parse_array(value) elif tag == 'passive': monster_passive_perception 
= int(value) elif tag == 'languages': m['Languages'] = parse_array(value) elif tag == 'cr': m['Challenge'] = value elif tag == 'trait': parent_tag = 'monster' m['Traits'].append({ 'Name': monster_entity_name, 'Content': '<br />'.join(monster_entity_text), 'Usage': '' }) elif tag == 'action': parent_tag = 'monster' m['Actions'].append({ 'Name': monster_entity_name, 'Content': '<br />'.join(monster_entity_text), 'Usage': '' }) elif tag == 'reaction': parent_tag = 'monster' m['Reactions'].append({ 'Name': monster_entity_name, 'Content': '<br />'.join(monster_entity_text), 'Usage': '' }) elif tag == 'legendary': parent_tag = 'monster' m['LegendaryActions'].append({ 'Name': monster_entity_name, 'Content': '<br />'.join(monster_entity_text), 'Usage': '' }) elif tag == 'text': monster_entity_text.append(value) elem.clear()
return 0 # # is Image In DB def ends # # Connect to DB # conn_string = 'host=localhost dbname=' + database + ' user='******' password='******' port=' + dbPort conn = psycopg2.connect(conn_string) cursor = conn.cursor() # iterparse using default end event since end,start caused None child elements context = iterparse(cloudhistoryxmlpath, events=("start", "end")) root = None #for (event, node) in iterparse(cloudhistoryxmlpath, ['start', 'end']): for event, node in context: if event == "start" and root is None: root = node # the first element is root if event == 'end': #print "\n End tag", node.tag if node.tag == "{http://ec2.amazonaws.com/doc/"+xmlschemaversion+"/}item": if imageNotAlreadyInDb(imageId): insertToDb(sampledatetime,imageId \ ,imageLocation,imageState \ ,imageOwnerId,isPublic,architecture \ ,platform,imageType,name,description \ ,rootDeviceType,rootDeviceName,virtualizationType)
def parse(self, filepath, bin_size=1000000, resume=False): parser = iterparse(filepath, events=('end', 'start')) evt, root = next(parser) types: Tuple[str] = ('entered link', 'left link', 'PersonEntersVehicle', 'PersonLeavesVehicle') links: Dict[str:int] = {} leg_evts: List[Tuple[int, str, int, int]] = list() veh_evts: List[Tuple[int, int, int, int]] = list() leg_id: int = 0 veh_id: int = 0 time: int = 0 bin_count: int = 0 total_count: int = 0 pr.print('Fetching network link data.', time=True) links = dict(self.database.fetch_network()) pr.print('Network link data fetch completed.', time=True) if resume: pr.print('Finding where we left off parsing last.', time=True) leg_id = self.database.get_leg_count() veh_id = self.database.get_veh_count() offset = leg_id + veh_id pr.print(f'Skipping to event {offset} of XML file.', ) else: pr.print('Resuming XML leg/vehicle event parsing.', time=True) pr.print(f'Event Parsing Progress', progress=0, persist=True, replace=True, frmt='bold') for evt, elem in parser: if elem.tag == 'event' and evt == 'end': etype = elem.attrib['type'] if resume and etype in types: bin_count += 1 total_count += 1 if bin_count >= bin_size: time = int(float(elem.attrib['time'])) root.clear() bin_count = 0 pr.print(f'Skipped to event {total_count}.') pr.print(f'Event Parsing Progress', progress=time / 86400, persist=True, replace=True, frmt='bold') if total_count == offset: time = int(float(elem.attrib['time'])) root.clear() bin_count = 0 resume = False pr.print(f'Skipped to event {total_count}.', time=True) pr.print('Event skipping complete.', time=True) pr.print('Resuming XML leg/vehicle event parsing.', time=True) pr.print(f'Event Parsing Progress', progress=time / 86400, persist=True, replace=True, frmt='bold') continue if etype == 'entered link': time = int(float(elem.attrib['time'])) leg_evts.append((leg_id, int(elem.attrib['vehicle']), None, links[elem.attrib['link']], time, 1)) bin_count += 1 leg_id += 1 elif etype == 'left link': time = 
int(float(elem.attrib['time'])) leg_evts.append((leg_id, int(elem.attrib['vehicle']), None, links[elem.attrib['link']], time, 0)) bin_count += 1 leg_id += 1 elif etype == 'PersonEntersVehicle': time = int(float(elem.attrib['time'])) veh_evts.append((veh_id, int(elem.attrib['vehicle']), int(elem.attrib['person']), time, 1)) bin_count += 1 veh_id += 1 elif etype == 'PersonLeavesVehicle': time = int(float(elem.attrib['time'])) veh_evts.append((veh_id, int(elem.attrib['vehicle']), int(elem.attrib['person']), time, 0)) bin_count += 1 veh_id += 1 if bin_count >= bin_size: total_count += bin_size pr.print(f'Pushing {bin_count} events to SQL database.', time=True) self.database.write_leg_evts(leg_evts) self.database.write_veh_evts(veh_evts) root.clear() leg_evts = [] veh_evts = [] bin_count = 0 pr.print(f'Resuming XML leg/vehicle event parsing.', time=True) pr.print(f'Event Parsing Progress', progress=time / 86400, persist=True, replace=True, frmt='bold') total_count += bin_size pr.print(f'Pushing {bin_count} events to SQL database.', time=True) self.database.write_leg_evts(leg_evts) self.database.write_veh_evts(veh_evts) pr.print(f'Event Parsing Progress', progress=1, persist=True, replace=True, frmt='bold') pr.push() pr.print('XML leg/vehicle event parsing complete.', time=True) pr.print(f'A total of {total_count} events were parsed.', time=True)
def extract_definitions():
    """Parse gcide_entries.xml and return a dict mapping each entry key to a
    list of its (ASCII-cleaned) definitions."""
    # input_file = '/Users/vasanthi/Desktop/THESIS/COLL_COMPUTE/COLO2/SCRIPTS/xml_files/gcide_entries.xml'
    input_file = cache_abs_path('gcide_entries.xml')
    #tree = et.parse(input_file)
    #root = tree.getroot()
    """
    # To print the root node and it's attributes; as well as child noes and their attributes
    print "root:\n\ttag: %s\n\tattributes: %s" %(root.tag, root.attrib)
    print "\nChild tags:"
    for child in root:
        print "\n\ttag: %s\n\tattributes: %s" %(child.tag, child.attrib)
    print '______________\n'
    """
    webster_dictionary = defaultdict(
        list
    )  # A dictionary to store the list of definitions of each of the word after they are returned.
    key_list = []
    definitions_list = []
    #for node in tree.iter():
    #for event, node in iterparse(input_file):
    # get an iterable
    #context = iterparse(input_file, events=("start", "end"))
    # turn it into an iterator
    #context = iter(context)
    # get the root element
    #event, root = context.next()
    #for event, node in context:
    # Stream the file; with no events argument, iterparse fires on 'end',
    # so each <entry> is fully built when seen here.
    for event, node in iterparse(input_file):
        #if (event == 'end' and node.tag == 'entry'):
        if (node.tag == 'entry'):
            key = node.attrib.get('key')
            #print key
            key_list.append(key)
            # Each <def> descendant holds one (possibly multi-part) definition.
            for child in node.iter():
                if (child.tag == 'def'):
                    # Concatenate all text fragments inside the <def> element.
                    definition = (''.join(itertext(child)))
                    #print definition
                    definitions_list.append(child.text)
                    #definitions = child.text
                    # NOTE(review): ''.join() never returns None, so this
                    # guard is always true -- presumably a leftover from the
                    # child.text version above.
                    if not (definition == None):
                        definition = definition.replace("; as", " as")
                        definition = definition.replace("; --", " ").strip('\n')
                        list_of_definitions = definition.split(
                            ';'
                        )  # As definitions are separated by a semi-colon in the gcide_entries.xml file
                        for each_definition in list_of_definitions:
                            # Non-Ascii characters are present in these definitions which are breaking the execution
                            # Deleting all of those non-ascii characters
                            ascii_text = ''
                            ascii_text = ascii_text + ''.join(
                                i for i in each_definition if ord(i) < 128)
                            ascii_text = ascii_text.rstrip(' ').rstrip('\n')
                            webster_dictionary[key].append(
                                ascii_text
                            )  # Each of the definition is separately appended to the dictionary
        # Free the element's children to keep memory bounded while streaming.
        node.clear()
    #root.clear()
    #print(len(key_list), len(definitions_list), len(webster_dictionary))
    return webster_dictionary
from xml.etree.ElementTree import iterparse

# Print one line per iterparse event, indented by element depth, showing
# the event name, the element's tag, and the element object's id.
depth = 0
prefix_width = 8
prefix_dots = '.' * prefix_width
# Template: a run of dots trimmed to the current depth, the event name,
# padding out to prefix_width, then the tag and the element's id().
line_template = ''.join([
    '{prefix:<0.{prefix_len}}',
    '{event:<8}',
    '{suffix:<{suffix_len}} ',
    '{node.tag:<12} ',
    '{node_id}',
])

# Request all four event types, including namespace scope events.
EVENT_NAMES = ['start', 'end', 'start-ns', 'end-ns']

for (event, node) in iterparse('podcasts.opml', EVENT_NAMES):
    if event == 'end':
        depth -= 1
    # Two dot columns per nesting level.
    prefix_len = depth * 2
    # NOTE(review): depth is only ever decremented here -- the matching
    # "if event == 'start': depth += 1" from the published PyMOTW example
    # appears to be missing from this chunk; confirm against the full file.
    print(line_template.format(
        prefix=prefix_dots,
        prefix_len=prefix_len,
        suffix='',
        suffix_len=(prefix_width - prefix_len),
        node=node,
        node_id=id(node),
    ))
class XMLNamespaces:
    """Expand {prefix} placeholders in element paths to ElementTree's
    {uri} qualified-name form."""

    def __init__(self, **kwargs):
        self.namespaces = {}
        for prefix, uri in kwargs.items():
            self.register(prefix, uri)

    def register(self, name, uri):
        # Store the uri already wrapped in braces, the way ElementTree
        # spells qualified tag names.
        self.namespaces[name] = '{' + uri + '}'

    def __call__(self, path):
        # format_map substitutes every {prefix} with its '{uri}' value.
        return path.format_map(self.namespaces)


document = parse('data/sample.xml')
ns = XMLNamespaces(html='http://www.w3.org/1999/xhtml')

element = document.find(ns('content/{html}html'))
print(element)

title_text = document.findtext(
    ns('content/{html}html/{html}head/{html}title'))
print(title_text)
print()

# Namespace scope is easier to observe with iterparse(), which also
# reports start-ns / end-ns events.
for event, node in iterparse('data/sample.xml', ('end', 'start-ns', 'end-ns')):
    print(event, node)
def extractArticles(filename, collection, articlesNeeded=float('inf')):
    """Stream a Wikipedia XML dump and insert new article documents into a
    MongoDB collection.

    A page is stored only when it has both a title and text; pages are
    skipped when the title has a namespace prefix ("Foo:"), the text is a
    #REDIRECT, or the page id is not newer than the newest _id already in
    the collection.  Returns a stats dict with numStored / numSkipped.
    Stops early once articlesNeeded articles have been stored.
    """
    # Initialize variables.
    currentState = NOPAGE
    articleDict = {}
    skipArticle = False
    stats = {"numStored": 0, "numSkipped": 0}
    # Newest _id already stored; anything at or below it was seen before.
    lastId = collection.find().sort([("_id", pymongo.DESCENDING)
                                     ]).limit(1)[0]["_id"]

    # Loop through every tag in the document.
    doc = iter(iterparse(filename, ('start', 'end')))
    # Keep the root element so parsed children can be freed via root.clear().
    _, root = doc.next()
    for event, elem in doc:
        if event == 'start':
            extractedTag = extractTag(elem.tag)
            # Tags informing state.
            if currentState == NOPAGE and extractedTag == "page":
                currentState += 1
            if currentState == INPAGE and extractedTag == "revision":
                currentState += 1
        elif event == 'end':
            # Parse XML end events
            extractedTag = extractTag(elem.tag)
            # Tags informing state.
            if extractedTag == "page":
                # Update stats and potentially save article.
                if skipArticle or "text" not in articleDict or "title" not in articleDict:
                    stats["numSkipped"] += 1
                else:
                    collection.insert_one(articleDict)
                    stats["numStored"] += 1
                    # Early exit once the caller has enough articles.
                    if stats["numStored"] >= articlesNeeded:
                        return stats

                # Report progress.
                if not skipArticle and stats["numStored"] % 1000 == 0:
                    print "Stored {} articles so far...".format(
                        stats["numStored"])
                    sys.stdout.flush()
                elif skipArticle and stats["numSkipped"] % 1000 == 0:
                    print "Skipped {} articles so far...".format(
                        stats["numSkipped"])
                    sys.stdout.flush()

                # Page ended, reset state.
                currentState = NOPAGE
                articleDict.clear()
                skipArticle = False

                # Clean memory.
                root.clear()
            elif extractedTag == "revision":
                currentState -= 1

            # Skip further processing if skipping article.
            if skipArticle:
                continue

            # Tags producing information.
            if extractedTag == "title":
                articleDict["title"] = elem.text
                # Namespace-prefixed titles ("Talk:", "File:", ...) are not
                # plain articles, so skip them.
                if not articleDict["title"] or re.match(
                        "[^ ]*:", articleDict["title"]):
                    skipArticle = True
            elif currentState == INPAGE and extractedTag == "id":
                # Page-level <id> only (currentState filters out revision ids).
                articleDict["_id"] = long(elem.text)
                # Already stored by a previous run -- skip it.
                if articleDict["_id"] <= lastId:
                    skipArticle = True
            elif extractedTag == "timestamp":
                articleDict["timestamp"] = elem.text
            elif extractedTag == "text":
                articleDict["text"] = elem.text
                # Skip empty pages and redirect stubs.
                if not articleDict["text"] or articleDict["text"].startswith(
                        "#REDIRECT"):
                    skipArticle = True

    # Ran out of articles, return some stats
    return stats
def run(self, config):
    """Parse a road-network XML file (optionally gzipped) and load its
    nodes and links into the database in batches.

    Args:
        config: mapping whose 'run' section provides 'force',
            'network_file', 'bin_size' and 'create_idxs' entries.
    """
    # NOTE(review): 'Prallocating' typo is in a runtime log string, kept as-is.
    pr.print('Prallocating process files and tables.', time=True)
    force = config['run']['force']
    self.create_tables('links', 'nodes', force=force)

    pr.print(f'Loading process metadata and resources.', time=True)
    network_path = config['run']['network_file']
    bin_size = config['run']['bin_size']

    # Transparently handle gzip-compressed network files.
    if network_path.split('.')[-1] == 'gz':
        network_file = gzip.open(network_path, mode='rb')
    else:
        network_file = open(network_path, mode='rb')

    # Keep a handle on the document root so parsed subtrees can be
    # released with root.clear() after each flushed batch.
    parser = iter(iterparse(network_file, events=('start', 'end')))
    evt, root = next(parser)

    links = []
    nodes = []
    count = 0  # records accumulated since the last flush

    for evt, elem in parser:
        if evt == 'start':
            if elem.tag == 'nodes':
                pr.print('Starting road node parsing.', time=True)
            elif elem.tag == 'links':
                # The <links> section begins: flush whatever nodes are
                # still buffered and reset the counter for links.
                pr.print(
                    f'Pushing {count % bin_size} nodes to the '
                    'database.', time=True)
                self.database.write_nodes(nodes)
                nodes = []
                root.clear()
                count = 0
                pr.print('Starting road link parsing.', time=True)
        elif evt == 'end':
            if elem.tag == 'node':
                # Node id plus a WKT POINT built from the x/y attributes.
                nodes.append((str(elem.get('id')),
                    f'POINT({elem.get("x")} {elem.get("y")})'))
                count += 1
                if count % bin_size == 0:
                    pr.print(
                        f'Pushing {bin_size} nodes to '
                        'the database.', time=True)
                    self.database.write_nodes(nodes)
                    nodes = []
                    root.clear()
                    pr.print(f'Continuing nodes parsing.', time=True)
            elif elem.tag == 'link':
                links.append(
                    (str(elem.get('id')),
                    str(elem.get('from')),
                    str(elem.get('to')),
                    float(elem.get('length')),
                    float(elem.get('freespeed')),
                    float(elem.get('capacity')),
                    float(elem.get('permlanes')),
                    int(elem.get('oneway')),
                    str(elem.get('modes'))))
                count += 1
                if count % bin_size == 0:
                    pr.print(
                        f'Pushing {bin_size} links to '
                        'the database.', time=True)
                    self.database.write_links(links)
                    links = []
                    root.clear()
                    pr.print(f'Continuing link parsing.', time=True)

    # Flush any links left over from the final partial batch.
    if count % bin_size != 0:
        pr.print(f'Pushing {count % bin_size} links to the database.',
            time=True)
        self.database.write_links(links)
        links = []
        root.clear()

    network_file.close()
    pr.print('Network road parsing complete.', time=True)

    if config['run']['create_idxs']:
        pr.print(f'Creating indexes for module tables.', time=True)
        self.create_idxs()
        pr.print(f'Index creation complete.', time=True)
def parse(data, tags=None):
    """Stream elements with interesting tags from an XML source.

    Args:
        data: filename or file-like object accepted by iterparse().
        tags: optional collection of tag names to yield; defaults to the
            module-level TAGS_I_CARE_ABOUT (backward compatible).

    Yields:
        Matching Element objects.  Every parsed element is cleared once
        the consumer resumes, keeping memory bounded on large documents.
    """
    if tags is None:
        tags = TAGS_I_CARE_ABOUT
    for event, elem in iterparse(data):
        if elem.tag in tags:
            yield elem
        # Clear each element after use so the tree does not grow.
        elem.clear()
# encoding: utf-8 # # Copyright (c) 2010 Doug Hellmann. All rights reserved. # """Show the events encountered while processing an XML input """ #end_pymotw_header from xml.etree.ElementTree import iterparse depth = 0 prefix_width = 8 prefix_dots = '.' * prefix_width line_template = '{prefix:<0.{prefix_len}}{event:<8}{suffix:<{suffix_len}} {node.tag:<12} {node_id}' for (event, node) in iterparse('podcasts.opml', ['start', 'end', 'start-ns', 'end-ns']): if event == 'end': depth -= 1 prefix_len = depth * 2 print line_template.format( prefix=prefix_dots, prefix_len=prefix_len, suffix='', suffix_len=(prefix_width - prefix_len), node=node, node_id=id(node), event=event, )
def parse(self, filepath, bin_size=100000):
    """Stream-parse a plans XML file and bulk-load plans, activities and
    routes into the SQL database.

    Args:
        filepath: path to the plans XML document.
        bin_size: number of selected plans to buffer before each push to
            the database (default 100000).
    """
    pr.print(f'Beginning XML input plan parsing from {filepath}.', time=True)
    pr.print('Plan parsing progress:', progress=0, persist=True, frmt='bold')

    # XML parser; keep the root so parsed elements can be freed per batch.
    parser = iterparse(filepath, events=('start', 'end'))
    parser = iter(parser)
    evt, root = next(parser)

    # bin counter (plans in the current batch) and running total
    bin_count = 0
    total_count = 0

    # tabular data buffers, flushed to the database every bin_size plans
    plans = []
    activities = []
    routes = []

    # per-plan indexes
    agent = 0
    route = 0
    activity = 0

    # distinct travel modes seen in the current plan
    modes = set()

    # iterate over XML tags
    for evt, elem in parser:
        if evt == 'start':
            if elem.tag == 'person':
                agent = int(elem.attrib['id'])
            if elem.tag == 'plan':
                # Only plans marked selected="yes" are recorded.
                if elem.attrib['selected'] != 'yes':
                    selected = False
                else:
                    selected = True
        # NOTE(review): `selected` is first bound when a <plan> start tag is
        # seen; assumes no relevant end tags precede it in the document --
        # otherwise this raises NameError. TODO confirm against input schema.
        elif evt == 'end' and selected:
            if elem.tag == 'plan':
                plans.append([          # PLANS
                    agent,              # agent_id
                    route + activity,   # size
                    len(modes)          # mode_count
                ])
                modes = set()
                route = 0
                activity = 0
                bin_count += 1
                if bin_count >= bin_size:
                    pr.print(f'Pushing {bin_count} plans to SQL server.', time=True)
                    self.database.write_plans(plans)
                    self.database.write_activities(activities)
                    self.database.write_routes(routes)
                    # Free parsed elements from memory between batches.
                    root.clear()
                    plans = []
                    activities = []
                    routes = []
                    total_count += bin_count
                    bin_count = 0
                    pr.print('Resuming XML input plan parsing.', time=True)
                    # NOTE(review): 2947013 appears to be a hard-coded total
                    # plan count used only for the progress bar -- confirm.
                    pr.print('Plan parsing progress:',
                        progress=total_count / 2947013,
                        persist=True, frmt='bold')
            elif elem.tag == 'act':
                end_time = self.parse_time(elem.attrib['end_time'])
                # Without an explicit 'dur', dur_time equals end_time, which
                # makes the derived start_time below zero.
                dur_time = end_time if 'dur' not in elem.attrib else self.parse_time(
                    elem.attrib['dur'])
                act_type = self.encoding['activity'][elem.attrib['type']]
                activities.append([         # ACTIVITIES
                    agent,                  # agent_id
                    activity,               # act_index
                    end_time - dur_time,    # start_time
                    end_time,               # end_time
                    act_type,               # act_type
                    elem.attrib['x'],       # x
                    elem.attrib['y'],       # y
                    None                    # maz
                ])
                activity += 1
            elif elem.tag == 'leg':
                dep_time = self.parse_time(elem.attrib['dep_time'])
                dur_time = self.parse_time(elem.attrib['trav_time'])
                mode = self.encoding['mode'][elem.attrib['mode']]
                modes.add(mode)
                routes.append([     # ROUTES
                    agent,          # agent_id
                    route,          # route_index
                    dep_time,       # dep_time
                    dur_time,       # dur_time
                    mode,           # mode
                    None,           # src_maz
                    None            # term_maz
                ])
                route += 1

    # Flush the final partial batch.
    pr.print(f'Pushing {bin_count} plans to SQL server.', time=True)
    pr.print('Plan parsing progress:', progress=1, persist=True, frmt='bold')
    self.database.write_plans(plans)
    self.database.write_activities(activities)
    self.database.write_routes(routes)
    pr.print('Completed XML input plan parsing.', time=True)
    root.clear()
    plans = []
    activities = []
    routes = []
# works print(doc.findtext('content/{http://www.w3.org/1999/xhtml}html/head/title')) # doesn't work print( doc.findtext( 'content/{http://www.w3.org/1999/xhtml}html/' '{http://www.w3.org/1999/xhtml}head/{http://www.w3.org/1999/xhtml}title' )) # works class XMLNamespaces: def __init__(self, **kwargs): self.namespaces = {} for name, uri in kwargs.items(): self.register(name, uri) def register(self, name, uri): self.namespaces[name] = '{' + uri + '}' def __call__(self, path): return path.format_map(self.namespaces) ns = XMLNamespaces(html='http://www.w3.org/1999/xhtml') print(doc.find(ns('content/{html}html'))) print(doc.findtext(ns('content/{html}html/{html}head/{html}title'))) for evt, elem in iterparse('st7.xml', ('end', 'start-ns', 'end-ns')): print(evt, elem)
if roots.find('language') is not None: roots.remove(roots.find('language')) roots.getchildren().index(roots.find('description')) # oops,fail to insert. el = Element('spam') el.text = "this is a test" roots.insert(2, el) # ns = XMLNammespaces(html="http://purl.org/dc/elements/1.1/") # ht = doc.find(ns('content/{html}/html')) # print(ht) # title =doc.findtext(ns('content/{html}html/{html}head/{html}title')) # print(title) print("============read xml by iterparse============") for evt, elem in iterparse(filepath, ('end', 'start-ns', 'end-ns')): print(evt, elem) print("============read xml============") doc = parse(u) print(doc) e = doc.find('channel/link') print(e.get('title')) print("e.tag:{},e.text:{}".format(e.tag, e.text)) print("============for loop============") i = 0 for item in doc.iterfind("channel/item"): title = item.findtext("title") date = item.findtext("pubDate")