def download_db(self):
    """Parse the HMDB XML dump into a pandas DataFrame and write it out.

    Unzips the HMDB archive into ``<tmp_dir>/HMDB`` on first use,
    stream-parses the first XML file found there for ``metabolite``
    records, strips the XML-namespace prefix from known category
    columns, and writes the result to ``self.out_name`` as a gzipped
    CSV.
    """
    out_dir = os.path.join(self.tmp_dir, 'HMDB')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    if len(os.listdir(out_dir)) == 0:
        self._unzip_hmdb(out_dir)

    logger.info("Parsing metabolites information from files")
    # NOTE(review): only the first file in out_dir is parsed -- confirm the
    # archive unzips to a single XML file.
    source = os.path.join(out_dir, os.listdir(out_dir)[0])
    records = []
    for ev, elem in ElementTree.iterparse(source, events=("start", "end")):
        if ev == 'end' and elem.tag == '{http://www.hmdb.ca}metabolite':
            records.append(self._create_dict(elem))
            # BUG FIX: release each consumed element so the (very large)
            # HMDB file does not accumulate as a full in-memory tree.
            elem.clear()
    df = pd.DataFrame(records)

    for col in categories:
        if col in df.columns:
            # Replace '{namespace}name' columns with plain 'name' columns.
            df[col.split('}')[1]] = df[col]
            del df[col]

    # BUG FIX: 'tupleize_cols' was removed in pandas 1.0 and made to_csv
    # raise TypeError; it had no effect on this single-level header anyway.
    df.to_csv(self.out_name, index=False, encoding='utf-8',
              compression='gzip')
    logger.info("Done processing HMDB")
def Parse(self, search_file, table_name, file_prefix=None):
    """Parser entry point.

    Parses the given POI file to POI elements, based on POI elements
    builds SQL statements for creating and populating POI table in POI
    database and triggers the DB updater to implement these SQL
    statements.

    Args:
      search_file: string containing absolute path to .poi file to be
        parsed.
      table_name: string containing name to use for POI table creation.
      file_prefix: optional string prefix associated with the file.

    Returns:
      num_fields: number of fields in search schema.
      sql_search: string containing SQL statement to execute for POI query.
      balloon_style: string containing associated balloon style.

    Raises:
      exceptions.SearchSchemaParserException exception.
      psycopg2.Warning/Error exceptions.
    """
    self._table_name = table_name
    self._file_prefix = file_prefix
    logger.info("Ingesting POI file %s into parser...", search_file)
    if file_prefix is None:
        logger.info("File prefix is None")
    else:
        logger.info("File prefix is '%s'", file_prefix)

    self.__StartDocument()
    try:
        context = ET.iterparse(search_file, SearchSchemaParser.EVENTS)
    # BUG FIX: 'except X, e' is Python-2-only syntax; the 'as' form is
    # valid on Python 2.6+ and required on Python 3.
    except ET.ParseError as e:
        row, column = e.position
        raise exceptions.SearchSchemaParserException(
            "Unable to parse POI file %s."
            " A parsing error on row %d column %d: %s" % (
                search_file, row, column, e))
def parse(self):
    """Stream-parse the UCMDB XML export.

    Fills ``self.components`` (from <object> elements) and
    ``self.relations`` (from <link> elements), both keyed by ucmdb_id.
    <attribute> elements are folded into the element currently being
    built via ``attribute_text_to_data``.
    """
    current = self.new_element()
    for event, node in ET.iterparse(self.file):
        if event != 'end':
            continue
        tag = node.tag
        if tag == 'attribute':
            self.attribute_text_to_data(node, current)
        elif tag in ('object', 'link'):
            attrib = node.attrib
            if 'operation' in attrib:
                current['operation'] = attrib['operation']
            if 'name' in attrib and 'ucmdb_id' in attrib:
                current['name'] = attrib['name']
                current['ucmdb_id'] = attrib['ucmdb_id']
                if tag == 'link':
                    # Links carry their endpoints in the accumulated data.
                    data = current['data']
                    if 'DiscoveryID1' in data:
                        current['source_id'] = data['DiscoveryID1']
                    if 'DiscoveryID2' in data:
                        current['target_id'] = data['DiscoveryID2']
                    self.relations[current['ucmdb_id']] = current
                else:
                    self.components[current['ucmdb_id']] = current
                # Start accumulating the next element.
                current = self.new_element()
        node.clear()
def iterparse_elements(element_function, file_or_path, **kwargs):
    """Apply *element_function* to each of the sub-elements in the XML file.

    The passed in function must take at least one element, and an
    optional list of **kwargs which are relevant to each of the elements
    in the list:

        def elem_func(each_elem, **kwargs)

    Implements the recommended cElementTree iterparse pattern, which is
    efficient for reading in a file, making changes and writing it again.

    :param element_function: callable invoked once per fully-read element;
        if not callable, the function returns immediately (no-op).
    :param file_or_path: a path string, or a file-like object (its
        ``name`` attribute is used when present, otherwise the object
        itself is handed to iterparse).
    :param kwargs: forwarded verbatim to *element_function*.
    """
    # IDIOM FIX: callable() is the idiomatic form of hasattr(f, '__call__').
    if not callable(element_function):
        return

    file_path = getattr(file_or_path, 'name', file_or_path)
    context = iter(iterparse(file_path, events=('start', 'end')))
    root = None  # Capture root for memory management

    # Start event loads child; by the End event it's ready for processing
    for event, child in context:
        if root is None:
            root = child
        if event == 'end':  # Ensures the element has been fully read
            element_function(child, **kwargs)
            root.clear()  # Descendants will not be accessed again
def list(self, prefix='', start_after=''):
    """List keys in the bucket.

    Generator that yields every key beginning with *prefix*,
    lexicographically after *start_after*, with ``self.prefix`` stripped
    from each yielded key.  Pages through results 1000 keys at a time
    using the S3 *marker* protocol.  Raises RuntimeError when a response
    cannot be parsed.
    """
    log.debug('started with %s, %s', prefix, start_after)

    keys_remaining = True

    # Without this, a call to list('foo') would result
    # in *prefix* being longer than *marker* - which causes
    # trouble for some S3 implementions (minio).
    if start_after:
        marker = self.prefix + start_after
    else:
        marker = ''
    prefix = self.prefix + prefix

    while keys_remaining:
        log.debug('requesting with marker=%s', marker)

        # None (vs False) marks "IsTruncated never seen" => unparseable body.
        keys_remaining = None
        resp = self._do_request('GET', '/', query_string={
            'prefix': prefix, 'marker': marker, 'max-keys': 1000})

        if not XML_CONTENT_RE.match(resp.headers['Content-Type']):
            raise RuntimeError('unexpected content type: %s' %
                               resp.headers['Content-Type'])

        try:
            itree = iter(ElementTree.iterparse(self.conn,
                                               events=("start", "end")))
            (event, root) = next(itree)

            root_xmlns_uri = self._tag_xmlns_uri(root)
            if root_xmlns_uri is None:
                root_xmlns_prefix = ''
            else:
                # Validate the XML namespace
                root_xmlns_prefix = '{%s}' % (root_xmlns_uri, )
                if root_xmlns_prefix != self.xml_ns_prefix:
                    log.error('Unexpected server reply to list operation:\n%s',
                              self._dump_response(resp, body=None))
                    raise RuntimeError(
                        'List response has %s as root tag, unknown namespace'
                        % root.tag)

            for (event, el) in itree:
                if event != 'end':
                    continue
                if el.tag == root_xmlns_prefix + 'IsTruncated':
                    keys_remaining = (el.text == 'true')
                elif el.tag == root_xmlns_prefix + 'Contents':
                    marker = el.findtext(root_xmlns_prefix + 'Key')
                    yield marker[len(self.prefix):]
                root.clear()
        # CONSISTENCY FIX: mirror the error handling used by the other
        # list() implementation in this module -- after a temporary
        # network error the connection state is unknown, so drop it
        # rather than leaving it available for reuse.
        except Exception as exc:
            if is_temp_network_error(exc):
                # We probably can't use the connection anymore
                self.conn.disconnect()
            raise
        except GeneratorExit:
            # Need to read rest of response
            self.conn.discard()
            break

    if keys_remaining is None:
        raise RuntimeError('Could not parse body')
def iter_elements_by_name(handle, name: str):
    """Yield every element tagged *name* from the XML stream in *handle*.

    Stream-parses with iterparse and prunes the root after each match so
    memory stays flat even for very large documents.
    """
    stream = cElementTree.iterparse(handle, events=("start", "end"))
    _, root = next(stream)  # first event hands us the root element
    for ev, node in stream:
        if ev != "end" or node.tag != name:
            continue
        yield node
        root.clear()  # drop already-consumed children
def download_db(self):
    """Parse the HMDB XML dump(s) into a pandas DataFrame and write it out.

    Unzips the HMDB archive into ``<tmp_dir>/HMDB`` on first use,
    stream-parses every XML file there for ``metabolite`` records,
    strips the XML-namespace prefix from known category columns, and
    writes the result to ``self.out_name`` as a gzipped CSV.
    """
    out_dir = os.path.join(self.tmp_dir, 'HMDB')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    if len(os.listdir(out_dir)) == 0:
        self._unzip_hmdb(out_dir)

    print("Parsing metabolites information from files")
    records = []
    for fname in os.listdir(out_dir):
        context = iter(ElementTree.iterparse(os.path.join(out_dir, fname),
                                             events=("start", "end")))
        _, _ = next(context)  # consume the root element event
        # BUG FIX: previously each file's results *overwrote* the previous
        # file's (tmp_all was rebound per iteration), so only the last XML
        # file ever reached the DataFrame.  Accumulate across files instead.
        records.extend(
            self._create_dict(e) for ev, e in context
            if e.tag == '{http://www.hmdb.ca}metabolite' and ev == 'end')

    df = pd.DataFrame(records)
    for col in categories:
        if col in df.columns:
            # Replace '{namespace}name' columns with plain 'name' columns.
            df[col.split('}')[1]] = df[col]
            del df[col]

    print(df.head(10))
    for col in df.columns:
        print(col, df[col].head(10))

    # BUG FIX: 'tupleize_cols' was removed in pandas 1.0 and made to_csv
    # raise TypeError; it had no effect on this single-level header anyway.
    df.to_csv(self.out_name, index=False, encoding='utf-8',
              compression='gzip')
    print("Done processing HMDB")
def is_svg(fp):
    """Return True when the file object *fp* holds an SVG document.

    Peeks at the first start tag only; the file position is restored to
    the beginning before returning.  Unparseable input yields False.
    """
    fp.seek(0)
    root_tag = None
    try:
        # Only the first 'start' event is needed to identify the root.
        _, first = next(elementtree.iterparse(fp, ('start',)))
        root_tag = first.tag
    except (elementtree.ParseError, StopIteration):
        pass
    fp.seek(0)
    return root_tag == '{http://www.w3.org/2000/svg}svg'
def list(self, prefix='', start_after=''):
    """List keys in the bucket.

    Generator that yields every key beginning with *prefix*,
    lexicographically after *start_after*, with ``self.prefix`` stripped
    from each yielded key.  Pages through results 1000 keys at a time
    using the S3 *marker* protocol.  Raises RuntimeError when a response
    cannot be parsed.
    """
    log.debug('started with %s, %s', prefix, start_after)

    marker = self.prefix + start_after
    prefix = self.prefix + prefix
    ns = self.xml_ns_prefix
    more = True

    while more:
        log.debug('requesting with marker=%s', marker)

        # None (vs False) marks "IsTruncated never seen" => bad body.
        more = None
        resp = self._do_request('GET', '/', query_string={
            'prefix': prefix, 'marker': marker, 'max-keys': 1000})

        content_type = resp.headers['Content-Type']
        if not XML_CONTENT_RE.match(content_type):
            raise RuntimeError('unexpected content type: %s' % content_type)

        try:
            parser = iter(ElementTree.iterparse(self.conn,
                                                events=("start", "end")))
            (_, root) = next(parser)
            for (ev, node) in parser:
                if ev != 'end':
                    continue
                if node.tag == ns + 'IsTruncated':
                    more = (node.text == 'true')
                elif node.tag == ns + 'Contents':
                    marker = node.findtext(ns + 'Key')
                    yield marker[len(self.prefix):]
                root.clear()
        except Exception as exc:
            if is_temp_network_error(exc):
                # We probably can't use the connection anymore
                self.conn.disconnect()
            raise
        except GeneratorExit:
            # Need to read rest of response
            self.conn.discard()
            break

    if more is None:
        raise RuntimeError('Could not parse body')
# -*- coding: utf-8 -*- import xml.etree.cElementTree as badET import defusedxml.cElementTree as goodET xmlString = "<note>\n<to>Tove</to>\n<from>Jani</from>\n<heading>Reminder</heading>\n<body>Don't forget me this weekend!</body>\n</note>" # unsafe tree = badET.fromstring(xmlString) print(tree) badET.parse("filethatdoesntexist.xml") badET.iterparse("filethatdoesntexist.xml") a = badET.XMLParser() # safe tree = goodET.fromstring(xmlString) print(tree) goodET.parse("filethatdoesntexist.xml") goodET.iterparse("filethatdoesntexist.xml") a = goodET.XMLParser()
def _parse_OVF(self, ovf):
    """Parses the OVF file

    Parses the OVF file for specified metadata properties. Interested
    properties must be specified in ovf-metadata.json conf file.

    The OVF file's qualified namespaces are removed from the included
    properties.

    :param ovf: a file object containing the OVF file
    :returns: a tuple of disk filename and a properties dictionary
    :raises RuntimeError: an error for malformed OVF file
    """
    def _get_namespace_and_tag(tag):
        """Separate and return the namespace and tag elements.

        There is no native support for this operation in elementtree
        package. See http://bugs.python.org/issue18304 for details.
        """
        m = re.match(r'\{(.+)\}(.+)', tag)
        if m:
            return m.group(1), m.group(2)
        else:
            return '', tag

    disk_filename, file_elements, file_ref = None, None, None
    properties = {}
    for event, elem in ET.iterparse(ovf):
        if event == 'end':
            ns, tag = _get_namespace_and_tag(elem.tag)
            # Collect only properties listed in the conf file, keyed as
            # '<ns-alias>_<tag>'.
            if ns in CIM_NS and tag in self.interested_properties:
                properties[CIM_NS[ns] + '_' + tag] = (elem.text.strip()
                                                      if elem.text else '')

            if tag == 'DiskSection':
                disks = [child for child in list(elem)
                         if _get_namespace_and_tag(child.tag)[1] == 'Disk']
                if len(disks) > 1:
                    # Currently only single disk image extraction is
                    # supported.
                    # FIXME(dramakri): Support multiple images in OVA package
                    raise RuntimeError(_('Currently, OVA packages '
                                         'containing multiple disk are '
                                         'not supported.'))
                disk = next(iter(disks))
                file_ref = next(value for key, value in disk.items()
                                if _get_namespace_and_tag(key)[1] == 'fileRef')

            if tag == 'References':
                file_elements = list(elem)

            # Clears elements to save memory except for 'File' and 'Disk'
            # references, which we will need to later access
            if tag != 'File' and tag != 'Disk':
                elem.clear()

    # BUG FIX: an OVF without a References section previously crashed
    # below with "TypeError: 'NoneType' object is not iterable" instead
    # of the RuntimeError the docstring promises for malformed files.
    if file_elements is None:
        raise RuntimeError(_('OVF file does not contain a References '
                             'section.'))

    for file_element in file_elements:
        file_id = next(value for key, value in file_element.items()
                       if _get_namespace_and_tag(key)[1] == 'id')
        if file_id != file_ref:
            continue
        disk_filename = next(value for key, value in file_element.items()
                             if _get_namespace_and_tag(key)[1] == 'href')
    return (disk_filename, properties)
def list(self, prefix='', start_after=''):
    """List keys in the bucket.

    Generator that yields every key beginning with *prefix*,
    lexicographically after *start_after*, with ``self.prefix`` stripped
    from each yielded key.  Pages through results 1000 keys at a time
    using the S3 *marker* protocol.  Raises RuntimeError when a response
    cannot be parsed.
    """
    log.debug('started with %s, %s', prefix, start_after)

    keys_remaining = True
    marker = self.prefix + start_after
    prefix = self.prefix + prefix

    while keys_remaining:
        log.debug('requesting with marker=%s', marker)

        # None (as opposed to False) marks "IsTruncated never seen",
        # i.e. an unparseable response body -- checked after the loop.
        keys_remaining = None
        resp = self._do_request('GET', '/', query_string={
            'prefix': prefix, 'marker': marker, 'max-keys': 1000})
        if not XML_CONTENT_RE.match(resp.headers['Content-Type']):
            raise RuntimeError('unexpected content type: %s' %
                               resp.headers['Content-Type'])

        try:
            itree = iter(
                ElementTree.iterparse(self.conn, events=("start", "end")))
            (event, root) = next(itree)

            root_xmlns_uri = self._tag_xmlns_uri(root)
            if root_xmlns_uri is None:
                root_xmlns_prefix = ''
            else:
                # Validate the XML namespace
                root_xmlns_prefix = '{%s}' % (root_xmlns_uri, )
                if root_xmlns_prefix != self.xml_ns_prefix:
                    log.error(
                        'Unexpected server reply to list operation:\n%s',
                        self._dump_response(resp, body=None))
                    raise RuntimeError(
                        'List response has %s as root tag, unknown namespace'
                        % root.tag)

            for (event, el) in itree:
                if event != 'end':
                    continue
                if el.tag == root_xmlns_prefix + 'IsTruncated':
                    keys_remaining = (el.text == 'true')
                elif el.tag == root_xmlns_prefix + 'Contents':
                    marker = el.findtext(root_xmlns_prefix + 'Key')
                    yield marker[len(self.prefix):]
                # Release already-processed elements to keep memory flat.
                root.clear()
        except Exception as exc:
            if is_temp_network_error(exc):
                # We probably can't use the connection anymore
                self.conn.disconnect()
            raise
        # GeneratorExit derives from BaseException, so the clause above
        # does not swallow it; it fires when the caller abandons the
        # generator mid-iteration.
        except GeneratorExit:
            # Need to read rest of response
            self.conn.discard()
            break

    if keys_remaining is None:
        raise RuntimeError('Could not parse body')
# NOTE(review): this looks like a static-analysis test fixture (e.g. for
# bandit) contrasting the stdlib XML parsers -- which are vulnerable to
# XML attacks such as billion-laughs / external entity expansion -- with
# their defusedxml equivalents.  The parse/iterparse calls below reference
# a file that does not exist and would raise FileNotFoundError if run;
# presumably intentional, confirm before "fixing".
import xml.etree.cElementTree as badET
import defusedxml.cElementTree as goodET

# Sample well-formed XML document used for the fromstring() calls.
xmlString = "<note>\n<to>Tove</to>\n<from>Jani</from>\n<heading>Reminder</heading>\n<body>Don't forget me this weekend!</body>\n</note>"

# unsafe: stdlib parser, should be flagged by the analyzer
tree = badET.fromstring(xmlString)
print(tree)
badET.parse('filethatdoesntexist.xml')
badET.iterparse('filethatdoesntexist.xml')
a = badET.XMLParser()

# safe: defusedxml equivalents, should pass the analyzer
tree = goodET.fromstring(xmlString)
print(tree)
goodET.parse('filethatdoesntexist.xml')
goodET.iterparse('filethatdoesntexist.xml')
a = goodET.XMLParser()
def _parse_OVF(self, ovf):
    """Parses the OVF file

    Parses the OVF file for specified metadata properties. Interested
    properties must be specified in ovf-metadata.json conf file.

    The OVF file's qualified namespaces are removed from the included
    properties.

    :param ovf: a file object containing the OVF file
    :returns: a tuple of disk filename and a properties dictionary
    :raises RuntimeError: an error for malformed OVF file
    """
    def _get_namespace_and_tag(tag):
        """Separate and return the namespace and tag elements.

        There is no native support for this operation in elementtree
        package. See http://bugs.python.org/issue18304 for details.
        """
        m = re.match(r'\{(.+)\}(.+)', tag)
        if m:
            return m.group(1), m.group(2)
        else:
            return '', tag

    disk_filename, file_elements, file_ref = None, None, None
    properties = {}
    for event, elem in ET.iterparse(ovf):
        if event == 'end':
            ns, tag = _get_namespace_and_tag(elem.tag)
            # Collect only properties listed in the conf file, keyed as
            # '<ns-alias>_<tag>'.
            if ns in CIM_NS and tag in self.interested_properties:
                properties[CIM_NS[ns] + '_' + tag] = (elem.text.strip()
                                                      if elem.text else '')

            if tag == 'DiskSection':
                disks = [child for child in list(elem)
                         if _get_namespace_and_tag(child.tag)[1] == 'Disk']
                if len(disks) > 1:
                    """ Currently only single disk image extraction is supported. 
FIXME(dramakri): Support multiple images in OVA package """
                    raise RuntimeError(_('Currently, OVA packages '
                                         'containing multiple disk are '
                                         'not supported.'))
                # NOTE(review): assumes DiskSection always holds at least
                # one Disk element -- an empty section would raise
                # StopIteration here; confirm whether that can occur.
                disk = next(iter(disks))
                file_ref = next(value for key, value in disk.items()
                                if _get_namespace_and_tag(key)[1] == 'fileRef')

            if tag == 'References':
                file_elements = list(elem)

            # Clears elements to save memory except for 'File' and 'Disk'
            # references, which we will need to later access
            if tag != 'File' and tag != 'Disk':
                elem.clear()

    # NOTE(review): if the OVF has no References section, file_elements
    # stays None and this loop raises TypeError rather than the
    # RuntimeError documented above -- worth confirming upstream
    # guarantees a References section is always present.
    for file_element in file_elements:
        file_id = next(value for key, value in file_element.items()
                       if _get_namespace_and_tag(key)[1] == 'id')
        if file_id != file_ref:
            continue
        disk_filename = next(value for key, value in file_element.items()
                             if _get_namespace_and_tag(key)[1] == 'href')
    return (disk_filename, properties)
# This program searches within an xml file to find information, # in this example the information of file "Users.xml" will be used to implement an SQL statement. # Hope this will useful for you! - ernecto-ca # Imports to read a xml and time import defusedxml.cElementTree as goodET from datetime import datetime # Create the "tree" element of the xml tree = goodET.parse('Users.xml') goodET.iterparse('Users.xml') # Get the information of the xml root = tree.getroot() # To check how long it took now = datetime.now() print(now) # The main part of the sql sql = "INSERT INTO users VALUES (" #try: ***when it will insert in a data base**** for user in root: sql = sql + "'" + str(user.get('Id')).replace("'", "´") + "'," sql = sql + "'" + str(user.get('Reputation')).replace("'", "´") + "'," sql = sql + "'" + str(user.get('CreationDate')).replace("'", "´") + "'," sql = sql + "'" + str(user.get('DisplayName')).replace("'", "´") + "'," sql = sql + "'" + str(user.get('LastAccessDate')).replace("'", "´") + "'," sql = sql + "'" + str(user.get('WebsiteUrl')).replace("'", "´") + "'," sql = sql + "'" + str(user.get('Location')).replace("'", "´").replace(