def ensure_elementtree_imported(verbosity, logfile):
    global ET, ET_has_iterparse
    if ET is not None:
        return
    if "IronPython" in sys.version:
        import xml.etree.ElementTree as ET
        #### 2.7.2.1: fails later with
        #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
    else:
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            try:
                import cElementTree as ET
            except ImportError:
                try:
                    import lxml.etree as ET
                except ImportError:
                    try:
                        import xml.etree.ElementTree as ET
                    except ImportError:
                        try:
                            import elementtree.ElementTree as ET
                        except ImportError:
                            raise Exception("Failed to import an ElementTree implementation")
    if hasattr(ET, 'iterparse'):
        _dummy_stream = BYTES_IO(b'')
        try:
            ET.iterparse(_dummy_stream)
            ET_has_iterparse = True
        except NotImplementedError:
            pass
    if verbosity:
        etree_version = repr([
            (item, getattr(ET, item))
            for item in ET.__dict__.keys()
            if item.lower().replace('_', '') == 'version'
        ])
        print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile)
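# A minimal usage sketch, not taken from the original source: the helper above expects
# module-level globals ET / ET_has_iterparse and a BYTES_IO alias (io.BytesIO on
# Python 3). The file name below is a placeholder.
import io
import sys

ET = None
ET_has_iterparse = False
BYTES_IO = io.BytesIO

ensure_elementtree_imported(verbosity=1, logfile=sys.stderr)
for _, elem in ET.iterparse("example.xml"):
    pass  # each element is fully parsed by the time it is yielded here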
def process_map(filename):
    st = set()
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":  # was `in ("tag")`, which is a substring test against the string "tag"
            if element.attrib['k'] == "cuisine":
                if element.attrib['v'].lower() in ('coffe_shop', 'coffee_shop'):
                    element.attrib['v'] = 'coffee'
                if element.attrib['v'].lower() in ('steak_house', 'steaks'):
                    element.attrib['v'] = 'steak'
                if element.attrib['v'].lower() in ('mexican', 'mexcian_food'):
                    element.attrib['v'] = 'mexican'
                st.add(element.attrib['v'].lower())
    dct = {}
    for item in st:
        dct[item] = 0
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
            if element.attrib['k'] == "cuisine":
                if element.attrib['v'].lower() in ('coffe_shop', 'coffee_shop'):
                    element.attrib['v'] = 'coffee'
                if element.attrib['v'].lower() in ('steak_house', 'steaks'):
                    element.attrib['v'] = 'steak'
                if element.attrib['v'].lower() in ('mexican', 'mexcian_food'):
                    element.attrib['v'] = 'mexican'
                dct[element.attrib['v'].lower()] = dct[element.attrib['v'].lower()] + 1
    dctFinal = {}
    for key in dct:
        if ";" not in key and "," not in key:
            dctFinal[key] = dct[key]
    return dctFinal
def _read(self):
    try:
        i = ET.iterparse(self.f, ('start', 'end'))
    except FileNotFoundError:
        self._create()
        i = ET.iterparse(self.f, ('start', 'end'))
    return i
def _GetSheetRows(self, filename):
    """Parses the contents of the first sheet of an XLSX document.

    Args:
      filename (str): The file path of the XLSX document to parse.

    Returns:
      list[list[str]]: A list of lists representing the rows of the first sheet.

    Raises:
      ValueError: if the sheet cannot be found, or a string cannot be read.
    """
    zip_file = zipfile.ZipFile(filename)

    # Fail if we can't find the expected first sheet.
    if self._SHEET1 not in zip_file.namelist():
        raise ValueError(
            'Unable to locate expected sheet: {0:s}'.format(self._SHEET1))

    # Generate a reference table of shared strings if available.
    strings = []
    if self._SHARED_STRINGS in zip_file.namelist():
        zip_file_object = zip_file.open(self._SHARED_STRINGS)
        for _, element in ElementTree.iterparse(zip_file_object):
            if element.tag.endswith(self._SHARED_STRING_TAG):
                strings.append(element.text)

    row = []
    rows = []
    value = ''
    zip_file_object = zip_file.open(self._SHEET1)
    for _, element in ElementTree.iterparse(zip_file_object):
        if (element.tag.endswith(self._VALUE_STRING_TAG) or
                element.tag.endswith(self._SHARED_STRING_TAG)):
            value = element.text

        if element.tag.endswith(self._COLUMN_TAG):
            # Grab value from shared string reference table if type shared string.
            if (strings and element.attrib.get(self._TYPE_ATTRIBUTE) ==
                    self._SHARED_STRING_TYPE):
                try:
                    value = strings[int(value)]
                except (IndexError, ValueError):
                    raise ValueError(
                        'Unable to successfully dereference shared string.')

            row.append(value)

        # If we see the end tag of the row, record row in rows and reset.
        if element.tag.endswith(self._ROW_TAG):
            rows.append(row)
            row = []

    return rows
def process(fn, options):
    if options.output_dir == '-':
        outdir = None    # use STDOUT
    else:
        outdir = make_output_directory(fn, options)
    if not fn.endswith('.gz'):
        process_stream(ET.iterparse(fn), fn, outdir, options)
    else:
        with gzip.GzipFile(fn) as stream:
            process_stream(ET.iterparse(stream), fn, outdir, options)
def process_map(filename):
    st = set()
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":  # was `in ("tag")`, a substring test against the string "tag"
            if element.attrib['k'] == "power":
                st.add(element.attrib['v'])
    dct = {}
    for item in st:
        dct[item] = 0
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
            if element.attrib['k'] == "power":
                dct[element.attrib['v']] = dct[element.attrib['v']] + 1
    return dct
def getDataFromExternal(self, date, progress_callback=None):
    if self.xmltvType == XMLTVSource.TYPE_LOCAL_FILE:
        f = FileWrapper(self.xmltvFile)
        context = ElementTree.iterparse(f, events=("start", "end"))
        size = f.size
    else:
        u = urllib2.urlopen(self.xmltvUrl, timeout=30)
        xml = u.read()
        u.close()
        f = StringIO.StringIO(xml)
        context = ElementTree.iterparse(f)
        size = len(xml)
    return self.parseXMLTV(context, f, size, self.logoFolder, progress_callback)
def process_map(filename):
    users = set()  # empty set, so each value is stored only once
    for event, element in ET.iterparse(filename):  # loop through the elements
        if 'uid' in element.attrib:  # look for a uid attribute
            users.add(element.attrib['uid'])  # add the uid to the set
    return users
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        # Check and deduplicate on the same attribute ('uid'); the original tested
        # 'user' but added 'uid', so the membership check could never match.
        if 'uid' in element.attrib and element.attrib['uid'] not in users:
            users.add(element.attrib['uid'])
    return users
def xml_hunt(xml_file):
    """
    Gets list of all XML entries with "filename" attribute, and returns
    a dictionary of the file attributes keyed by a ":"-joined string of
    parent names.
    """
    root = ET.iterparse(xml_file, events=("start", "end"))
    parents = []
    matches = {}
    for event, element in root:
        if element.tag not in ["folder", "file"]:  # skip topmost categories
            continue
        if element.tag == "folder":
            if event == "start":  # add to parents
                parents.append(element.attrib["name"])
            elif event == "end":  # strip from parents
                del parents[-1]
            continue
        if event == "start" and element.tag == "file":
            parent_string = ":".join(parents)
            try:
                matches[parent_string].append(element.attrib)
            except KeyError:
                matches[parent_string] = [element.attrib]
    return matches
def reports(request):
    for element in ET.iterparse(request):
        # do stuff to parse this element
        # save a models.Report
        pass
    # return list of file hashes we need uploaded
    return HttpResponse()
def read_xml_input(inputfile, outputfile):
    output = []
    tree = ET.iterparse(inputfile)
    for event, elem in tree:
        if event == "end" and elem.tag == "Sentence":
            story = elem
            # Check to make sure all the proper XML attributes are included
            attribute_check = [key in story.attrib for key in ['date', 'id', 'sentence', 'source']]
            if not all(attribute_check):  # `attribute_check` is a list, so every entry must be tested
                print('Need to properly format your XML...')
                break
            text = story.find('Text').text
            text = text.replace('\n', ' ').replace('  ', ' ').strip()
            output.append(text + "\n")
            elem.clear()
    ofile = open(outputfile, 'w')
    for line in output:
        ofile.write(line.encode('utf8'))
    ofile.close()
def main():
    parser = argparse.ArgumentParser(description='Get file paths for input/output')
    parser.add_argument('--in', dest='input_file', required=True)
    parser.add_argument('--out', dest='output_file')
    args = parser.parse_args()
    input_file = args.input_file
    # Use the provided output path, otherwise derive it from the input name;
    # the original only assigned output_file when --out was missing.
    output_file = args.output_file
    if not output_file:
        output_file = input_file[:input_file.rfind('.')] + ".json"

    # convert XML to json
    with open(output_file, 'a') as f:
        # parse input XML file
        for event, elem in Et.iterparse(input_file):
            if elem.tag == "row" and '_uuid' in elem.attrib:
                this_dict = dict()
                # Add _address
                this_dict['source_uri'] = elem.attrib['_address']
                for child in elem.findall('*'):
                    if child.tag == "location_1":
                        if 'latitude' in child.attrib:
                            this_dict['latitude'] = child.attrib['latitude']
                        if 'longitude' in child.attrib:
                            this_dict['longitude'] = child.attrib['longitude']
                    else:
                        this_dict[child.tag] = child.text
                # Write record to file as JSON
                json.dump(this_dict, f)
                print('', file=f)
                # this helps reduce mem usage but more can be done
                # (see http://effbot.org/zone/element-iterparse.htm)
                elem.clear()
def count_tags(filename):
    tags = {}
    for evt, elem in ET.iterparse(filename):
        if elem.tag in tags:  # dict.has_key() no longer exists in Python 3
            tags[elem.tag] = tags[elem.tag] + 1
        else:
            tags[elem.tag] = 1  # start the count at 1, not 0, for the first occurrence
    return tags  # the original never returned the counts
def parse_repomd_xml(repomd_file_path, skip_data_types=None):
    skip_data_types = skip_data_types or []
    if not os.access(repomd_file_path, os.F_OK | os.R_OK):
        return {}

    xml_parser = ElementTree.iterparse(repomd_file_path, events=("end",))
    xml_iterator = iter(xml_parser)

    data_type_dict = {}
    for event, element in xml_iterator:
        if element.tag != _DATA_TAG:
            continue
        if element.attrib["type"] in skip_data_types:
            continue

        data_type = copy.deepcopy(_DATA_SKEL)
        data_type["data_type"] = element.attrib["type"]

        location_element = element.find(_LOCATION_TAG)
        if location_element is not None:
            data_type["relative_path"] = location_element.attrib["href"]

        checksum_element = element.find(_CHECKSUM_TAG)
        if checksum_element is not None:
            data_type["checksum_type"] = checksum_element.attrib["type"]
            data_type["checksum"] = checksum_element.text

        data_type_dict[data_type["data_type"]] = data_type

    return data_type_dict
def getDataFromExternal(self, date, progress_callback=None):
    f = FileWrapper(self.xmltvFile)
    context = ElementTree.iterparse(f, events=("start", "end"))
    size = f.size
    return self.parseXMLTV(context, f, size, self.logoFolder, progress_callback)
def __iter__(self):
    if self.is_debug:
        fname = self.protxml + '.dump'
        logging.debug('Dumping protxml reads into ' + fname)
        self.debug_file = open(fname, 'w')
        self.debug_file.write('{\n')
    for event, elem in etree.iterparse(self.protxml, events=('end', 'start-ns')):
        if event == 'start-ns':
            self.nsmap.update({elem})
        if event == 'end':
            if elem.tag == parse.fixtag('', 'protein_group', self.nsmap):
                group = parse_protein_group(elem, self.nsmap)
                yield group
                if self.is_debug:
                    pprint(group, stream=self.debug_file)
                    self.debug_file.write(',\n')
                elem.clear()
            elif elem.tag == parse.fixtag('', 'proteinprophet_details', self.nsmap):
                self.distribution = parse_protein_probabilities(elem, self.nsmap)
                if self.is_debug:
                    fname = self.protxml + '.distribution.dump'
                    pprint(self.distribution, open(fname, 'w'))
                elem.clear()
    if self.is_debug:
        self.debug_file.write('}\n')
        self.debug_file.close()
def process_map(filename):
    """
    Takes in an OSM file and prints information that is useful for auditing.
    Calls a function that outputs a text file with all keys and a set of their distinct values.
    """
    tag_dictionary = {}
    entry_counts = {}
    street_types = {}
    keys_to_inspect = {"lower_colon": set(), "problemchars": set(), "other": set()}
    key_counts = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    for event, element in ET.iterparse(filename, events=("start",)):
        # Update key_counts, keys_to_inspect
        key_counts, keys_to_inspect = key_type(element, key_counts, keys_to_inspect)
        # Update tag_dictionary, entry_counts, street_types
        tag_dictionary, entry_counts, street_types = build_dictionary(
            element, tag_dictionary, entry_counts, street_types)

    # Create text file and print report
    create_text_file(tag_dictionary, "tag_key_values")
    print_report(key_counts, keys_to_inspect, entry_counts, street_types)
    return
def __init__(self, file):
    """Initialize the class."""
    # Get an iterable context for XML parsing events
    context = iter(ElementTree.iterparse(file, events=('start', 'end')))
    event, root = next(context)
    self.root = root
    self.context = context
def process_map(filename):
    users = set()

    # The following lines allow us to access the root element from within
    # the iterator in order to clear the memory as we go.
    # get an iterable
    context = ET.iterparse(filename, events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element (next(context) instead of the Python 2-only context.next())
    event, root = next(context)

    # Iterate through the elements, aggregating each distinct user
    # into the users set.
    for _, element in context:
        if 'uid' in element.attrib.keys():
            user = element.attrib['uid']
            if user in users:
                continue
            else:
                users.add(user)
        root.clear()
    return users
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if "uid" in element.attrib:
            users.add(element.attrib["uid"])
    return users
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys
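# The key_type() helper called above (and again in a later snippet) is not shown in
# this collection. A minimal sketch of the usual shape of that helper in the
# OpenStreetMap auditing exercises; the exact regular expressions are assumptions.
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    # Classify the 'k' attribute of each <tag> element into one of four buckets.
    if element.tag == "tag":
        k = element.attrib['k']
        if lower.search(k):
            keys["lower"] += 1
        elif lower_colon.search(k):
            keys["lower_colon"] += 1
        elif problemchars.search(k):
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
    return keys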
def do_parse(opt, filename):
    ls = subprocess.Popen([djvutoxml, filename],
                          stdout=subprocess.PIPE,
                          preexec_fn=setrlimits,
                          close_fds=True)

    page_nr = 1
    for event, elem in etree.iterparse(XmlFile(ls.stdout)):
        if elem.tag.lower() == 'object':
            page = OcrPage()
            if not opt.silent:
                print >> sys.stderr, page_nr, '\r',
            page.start_page(elem)
            parse_page(page, elem, page_nr)
            page.end_page(elem)

            filename = opt.out_dir + 'page_%04d.html' % page_nr
            # Encode the page once so both branches have `text`; the original only
            # assigned it in the compressed branch.
            text = page.get_hocr_html().encode('utf-8')
            if opt.compress:
                utils.compress_file_data(filename, text, opt.compress)
            else:
                utils.write_file(filename, text)

            elem.clear()
            page_nr += 1

    if not opt.silent:
        print >> sys.stderr

    ls.wait()
    return ls.returncode
def __init__(self, filename):
    print >> log.v4, "Loading lexicon", filename
    lex_file = open(filename, 'rb')
    if filename.endswith(".gz"):
        lex_file = gzip.GzipFile(fileobj=lex_file)
    self.phonemes = {}
    self.lemmas = {}

    context = iter(etree.iterparse(lex_file, events=('start', 'end')))
    _, root = next(context)  # get root element
    tree = [root]
    for event, elem in context:
        if event == "start":
            tree += [elem]
        elif event == "end":
            assert tree[-1] is elem
            tree = tree[:-1]
            if elem.tag == "phoneme":
                symbol = elem.find("symbol").text.strip()  # should be unicode
                variation = elem.find("variation").text.strip()
                assert symbol not in self.phonemes
                assert variation in ["context", "none"]
                self.phonemes[symbol] = {"index": len(self.phonemes), "symbol": symbol, "variation": variation}
                root.clear()  # free memory
            elif elem.tag == "phoneme-inventory":
                print >> log.v4, "Finished phoneme inventory, %i phonemes" % len(self.phonemes)
                root.clear()  # free memory
            elif elem.tag == "lemma":
                orth = elem.find("orth").text.strip()
                phons = [{"phon": e.text.strip(), "score": float(e.attrib.get("score", 0))} for e in elem.findall("phon")]
                assert orth not in self.lemmas
                self.lemmas[orth] = {"orth": orth, "phons": phons}
                root.clear()  # free memory
    print >> log.v4, "Finished whole lexicon, %i lemmas" % len(self.lemmas)
def from_config_xml(config_xml):
    if not path.isfile(config_xml):
        raise Exception("Not a file: {}".format(config_xml))

    # Workaround for ET stripping some namespaces
    events = ("start", "start-ns", "end-ns")
    namespace_mapping = {}
    namespaces = []
    root = None
    for event, elem in Et.iterparse(config_xml, events=events):
        if event == "start-ns":
            namespaces.append(elem)
        elif event == "start":
            namespace_mapping[elem] = dict(namespaces)
            namespaces = []
            # Find root while we are here
            if elem.tag == "snapshot":
                root = elem

    # Empty
    if root is None:
        return OdlConfig([], [])

    modules_elements = root.findall(OdlConfig.MODULE_PATH, OdlConfig.NAMESPACES)
    modules = OdlConfig.__parse_modules(modules_elements, namespace_mapping)

    services_elements = root.findall(OdlConfig.SERVICE_PATH, OdlConfig.NAMESPACES)
    services = OdlConfig.__parse_services(services_elements, namespace_mapping)

    if not modules and not services:
        return OdlConfig([], [])

    return OdlConfig(modules, services)
def process_map(filename):
    # creates the dictionary keys and calls key_type
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
        element.clear()
    return keys
def jsonify(file_in, pretty=False):
    # processes file into JSON
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for event, element in ET.iterparse(file_in, events=("start",)):
            if element.tag == "node" or element.tag == "way":
                for tag in element.iter("tag"):
                    if is_street_name(tag):
                        m = street_type_re.search(tag.attrib['v'])
                        if m:
                            street_name = update_name(tag.attrib['v'], mapping)
                            m = directions_re.search(street_name)
                            if m:
                                street_name = update_direction(street_name, direction_mapping)
                                m = directions_re.search(street_name)
                            tag.set('v', street_name)
                el = shape_element(element)
                if el:
                    data.append(el)
                    if pretty:
                        fo.write(json.dumps(el, indent=2) + "\n")
                    else:
                        fo.write(json.dumps(el) + "\n")
            element.clear()
    # pprint.pprint(data)
    return data
def getCategories(self):
    cat = dict()
    path = os.path.join(datapath, 'cats.xml')
    dixie.log("Checking for category XML path at: " + path)

    xml = None
    try:
        if sfile.exists(path):
            xml = sfile.read(path)
    except:
        pass
    if not xml:
        # guard so `xml` is never left unbound when the file is missing
        dixie.log("### cats.xml does not exist")
        return cat

    # xml = xml.replace('&', '&amp;')
    xml = StringIO.StringIO(xml)
    xml = ElementTree.iterparse(xml, events=("start", "end"))

    for event, elem in xml:
        try:
            if event == 'end':
                if elem.tag == 'cats':
                    channel = elem.findtext('channel')
                    category = elem.findtext('category')
                    if channel != '' and category != '':
                        cat[channel] = category
        except:
            pass

    return cat
def parse(self, xml_file, from_string=False):
    """Import .nessus file"""
    # Parse XML file
    if from_string:
        xml_file = StringIO(xml_file)

    # Iterate through each host scanned and create objects for each
    for event, elem in ET.iterparse(xml_file):
        # Grab the report name from the Report element
        if event == "end" and elem.tag == "Report":
            self.name = elem.attrib.get("name")
            continue
        # Only process ReportHost elements
        elif event == "end" and elem.tag != "ReportHost":
            continue

        rh_obj = ReportHost(elem)
        if rh_obj:
            self.targets.append(rh_obj)

            # Update Report dates
            if not self.scan_start and rh_obj.get("host_start"):
                self.scan_start = rh_obj.host_start
            if not self.scan_end:
                self.scan_end = rh_obj.host_end
            if rh_obj.get("host_start"):
                if rh_obj.host_start < self.scan_start:
                    self.scan_start = rh_obj.host_start
                if rh_obj.host_end > self.scan_end:
                    self.scan_end = rh_obj.host_end
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Generator function to parse UniProt XML as SeqRecord objects.

    parses an XML entry at a time from any UniProt XML file
    returns a SeqRecord for each iteration

    This generator can be used in Bio.SeqIO

    return_raw_comments = True --> comment fields are returned as complete XML to allow further processing
    skip_parsing_errors = True --> if parsing errors are found, skip to next entry
    """
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if isinstance(alphabet, Alphabet.Gapped):
        if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
            raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if isinstance(handle, str):
            handle = StringIO(handle)
        else:
            raise Exception('An XML-containing handler or an XML string must be passed')

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
            "No ElementTree module was found. "
            "Use Python 2.5+, lxml or elementtree if you "
            "want to use Bio.SeqIO.UniprotIO.")

    for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
        if event == "end" and elem.tag == NS + "entry":
            yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse()
            elem.clear()
def check_street(filename):
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
    print(street_types, "%s: %d")
    return street_types
def check_amenity(filename):
    for event, elem in ET.iterparse(filename):
        if is_amenity_name(elem):
            audit_amenity_type(amenity_types, elem.attrib['v'])
    print(amenity_types, "%s: %d")
    return amenity_types
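# The is_street_name()/audit_street_type() helpers used by the two audit functions
# above are not included in this collection. A minimal sketch of what the street
# helpers commonly look like for OSM data; the regular expression and the
# addr:street key are assumptions.
import re
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)  # last word of the street name
street_types = defaultdict(int)

def is_street_name(elem):
    # OSM stores street names as <tag k="addr:street" v="...">
    return elem.tag == "tag" and elem.attrib.get('k') == "addr:street"

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_types[m.group()] += 1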
#!/Applications/QGIS3.14.app/Contents/MacOS/bin/python3
#!/bin/env python3
#
#
# (c) 2020 - Alessandro Frigeri, Istituto Nazionale di Astrofisica
#
# symbolsplitter - splits qgis xml style file into n xml files containing one symbol each
#

import xml.etree.ElementTree as ET
import sys, string

infile = sys.argv[1]

context = ET.iterparse(infile, events=('end', 'start'))

depth = 0
for event, elem in context:
    if elem.tag == 'symbol':
        if event == 'end':
            depth -= 1
        if event == 'start':
            depth += 1
        name = elem.get('name')
        if depth == 0:
            filename = format(name.split(':')[0] + ".xml")
            with open(filename, 'wb') as f:
                #f.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
                # quote the attribute value and close <qgis_style> so the output is well-formed XML
                f.write(b"<qgis_style version=\"1\">\n<symbols>\n")
                f.write(ET.tostring(elem))
                f.write(b"\n</symbols>\n</qgis_style>")
'''
Created on 1 June 2016

@author: Михаил Булыгин <*****@*****.**>
'''

import xml.etree.ElementTree as ET
from pg import DB

if __name__ == '__main__':
    file = open("ruwiki.xml", encoding="utf-8")
    tree = ET.iterparse(file)

    db = DB(host="localhost", user="******", passwd="1234", dbname="wiki")
    db.query("TRUNCATE TABLE wiki")

    for i, line in enumerate(tree):
        event, element = line
        if element.tag == "page":
            pageid = element.find("id").text
            title = element.find("title").text
            timestamp = element.find("revision").find("timestamp").text.replace("T", " ")
            username = element.find("revision").find("contributor").find("username")
            if not username is None:
                author = username.text
            else:
                author = element.find("revision").find("contributor").find("ip").text
            text = element.find("revision").find("text").text
    q.put((editid, timestamp, articleid, userid, quality, delta_edit,
           len(text_prev), len(text_final), future_edits))


print("//{}".format(args.xml_file))

# number of processes to use
NUMBER_PROCESSES = args.processes
# whether should 'cleanly' separate the computations into test/training sets
THRESHOLD_TIMESTAMP = args.threshold

# time start in seconds
time_start = time.time()

## open XML file
parse_iterator = ET.iterparse(args.xml_file, events=('end', 'start'))
parse_iterator = iter(parse_iterator)

# keep track of the root element
event, root = next(parse_iterator)

rev_count = 0
results = []

with mp.Pool(NUMBER_PROCESSES) as pool:
    manager = mp.Manager()
    q = manager.Queue()
    writer = csv.writer(sys.stdout, delimiter='#')

    edits11 = [""]
    ids11 = ["-"]
    users11 = [-1]
def process_map(filename):
    """process file iteratively"""
    users = set()
    for _, element in ET.iterparse(filename):
        users.update(get_user(element))
    return users
#!/usr/bin/python
import re
from xml.etree import ElementTree as ET

#parser = ET.itparse('10000_new')
for event, element in ET.iterparse('10000_new', events=('start', 'end')):
    if element.tag == 'doc':
        print(element.text)
def parse(self, values_are_confidence=False, rooted=False): """Parse the text stream this object was initialized with.""" nexml_doc = ElementTree.iterparse(self.handle, events=("end",)) for event, node in nexml_doc: if node.tag == qUri("nex:tree"): node_dict = {} node_children = {} root = None child_tags = node.getchildren() nodes = [] edges = [] for child in child_tags: if child.tag == qUri("nex:node"): nodes.append(child) if child.tag == qUri("nex:edge"): edges.append(child) for node in nodes: node_id = node.attrib["id"] this_node = node_dict[node_id] = {} if "otu" in node.attrib and node.attrib["otu"]: this_node["name"] = node.attrib["otu"] if "root" in node.attrib and node.attrib["root"] == "true": root = node_id for child in node.getchildren(): if child.tag == qUri("nex:meta"): self.add_annotation(node_dict[node_id], child) srcs = set() tars = set() for edge in edges: src, tar = edge.attrib["source"], edge.attrib["target"] srcs.add(src) tars.add(tar) if src not in node_children: node_children[src] = set() node_children[src].add(tar) if "length" in edge.attrib: node_dict[tar]["branch_length"] = float(edge.attrib["length"]) if "property" in edge.attrib and edge.attrib["property"] in matches( "cdao:has_Support_Value" ): node_dict[tar]["confidence"] = float(edge.attrib["content"]) for child in edge.getchildren(): if child.tag == qUri("nex:meta"): self.add_annotation(node_dict[tar], child) if root is None: # if no root specified, start the recursive tree creation function # with the first node that's not a child of any other nodes rooted = False possible_roots = ( node.attrib["id"] for node in nodes if node.attrib["id"] in srcs and node.attrib["id"] not in tars ) root = next(possible_roots) else: rooted = True yield NeXML.Tree( root=self._make_tree(root, node_dict, node_children), rooted=rooted )
def parseMML(mmlinput): exppy = "" # this is the python expression symvars = [ ] # these are symbolic variables which can eventually take part in the expression events = ("start", "end") level = 0 context = ET.iterparse(mmlinput, events=events) for action, elem in context: if (action == 'start') and (elem.tag == 'mfrac'): level += 1 tree = ET.ElementTree(elem[0]) tree.write('output.xml') (a, b) = parseMML('output.xml') symvars.append(b) for index in a: exppy += index exppy += '/' tree = ET.ElementTree(elem[1]) tree.write('output.xml') (a, b) = parseMML('output.xml') symvars.append(b) for index in a: exppy += index if (action == 'end') and (elem.tag == 'mfrac'): level -= 1 if level: continue if (action == 'start') and (elem.tag == 'mrow'): exppy += '(' if (action == 'end') and (elem.tag == 'mrow'): exppy += ')' if action == 'start' and elem.tag == 'msub': # this is a power # level += 1 # tree = ET.ElementTree(elem[0]) # tree.write('output.xml') # (a, b) = parseMML('output.xml') # symvars.append(b) # for index in a: # exppy += '[' # exppy += index # exppy += ']' # exppy += '**' # tree = ET.ElementTree(elem[1]) # tree.write('output.xml') # (a, b) = parseMML('output.xml') # symvars.append(b) # for index in a: # exppy += index level += 1 tree = ET.ElementTree(elem[0]) tree.write('output.xml') (a, b) = parseMML('output.xml') symvars.append(b) for index in a: exppy += '[' exppy += index exppy += ']' exppy += '**' tree = ET.ElementTree(elem[1]) tree.write('output.xml') (a, b) = parseMML('output.xml') symvars.append(b) for index in a: exppy += index # exppy += '' if action == 'start' and elem.tag == 'mn': # this is a number exppy += elem.text if action == 'start' and elem.tag == 'mi': # this is a variable exppy += elem.text symvars.append( elem.text ) # we'll return the variable, so sympy can sympify it afterwards if action == 'start' and elem.tag == 'mo': # this is a operation exppy += elem.text if exppy.startswith('(') and exppy.endswith(')'): exppy = exppy[1:-1] exppyarray = exppy.split("=") # for exppy in exppyarray : # print(exppy) return exppyarray, symvars
def appGmlToMetadataElementDict(gmlPath): """słownik metadataElementDict na podstawie pliku zbioru APP""" metadataElementDict = {} ns = { 'gco': "http://www.isotc211.org/2005/gco", 'app': "https://www.gov.pl/static/zagospodarowanieprzestrzenne/schemas/app/1.0", 'gmd': "http://www.isotc211.org/2005/gmd", 'gml': "http://www.opengis.net/gml/3.2", 'wfs': "http://www.opengis.net/wfs/2.0", 'xlink': "http://www.w3.org/1999/xlink", 'xsi': "http://www.w3.org/2001/XMLSchema-instance" } root = ET.parse(gmlPath) # E1 element = root.find('//app:AktPlanowaniaPrzestrzennego/app:typPlanu', ns) if element is not None: typPlanu = element.attrib['{%s}title' % ns['xlink']].replace( 'miejscowy', 'miejscowych').replace('plan', 'planów').replace( 'kier.', 'kierunków').replace('zagosp.', 'zagospodarowania').replace( 'przestrz.', 'przestrzennego') metadataElementDict['e1'] = { 'e1_lineEdit': "Zbiór danych przestrzennych dla %s <typ_jednostki> <nazwa_jednostki>" % typPlanu } # E5 date = root.find('//app:AktPlanowaniaPrzestrzennego//app:przestrzenNazw', ns) if date is None: utils.showPopup( "Błąd pliku", "wczytany plik nie jest poprawną definicją GML dla zbioru APP. Zwaliduj plik przed wczytaniem do formularza metadanych", QMessageBox.Warning) return False metadataElementDict['e5'] = [{'e5_lineEdit': date.text}] # E7 - kodowanie z nagłówka GML with open(gmlPath, 'r') as file: line = file.readlines(1)[0] line.replace("'", '"') encoding = re.search('encoding="[a-zA-Z0-9\-]{3,10}"', line)[0].split("=")[-1].strip('"').replace( ' ', '').replace('-', '').lower() if encoding == 'usascii': encoding = 'usAscii' metadataElementDict['e7'] = [{'e7_cmbbx': encoding}] # E9, E10 - słowa kluczowe itemsList = [] date = root.find('//app:AktPlanowaniaPrzestrzennego/app:poziomHierarchii', ns) if date is not None: atrybut_title = date.attrib['{%s}title' % ns['xlink']] atrybut_href = date.attrib['{%s}href' % ns['xlink']] tekst = 'Regionalnym' if atrybut_title == 'regionalny' else 'Lokalne' # poziom administracyjny itemsList.append({ 'e9_lineEdit': tekst, 'e10_cmbbx': 'Data opublikowania', 'e10_dateTimeEdit': QDateTime(2019, 5, 22, 0, 0), 'e10_lineEdit': 'Zakres przestrzenny', 'xlink': "http://inspire.ec.europa.eu/metadata-codelist/SpatialScope" }) # poziom jednostki itemsList.append({ 'e9_lineEdit': atrybut_title, 'e10_cmbbx': 'Data opublikowania', 'e10_dateTimeEdit': QDateTime(2013, 12, 10, 0, 0), 'e10_lineEdit': 'Poziom planu zagospodarowania przestrzennego', 'xlink': "http://inspire.ec.europa.eu/codelist/LevelOfSpatialPlanValue" }) # dodanie domyslnych wartosci kluczowych itemsList.extend(dictionaries.metadataListWidgetsDefaultItems['e9']) metadataElementDict['e9'] = itemsList # E11 layer = QgsVectorLayer(gmlPath + '|layername=AktPlanowaniaPrzestrzennego', "gml", 'ogr') # if not layer.isValid(): # sprawdzanie czy AktPlanowaniaPrzestrzennego jest w wfs:member lub gml:featureMember (bezpośrednio) # layer = QgsVectorLayer(gmlPath + '|layername=featureMember', "gml", 'ogr') if layer.isValid(): sourceCrs = layer.crs() extent = layer.extent() # w zwiazku z niepoprawnym zaczytywaniem zasiegu GML przez QGIS - odwrocenie osi ''' Dla wersji QGIS <= 3.14 przy wczytywaniu GML # z definicją układu # jako uri do opengis.net np. 
http://www.opengis.net/def/crs/EPSG/0/2177 # QGIS wczytuje zasięg z odwróconymi X i Y # TODO: do wykomentowania gdy błąd zostanie naprawiony w nowej wersji programu # dla starych - pozostaje ''' extentInverted = QgsRectangle(extent.yMinimum(), extent.xMinimum(), extent.yMaximum(), extent.xMaximum()) crsDest = QgsCoordinateReferenceSystem(4326) # WGS84 xform = QgsCoordinateTransform(sourceCrs, crsDest, QgsProject.instance()) extent84 = xform.transform(extentInverted) metadataElementDict['e11'] = [{ 'e11_lineEdit': '%s,%s,%s,%s' % (extent84.xMinimum(), extent84.xMaximum(), extent84.yMinimum(), extent84.yMaximum()) }] # E12 itemsList = [] # szukaj w rysunkach APP for uklad in root.findall('//*/app:ukladOdniesieniaPrzestrzennego', ns): if {'e12_cmbbx': uklad.text} not in itemsList: itemsList.append({'e12_cmbbx': uklad.text}) # szukaj w zasięgach APP for multiSurface in root.findall( '//*/app:zasiegPrzestrzenny/gml:MultiSurface', ns): if {'e12_cmbbx': multiSurface.attrib['srsName']} not in itemsList: itemsList.append({'e12_cmbbx': multiSurface.attrib['srsName']}) metadataElementDict['e12'] = itemsList # E13 dates = [] for date in root.findall( '//app:AktPlanowaniaPrzestrzennego/app:poczatekWersjiObiektu', ns): dates.append(QDateTime.fromString(date.text, "yyyy-MM-dd'T'hh:mm:ss")) oldestDate = utils.oldestQDateTime(dates) if oldestDate is not None: metadataElementDict['e13'] = {'e13_dateTimeEdit': oldestDate} # E16 itemsList = [] for rozdzielczosc in root.findall('//*/app:rozdzielczoscPrzestrzenna', ns): if {'e16_lineEdit': rozdzielczosc.text} not in itemsList: itemsList.append({'e16_lineEdit': rozdzielczosc.text}) metadataElementDict['e16'] = itemsList # E18 i E19 i E24 i E25 itemsList = [] inspire1 = "Rozporządzenie Komisji (UE) Nr 1089/2010 z dnia 23 listopada 2010 r. w sprawie wykonania dyrektywy 2007/2/WE Parlamentu Europejskiego i Rady w zakresie interoperacyjności zbiorów i usług danych przestrzennych" inspire2 = "D2.8.III.4 Data Specification on Land Use – Technical Guidelines" krajowy1 = "Rozporządzenie Ministra Rozwoju, Pracy i Technologii z dnia 26 października 2020 r. 
w sprawie zbiorów danych przestrzennych oraz metadanych w zakresie zagospodarowania przestrzennego" krajowy2 = "Planowanie przestrzenne: Specyfikacja danych" ifKrajowy = False ifInspire = False namespaces = dict( [node for _, node in ET.iterparse(gmlPath, events=['start-ns'])]) for v in namespaces.values(): if 'https://www.gov.pl/static/zagospodarowanieprzestrzenne' in v: ifKrajowy = True # ifInspire = False break if 'http://inspire.ec.europa.eu/schemas/plu/4.0/PlannedLandUse.xsd' in v: # ifKrajowy = False ifInspire = True break # E18 i E19 inspire1 itemsList.append({ 'e18_lineEdit': inspire1, 'e18_dateTimeEdit': QDateTime(2010, 12, 8, 0, 0), 'e18_cmbbx': 'Data opublikowania', 'e19_cmbbx': 'Zgodny (conformant)' if ifInspire else 'Niezgodny (notConformant)', 'xlink': "http://data.europa.eu/eli/reg/2010/1089" }) # E18 i E19 inspire2 itemsList.append({ 'e18_lineEdit': inspire2, 'e18_dateTimeEdit': QDateTime(2013, 12, 10, 0, 0), 'e18_cmbbx': 'Data opublikowania', 'e19_cmbbx': 'Zgodny (conformant)' if ifInspire else 'Niezgodny (notConformant)' }) # E18 i E19 krajowy1 itemsList.append({ 'e18_lineEdit': krajowy1, 'e18_dateTimeEdit': QDateTime(2020, 10, 31, 0, 0), 'e18_cmbbx': 'Data opublikowania', 'e19_cmbbx': 'Zgodny (conformant)' if ifKrajowy else 'Niezgodny (notConformant)', 'xlink': "https://dziennikustaw.gov.pl/DU/2020/1916" }) # E18 i E19 krajowy2 itemsList.append({ 'e18_lineEdit': krajowy2, 'e18_dateTimeEdit': QDateTime(2020, 10, 31, 0, 0), 'e18_cmbbx': 'Data opublikowania', 'e19_cmbbx': 'Zgodny (conformant)' if ifKrajowy else 'Niezgodny (notConformant)', 'xlink': "" # TODO: uaktualnić po publikacji }) metadataElementDict['e18'] = itemsList # E24 i E25 krajowy itemsList = [] if ifKrajowy: itemsList.append({ 'e24_lineEdit': "Schemat aplikacyjny GML Planowanie przestrzenne", 'e25_lineEdit': "1.0" }) if ifInspire: itemsList.append({ 'e24_lineEdit': "Planned Land Use GML Application Schema", 'e25_lineEdit': "4.0" }) metadataElementDict['e24'] = itemsList return metadataElementDict
def getDataFromExternal(self, date, progress_callback=None):
    f = FileWrapper(self.xmltvFile)
    context = ElementTree.iterparse(f, events=("start", "end"))
    return parseXMLTV(context, f, f.size, self.logoFolder, progress_callback)
def parse(self, values_are_confidence=False, rooted=False): """Parse the text stream this object was initialized with.""" nexml_doc = ElementTree.iterparse(self.handle, events=('end', )) for event, node in nexml_doc: if node.tag == qUri('nex:tree'): node_dict = {} node_children = {} root = None child_tags = node.getchildren() nodes = [] edges = [] for child in child_tags: if child.tag == qUri('nex:node'): nodes.append(child) if child.tag == qUri('nex:edge'): edges.append(child) for node in nodes: node_id = node.attrib['id'] this_node = node_dict[node_id] = {} if 'otu' in node.attrib and node.attrib['otu']: this_node['name'] = node.attrib['otu'] if 'root' in node.attrib and node.attrib['root'] == 'true': root = node_id for child in node.getchildren(): if child.tag == qUri('nex:meta'): self.add_annotation(node_dict[node_id], child) srcs = set() tars = set() for edge in edges: src, tar = edge.attrib['source'], edge.attrib['target'] srcs.add(src) tars.add(tar) if src not in node_children: node_children[src] = set() node_children[src].add(tar) if 'length' in edge.attrib: node_dict[tar]['branch_length'] = float( edge.attrib['length']) if 'property' in edge.attrib and edge.attrib[ 'property'] in matches('cdao:has_Support_Value'): node_dict[tar]['confidence'] = float( edge.attrib['content']) for child in edge.getchildren(): if child.tag == qUri('nex:meta'): self.add_annotation(node_dict[tar], child) if root is None: # if no root specified, start the recursive tree creation function # with the first node that's not a child of any other nodes rooted = False possible_roots = (node.attrib['id'] for node in nodes if node.attrib['id'] in srcs and not node.attrib['id'] in tars) root = next(possible_roots) else: rooted = True yield NeXML.Tree(root=self._make_tree(root, node_dict, node_children), rooted=rooted)
def activity_summary(self, file_path): ''' Main XML parsing script, streams in file and proccesses each branch into lists, which get converted to data frames and exported to csv files for database upload ''' # activity data date = [] energy_burned = [] energy_burned_goal = [] energy_burned_unit = [] exercise_time = [] exercise_time_goal = [] stand_hours = [] stand_hours_goal = [] file = file_path + '/apple_health_export/export.xml' # exercise time exercise_time_type = [] exercise_time_date = [] exercise_time_duration = [] exercise_time_durationUnit = [] # workout data workoutActivityType = [] duration = [] durationUnit = [] totalDistance = [] totalDistanceUnit = [] totalEnergyBurned = [] totalEnergyBurnedUnit = [] sourceName = [] sourceVersion = [] device = [] creationDate = [] startDate = [] endDate = [] # heartrate data record_type = [] record_unit = [] record_value = [] record_sourceName = [] record_sourceVersion = [] record_device = [] record_creationDate = [] record_startDate = [] record_endDate = [] for event, elem in ET.iterparse(file, events=("start", "end")): if event == 'end': # process the tag if elem.tag == 'ActivitySummary': # import pdb;pdb.set_trace() for item in elem.items(): if item[0] == 'dateComponents': date.append(item[1]) elif item[0] == 'activeEnergyBurned': energy_burned.append(item[1]) elif item[0] == 'activeEnergyBurnedGoal': energy_burned_goal.append(item[1]) elif item[0] == 'activeEnergyBurnedUnit': energy_burned_unit.append(item[1]) elif item[0] == 'appleExerciseTime': exercise_time.append(item[1]) elif item[0] == 'appleExerciseTimeGoal': exercise_time_goal.append(item[1]) elif item[0] == 'appleStandHours': stand_hours.append(item[1]) elif item[0] == 'appleStandHoursGoal': stand_hours_goal.append(item[1]) if elem.tag == 'WorkoutEvent': for item in elem.items(): if item[0] == 'type': exercise_time_type.append(item[1]) elif item[0] == 'date': exercise_time_date.append(item[1]) elif item[0] == 'duration': exercise_time_duration.append(item[1]) elif item[0] == 'durationUnit': exercise_time_durationUnit.append(item[1]) if elem.tag == 'Workout': for item in elem.items(): if item[0] == 'workoutActivityType': workoutActivityType.append(item[1]) if item[0] == 'duration': duration.append(item[1]) if item[0] == 'durationUnit': durationUnit.append(item[1]) if item[0] == 'totalDistance': totalDistance.append(item[1]) if item[0] == 'totalDistanceUnit': totalDistanceUnit.append(item[1]) if item[0] == 'totalEnergyBurned': totalEnergyBurned.append(item[1]) if item[0] == 'totalEnergyBurnedUnit': totalEnergyBurnedUnit.append(item[1]) if item[0] == 'sourceName': sourceName.append(item[1]) if item[0] == 'sourceVersion': sourceVersion.append(item[1]) if item[0] == 'device': device.append(item[1]) if item[0] == 'creationDate': creationDate.append(item[1]) if item[0] == 'startDate': startDate.append(item[1]) if item[0] == 'endDate': endDate.append(item[1]) ''' if elem.tag == 'Record': for item in elem.items(): if item[0] == 'type': record_type.append(item[1]) if item[0] == 'unit': record_unit.append(item[1]) if item[0] == 'value': record_value.append(item[1]) if item[0] == 'sourceName': record_sourceName.append(item[1]) if item[0] == 'sourceVersion': record_sourceVersion.append(item[1]) if item[0] == 'device': record_device.append(item[1]) if item[0] == 'creationDate': record_creationDate.append(item[1]) if item[0] == 'startDate': record_startDate.append(item[1]) if item[0] == 'endDate': record_endDate.append(item[1]) ''' # this is the key to memory management on the server 
elem.clear() # create activity data data frame print('Creating activity data...') li = list( zip(date, energy_burned, energy_burned_goal, energy_burned_unit, exercise_time, exercise_time_goal, stand_hours, stand_hours_goal)) df = pd.DataFrame(li, columns=[ 'date', 'energy_burned', 'energy_burned_goal', 'energy_burned_unit', 'exercise_time', 'exercise_time_goal', 'stand_hours', 'stand_hours_goal' ]) # remove dates before 2000-01-01 df['datetime'] = pd.to_datetime(df['date']) df = df[df['datetime'] > '2000-01-01'] # drop datetime column df = df.drop(['datetime'], axis=1) # add created_at, last_updated_by df['created_at'] = pd.to_datetime('now') df['updated_at'] = pd.to_datetime('now') df.fillna(0, inplace=True) # create exercise time data frame print('Creating exercise time data...') li = list( zip(exercise_time_date, exercise_time_type, exercise_time_duration, exercise_time_durationUnit)) exercise_time = pd.DataFrame(li, columns=[ 'date', 'exercise_time_type', 'exercise_time_duration', 'exercise_time_durationUnit' ]) # remove dates before 2000-01-01 exercise_time['datetime'] = pd.to_datetime(exercise_time['date']) exercise_time = exercise_time[exercise_time['datetime'] > '2000-01-01'] # drop datetime column exercise_time = exercise_time.drop(['datetime'], axis=1) # add created_at, last_updated_by exercise_time['created_at'] = pd.to_datetime('now') exercise_time['updated_at'] = pd.to_datetime('now') exercise_time.fillna(0, inplace=True) # create workout data frame print('Creating workout data...') li = list( zip(workoutActivityType, duration, durationUnit, totalDistance, totalDistanceUnit, totalEnergyBurned, totalEnergyBurnedUnit, sourceName, sourceVersion, device, creationDate, startDate, endDate)) workout = pd.DataFrame( li, columns=[ 'activity_type', 'duration', 'duration_unit', 'total_distance', 'total_distance_unit', 'total_energy_burned', 'total_energy_burned_unit', 'source_name', 'source_version', 'device', 'creation_date', 'start_date', 'end_date' ]) # remove dates before 2000-01-01 workout['creation_datetime'] = pd.to_datetime(workout['creation_date']) workout = workout[workout['creation_datetime'] > '2000-01-01'] workout['start_datetime'] = pd.to_datetime(workout['start_date']) workout = workout[workout['start_datetime'] > '2000-01-01'] workout['end_datetime'] = pd.to_datetime(workout['end_date']) workout = workout[workout['end_datetime'] > '2000-01-01'] workout['date'] = workout['start_datetime'].dt.date # drop datetime column workout = workout.drop( ['creation_datetime', 'start_datetime', 'end_datetime'], axis=1) # add created_at, last_updated_by workout['created_at'] = pd.to_datetime('now') workout['updated_at'] = pd.to_datetime('now') workout.fillna(0, inplace=True) # cleanup activity_type and device column text, date workout['gadget'] = np.where( workout['device'].str.contains('Apple Watch'), 'Apple Watch', 'iPhone') # remove HKWorkoutActivityType from activity_type text workout['activity'] = (workout['activity_type'].str.replace( 'HKWorkoutActivityType', '')) # remove unnecessary columns workout = workout[[ 'date', 'activity', 'duration', 'duration_unit', 'total_distance', 'total_distance_unit', 'total_energy_burned', 'total_energy_burned_unit', 'gadget', 'start_date', 'end_date', 'created_at', 'updated_at' ]] ''' # create heartrate data frame print('Creating heartrate data...') li = list(zip(record_type, record_unit, record_value, record_sourceName, record_sourceVersion, record_device, record_creationDate, record_startDate, record_endDate)) record = pd.DataFrame(li, 
columns=['type', 'unit', 'value', 'source_name', 'source_version', 'device', 'creation_date', 'start_date', 'end_date']) # remove dates before 2000-01-01 record['creation_datetime'] = pd.to_datetime(record['creation_date']) record = record[record['creation_datetime'] > '2000-01-01'] record['start_datetime'] = pd.to_datetime(record['start_date']) record = record[record['start_datetime'] > '2000-01-01'] record['end_datetime'] = pd.to_datetime(record['end_date']) record = record[record['end_datetime'] > '2000-01-01'] record['date'] = record['start_datetime'].dt.strftime('%Y-%m-%d') # drop datetime column record = record.drop(['creation_datetime', 'start_datetime', 'end_datetime'], axis=1) # filter down to heartrate record = record[record['type'] == 'HKQuantityTypeIdentifierHeartRate'] # clean up device data (look for Apple Watch, iPhone) record['gadget'] = np.where(record['device'].str.contains('Apple Watch'), 'Apple Watch', 'iPhone') # decrease columns to necessary info record = record[['date', 'gadget', 'value']] record['value'] = record['value'].astype(float) # aggregate this before adding to db (max, min, avg) record_avg = record.groupby(['date', 'gadget'], as_index=False).mean() record_max = record.groupby(['date', 'gadget'], as_index=False).max() record_min = record.groupby(['date', 'gadget'], as_index=False).min() # combine these into a single df for record record_avg.columns = ['date', 'gadget', 'avg'] record_max.columns = ['date', 'gadget', 'max'] record_min.columns = ['date', 'gadget', 'min'] heartrate = record_avg.merge(record_max, on=['date', 'gadget']) heartrate = heartrate.merge(record_min, on=['date', 'gadget']) # add created_at, last_updated_by heartrate['created_at'] = pd.to_datetime('now') heartrate['updated_at'] = pd.to_datetime('now') heartrate.fillna(0, inplace=True) # import pdb; pdb.set_trace() ''' # csv exports df.to_csv(file_path + 'activity_summary.csv', index=False) exercise_time.to_csv(file_path + 'exercise_time.csv', index=False) workout.to_csv(file_path + 'workout.csv', index=False) # heartrate.to_csv(file_path + 'heartrate.csv', index=False) return df
def iterconfigurations(self): """ Create and return iterator for the available Configuration objects. The iterator loops over all Configurations in the dump file tree, in document order. """ cfg = None cfg_arguments = [ ] # function arguments for Configuration node initialization cfg_function = None cfg_valueflow = None # Iterating <varlist> in a <scope>. iter_scope_varlist = False # Iterating <typedef-info> iter_typedef_info = False # Use iterable objects to traverse XML tree for dump files incrementally. # Iterative approach is required to avoid large memory consumption. # Calling .clear() is necessary to let the element be garbage collected. for event, node in ElementTree.iterparse(self.filename, events=('start', 'end')): # Serialize new configuration node if node.tag == 'dump': if event == 'start': cfg = Configuration(node.get('cfg')) continue elif event == 'end': cfg.setIdMap(cfg_arguments) yield cfg cfg = None cfg_arguments = [] elif node.tag == 'clang-warning' and event == 'start': cfg.clang_warnings.append({ 'file': node.get('file'), 'line': int(node.get('line')), 'column': int(node.get('column')), 'message': node.get('message') }) # Parse standards elif node.tag == "standards" and event == 'start': continue elif node.tag == 'c' and event == 'start': cfg.standards.set_c(node) elif node.tag == 'cpp' and event == 'start': cfg.standards.set_cpp(node) elif node.tag == 'posix' and event == 'start': cfg.standards.set_posix(node) # Parse directives list elif node.tag == 'directive' and event == 'start': cfg.directives.append(Directive(node)) # Parse macro usage elif node.tag == 'macro' and event == 'start': cfg.macro_usage.append(MacroUsage(node)) # Preprocessor #if/#elif condition elif node.tag == "if-cond" and event == 'start': cfg.preprocessor_if_conditions.append( PreprocessorIfCondition(node)) # Parse tokens elif node.tag == 'tokenlist' and event == 'start': continue elif node.tag == 'token' and event == 'start': cfg.tokenlist.append(Token(node)) # Parse scopes elif node.tag == 'scopes' and event == 'start': continue elif node.tag == 'scope' and event == 'start': cfg.scopes.append(Scope(node)) elif node.tag == 'varlist': if event == 'start': iter_scope_varlist = True elif event == 'end': iter_scope_varlist = False # Parse functions elif node.tag == 'functionList' and event == 'start': continue elif node.tag == 'function': if event == 'start': cfg_function = Function(node, cfg.scopes[-1]) continue elif event == 'end': cfg.functions.append(cfg_function) cfg_function = None # Parse function arguments elif node.tag == 'arg' and event == 'start': arg_nr = int(node.get('nr')) arg_variable_id = node.get('variable') cfg_function.argumentId[arg_nr] = arg_variable_id # Parse variables elif node.tag == 'var' and event == 'start': if iter_scope_varlist: cfg.scopes[-1].varlistId.append(node.get('id')) else: var = Variable(node) if var.nameTokenId: cfg.variables.append(var) else: cfg_arguments.append(var) # Parse typedef info elif node.tag == 'typedef-info': iter_typedef_info = (event == 'start') elif iter_typedef_info and node.tag == 'info' and event == 'start': cfg.typedefInfo.append(TypedefInfo(node)) # Parse valueflows (list of values) elif node.tag == 'valueflow' and event == 'start': continue elif node.tag == 'values': if event == 'start': cfg_valueflow = ValueFlow(node) continue elif event == 'end': cfg.valueflow.append(cfg_valueflow) cfg_valueflow = None # Parse values elif node.tag == 'value' and event == 'start': cfg_valueflow.values.append(Value(node)) # Remove links to the sibling 
nodes node.clear()
def parse_xml(file_name):
    events = ("start", "end")
    context = ET.iterparse(file_name, events=events)
    pt(context)
def getDataFromExternal(self, date, progress_callback=None):
    xml = self._downloadUrl(self.ontvUrl)
    io = StringIO.StringIO(xml)
    context = ElementTree.iterparse(io)
    return parseXMLTV(context, io, len(xml), None, progress_callback)
print("Processing ", sys.argv[1])

#idx = 0
#nsmap = {}
#for event, elem in etree.iterparse(sys.argv[1], events=('start-ns', )):
    #ns, url = elem
    #print(ns, url)
    #nsmap[ns] = url
    #idx += 1
    #if idx == 10:
        #break
#print(nsmap)

#idx = 0
data_dict = {}
for event, elem in etree.iterparse(sys.argv[1], events=('start', 'end',)):
    tag = elem.tag.split('}')[1]
    if 'start' == event and 'page' == tag:
        data_dict = {}
    elif 'end' == event and 'page' == tag:
        elem.clear()
        #print(data_dict['ns'], '-', data_dict['title'])
        if data_dict['ns'] == '0' and len(data_dict['text']) > 500:
            # Save to file
            filepath = sys.argv[2] + '/' + plain_name(data_dict['title'])
            open(filepath, 'w').write(data_dict['text'])
            print(" wrote ", filepath)
    if 'title' == tag and 'end' == event:
        data_dict['title'] = elem.text
def init_cache(self) -> None: msg = f"Edit file {self.config_file} with AIXM directory" if self.aixm_path is None or self.cache_dir is None: raise RuntimeError(msg) self.full_dict: Dict[str, Any] = {} self.all_points: Dict[str, Point] = {} assert self.aixm_path.is_dir() self.ns: Dict[str, str] = dict() cache_file = self.cache_dir / "aixm.pkl" if cache_file.exists(): with cache_file.open("rb") as fh: try: elts = pickle.load(fh) self.full_dict = elts[0] self.all_points = elts[1] self.tree = elts[2] self.ns = elts[3] self.initialized = True return except Exception: logging.warning("aixm files: rebuilding cache file") for filename in [ "AirportHeliport.BASELINE", "Airspace.BASELINE", "DesignatedPoint.BASELINE", "Navaid.BASELINE", "StandardInstrumentArrival.BASELINE", ]: if not (self.aixm_path / filename).exists(): zippath = zipfile.ZipFile( self.aixm_path.joinpath(f"{filename}.zip").as_posix() ) zippath.extractall(self.aixm_path.as_posix()) # The versions for namespaces may be incremented and make everything # fail just for that reason! for _, (key, value) in ElementTree.iterparse( (self.aixm_path / "Airspace.BASELINE").as_posix(), events=["start-ns"], ): self.ns[key] = value self.tree = ElementTree.parse( (self.aixm_path / "Airspace.BASELINE").as_posix() ) for airspace in self.tree.findall( "adrmsg:hasMember/aixm:Airspace", self.ns ): identifier = airspace.find("gml:identifier", self.ns) assert identifier is not None assert identifier.text is not None self.full_dict[identifier.text] = airspace points = ElementTree.parse( (self.aixm_path / "DesignatedPoint.BASELINE").as_posix() ) for point in points.findall( "adrmsg:hasMember/aixm:DesignatedPoint", self.ns ): identifier = point.find("gml:identifier", self.ns) assert identifier is not None assert identifier.text is not None floats = point.find( "aixm:timeSlice/aixm:DesignatedPointTimeSlice/" "aixm:location/aixm:Point/gml:pos", self.ns, ) assert floats is not None assert floats.text is not None designator = point.find( "aixm:timeSlice/aixm:DesignatedPointTimeSlice/aixm:designator", self.ns, ) type_ = point.find( "aixm:timeSlice/aixm:DesignatedPointTimeSlice/aixm:type", self.ns, ) name = designator.text if designator is not None else None type_str = type_.text if type_ is not None else None coords = tuple(float(x) for x in floats.text.split()) self.all_points[identifier.text] = Point( coords[0], coords[1], name, type_str ) points = ElementTree.parse( (self.aixm_path / "Navaid.BASELINE").as_posix() ) for point in points.findall("adrmsg:hasMember/aixm:Navaid", self.ns): identifier = point.find("gml:identifier", self.ns) assert identifier is not None assert identifier.text is not None floats = point.find( "aixm:timeSlice/aixm:NavaidTimeSlice/" "aixm:location/aixm:ElevatedPoint/gml:pos", self.ns, ) assert floats is not None assert floats.text is not None designator = point.find( "aixm:timeSlice/aixm:NavaidTimeSlice/aixm:designator", self.ns ) type_ = point.find( "aixm:timeSlice/aixm:NavaidTimeSlice/aixm:type", self.ns ) name = designator.text if designator is not None else None type_str = type_.text if type_ is not None else None coords = tuple(float(x) for x in floats.text.split()) self.all_points[identifier.text] = Point( coords[0], coords[1], name, type_str ) with cache_file.open("wb") as fh: pickle.dump( (self.full_dict, self.all_points, self.tree, self.ns), fh ) self.initialized = True
#print to_write offtet=mini+' '+str(offset) offset+=(len(to_write)+1) offtxt.write(offtet+'\n') outtxt.write(to_write+'\n'); flag=0 for i in range(chunknum): if com_read[i]==0: flag=1 if flag==0: #pass break outtxt.close() offtxt.close() #print ind for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')): tname = strip_tag_name(elem.tag) if event == 'start': if tname == 'page': title = '' hitext = '' rtitle = '' id = -1 redirect = '' inrevision = False ns = 0 l_hitext=[] dicw={} elif tname == 'revision': # Do not pick up on revision id's
def main(argv): inputfolder = '' try: opts, args = getopt.getopt(argv, "hi:", ["ifile="]) except getopt.GetoptError: print 'parse.py -i <inputfolder>' sys.exit(2) for opt, arg in opts: if opt == '-h': print 'parse.py -i <inputfolder>' sys.exit() elif opt in ("-i", "--ifile"): inputfolder = arg if inputfolder == '': print 'parse.py -i <inputfolder>' sys.exit(2) if not inputfolder.endswith('/'): inputfolder = inputfolder + '/' files = [] for entry in os.listdir(inputfolder): if entry.endswith('.xml'): files.append(inputfolder + entry) db = MySQLdb.connect(host="127.0.0.1", user="******", passwd="", db="patent_research_base") cur = db.cursor() company_id = '' print '\n\n### Running matches ###\n' for inputfile in files: ignorecount = failcount = successcount = 0 total_size = os.path.getsize(inputfile) file = open(inputfile, 'r') for event, elem in etree.iterparse(file): if event is 'end' and elem.tag.endswith('record'): pubyear = int(elem.getchildren()[8].text.split('/')[0]) # if (pubyear <= 2012 and pubyear >= 1982): pat_id = str(elem.getchildren()[1].text) check_query = "SELECT EXISTS(SELECT 1 FROM processed_patents WHERE patent_id='%s')" % ( pat_id) cur.execute(check_query) if cur.fetchone()[0] != 1: patent_id = capturepatid(pat_id) inventors = splitinventors(elem.getchildren()[6], elem.getchildren()[7]) for owner in elem.getchildren()[2]: if owner.text: company_id = owner.text try: citing = int(elem.getchildren()[3].text) except: citing = 0 try: cited = int(elem.getchildren()[4].text) except: cited = 0 classification = str(elem.getchildren()[5].text) patent = { 'PubYear': pubyear, 'Pat_ID': pat_id, 'Patent_ID': patent_id, 'Inventors': inventors, 'Company_ID': company_id, 'Citing': citing, 'Cited': cited, 'Classification': classification } result = findpatent(patent, db) if not result: failcount += 1 else: successcount += 1 update_query = "INSERT INTO processed_patents (patent_id) VALUES('%s')" % ( pat_id) cur.execute(update_query) db.commit() else: ignorecount += 1 # else: # ignorecount += 1 progress = float(file.tell()) / total_size sys.stdout.write( '\rFilename: %s - Processed: %s\t| Failed: %d Success: %d Ignored: %d' % (inputfile, "{:.0%}".format(progress), failcount, successcount, ignorecount)) sys.stdout.flush() move_destination = "./processed/" + \ inputfile.split('/')[len(inputfile.split('/')) - 1] shutil.move(inputfile, move_destination) print ''
def audit_st_tp(filename):
    problem_street_types = defaultdict(set)
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            expected_street_type(problem_street_types, elem.attrib['v'])
    return problem_street_types
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if 'uid' in element.attrib:
            users.add(element.attrib['uid'])
    return users
def __init__(self, uniprot, base_map, pdb_to_go, go_prop_map, login, progress):
    self.uniprot = uniprot
    self.uniprot_qid = base_map[uniprot]['qid']
    self.ensp = set()
    self.ncbip = set()
    self.go_terms = set()
    self.login = login
    self.go_prop_map = go_prop_map
    self.entrez = base_map[uniprot]['entrez']['id']
    self.entrez_quid = base_map[uniprot]['entrez']['qid']
    self.res_id = base_map[uniprot]['entrez']['res_id']
    self.label = ''
    self.description = ''
    self.aliases = set()
    self.tax_id = ''
    self.annotation_type = ''
    self.statements = []
    self.res_prefixes = {x.split(':')[0] for x in res_id_to_entrez_qid}

    start = time.time()

    if not os.path.exists('./data/uniprot_raw'):
        os.makedirs('./data/uniprot_raw')

    # check if Uniprot xml exists and its age?
    r = requests.get('http://www.uniprot.org/uniprot/{}.xml'.format(self.uniprot))
    f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'w')
    f.write(r.text)
    f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'r')

    # check if XML can be properly parsed, log obsolete items for permanent removal.
    try:
        for event, e in Et.iterparse(f, events=('start', 'end')):
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}entry':
                if 'dataset' in e.attrib:
                    self.annotation_type = e.attrib['dataset']

            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}protein':
                tmp = e.find('./{http://uniprot.org/uniprot}recommendedName/'
                             '{http://uniprot.org/uniprot}fullName')
                if tmp is not None:
                    self.label = tmp.text
                elif e.find('./{http://uniprot.org/uniprot}submittedName/'
                            '{http://uniprot.org/uniprot}fullName') is not None:
                    self.label = e.find('./{http://uniprot.org/uniprot}submittedName/'
                                        '{http://uniprot.org/uniprot}fullName').text

                for prop in e.findall('./{http://uniprot.org/uniprot}alternativeName/'):
                    self.aliases.add(prop.text)

            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}organism':
                for prop in e.findall('./{http://uniprot.org/uniprot}dbReference'):
                    if prop.attrib['type'] == 'NCBI Taxonomy':
                        self.tax_id = prop.attrib['id']

            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                    and 'type' in e.attrib and e.attrib['type'] == 'Ensembl':
                for prop in e.findall('./{http://uniprot.org/uniprot}property'):
                    if prop.attrib['type'] == 'protein sequence ID':
                        self.ncbip.add(prop.attrib['value'])
                        self.statements.append(PBB_Core.WDString(
                            value=prop.attrib['value'], prop_nr='P705',
                            references=[self.create_reference()]))

            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                    and 'type' in e.attrib and e.attrib['type'] == 'RefSeq':
                self.ncbip.add(e.attrib['id'])
                self.statements.append(PBB_Core.WDString(
                    value=e.attrib['id'], prop_nr='P637',
                    references=[self.create_reference()]))

            # get alternative identifiers for gene to protein mapping
            if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                    and 'type' in e.attrib and e.attrib['type'] in self.res_prefixes:
                res_id = e.attrib['id']
                if res_id in res_id_to_entrez_qid:
                    self.entrez_quid = res_id_to_entrez_qid[res_id][0]

    except Et.ParseError as e:
        print('Error when parsing Uniprot {} XML file, item {} most likely obsolete'
              .format(self.uniprot, self.uniprot_qid))
        PBB_Core.WDItemEngine.log(
            'ERROR',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                main_data_id='{}'.format(self.uniprot),
                exception_type=type(e),
                message=e.__str__(),
                wd_id=self.uniprot_qid,
                duration=time.time() - start))
        return

    # get GO annotations from QuickGO
    params = {'format': 'tsv', 'limit': '1000', 'protein': self.uniprot}
    url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation'
    try:
        itrt = iter(requests.get(url, params=params).text.strip('\n ').split('\n'))
        next(itrt)  # skip header line
        for line in itrt:
            cols = line.split('\t')
            go_id = cols[6]
            evidence_code = cols[9]
            go_aspect = cols[11][0]

            if self.uniprot not in pdb_to_go:
                pdb_to_go[self.uniprot] = {'go_terms': list(), 'evidence': list(), 'pdb': set()}

            pdb_to_go[self.uniprot]['go_terms'].append(go_id)
            pdb_to_go[self.uniprot]['evidence'].append(evidence_code)

            if go_id in go_prop_map:
                go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_aspect)
    except requests.HTTPError:
        pass
    except IndexError:
        pass

    # set description according to the annotation the Uniprot entry is coming from
    self.description = self.descr_map[self.tax_id]['en']
    if self.annotation_type == 'TrEMBL':
        self.description += ' (annotated by UniProtKB/TrEMBL {})'.format(self.uniprot)
    elif self.annotation_type == 'Swiss-Prot':
        self.description += ' (annotated by UniProtKB/Swiss-Prot {})'.format(self.uniprot)

    # assign a GO term a GO subontology/OBO namespace
    if self.uniprot in pdb_to_go:
        for go in set(pdb_to_go[self.uniprot]['go_terms']):
            # check if a GO term is not yet in Wikidata
            # TODO: If a GO term is not in Wikidata, trigger OBO bot to add it
            if go not in go_prop_map:
                PBB_Core.WDItemEngine.log(
                    'ERROR',
                    '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                        main_data_id='{}'.format(self.uniprot),
                        exception_type='GO term not in Wikidata exception',
                        message='GO term {} not found in Wikidata, skipping this one'.format(go),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start))
                print('GO term {} not found in Wikidata, skipping this one'.format(go))
                continue

            # search in the EBI OBO Lookup Service, for the rare case a GO term
            # has not been assigned its class
            if not go_prop_map[go]['go_class_prop']:
                go_class_prop = ProteinBot.get_go_class(go)
                if not go_class_prop:
                    continue
                go_prop_map[go]['go_class_prop'] = go_class_prop
                print('added class code {} to {}'.format(go_prop_map[go]['go_class_prop'], go))

            # create a set of WD QIDs representing GO evidence code items in WD
            evidence = {self.go_evidence_codes[ev]
                        for count, ev in enumerate(pdb_to_go[self.uniprot]['evidence'])
                        if pdb_to_go[self.uniprot]['go_terms'][count] == go}

            # iterate through the evidence code set and create a new qualifier for each one
            qualifiers = [PBB_Core.WDItemID(value=ev, prop_nr='P459', is_qualifier=True)
                          for ev in evidence if ev]

            # create Wikidata GO term value
            prop_nr = self.go_prop_map[go]['go_class_prop']
            qid = self.go_prop_map[go]['qid']
            self.statements.append(PBB_Core.WDItemID(
                value=qid, prop_nr=prop_nr, qualifiers=qualifiers,
                references=[self.create_reference()]))

        for pdb in pdb_to_go[self.uniprot]['pdb']:
            self.statements.append(PBB_Core.WDString(
                value=pdb.upper(), prop_nr='P638',
                references=[self.create_reference()]))

    self.statements.append(PBB_Core.WDItemID(
        value='Q8054', prop_nr='P279', references=[self.create_reference()]))

    if self.entrez_quid != '':
        self.statements.append(PBB_Core.WDItemID(
            value=self.entrez_quid, prop_nr='P702',
            references=[self.create_reference()]))

    current_taxonomy_id = self.taxon_map[self.tax_id]
    self.statements.append(PBB_Core.WDItemID(
        value=current_taxonomy_id, prop_nr='P703',
        references=[self.create_reference()]))

    self.statements.append(PBB_Core.WDString(
        value=self.uniprot, prop_nr='P352', references=[self.create_reference()]))

    # remove all Wikidata properties where no data has been provided, but are handled by the bot
    all_stmnt_props = list(map(lambda x: x.get_prop_nr(), self.statements))
    for pr in ['P680', 'P681', 'P682', 'P705', 'P637', 'P702']:
        if pr not in all_stmnt_props:
            self.statements.append(PBB_Core.WDBaseDataType.delete_statement(prop_nr=pr))

    try:
        new_msg = ''
        if self.uniprot_qid != '':
            wd_item = PBB_Core.WDItemEngine(wd_item_id=self.uniprot_qid,
                                            domain='proteins', data=self.statements)
        else:
            wd_item = PBB_Core.WDItemEngine(item_name=self.label,
                                            domain='proteins', data=self.statements)
            new_msg = 'new protein created'

        wd_item.set_label(self.label)
        wd_item.set_description(self.description)
        wd_item.set_aliases(aliases=self.aliases, append=False)

        self.uniprot_qid = wd_item.write(self.login)

        if self.entrez_quid != '':
            encodes = PBB_Core.WDItemID(value=self.uniprot_qid, prop_nr='P688',
                                        references=[self.create_reference()])
            gene_item = PBB_Core.WDItemEngine(wd_item_id=self.entrez_quid,
                                              data=[encodes], append_value=['P688'])
            gene_item.write(login)

        progress[self.uniprot] = self.uniprot_qid

        PBB_Core.WDItemEngine.log(
            'INFO',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                main_data_id='{}'.format(self.uniprot),
                exception_type='',
                message='success{}'.format(new_msg),
                wd_id=self.uniprot_qid,
                duration=time.time() - start))
        # pprint.pprint(wd_item.get_wd_json_representation())

    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                main_data_id='{}'.format(self.uniprot),
                exception_type=type(e),
                message=e.__str__(),
                wd_id=self.uniprot_qid,
                duration=time.time() - start))
        traceback.print_exc()

    print(self.label)
    print(self.aliases)
    print(self.tax_id)
def count_rib(filename):
    cont = 0
    for _, element in ET.iterparse(filename):
        if 'user' in element.attrib:
            cont = cont + 1
    return cont
def main():
    total_size = 0
    root = None
    question, topic = None, None

    with open(all_questions_file_path, 'w') as qf, open(all_topics_file_path, 'w') as tf:
        for event, elem in etree.iterparse(input_xml_file, events=('start', 'end')):
            logger.debug("event: {}, elemtag: {}".format(event, elem.tag))

            # keep track of the root element
            if event == 'start' and elem.tag == 'ystfeed':
                root = elem

            # track the data elements needed for the dataset
            if event == 'end' and elem.tag == 'subject':
                question = elem.text
            if event == 'end' and elem.tag == 'maincat':
                topic = elem.text.strip()
                # write data to file
                if is_valid(question, topic):
                    qf.write("{}\n".format(clean_text(question)))
                    tf.write("{}\n".format(topic))
                    total_size += 1

            # when a data instance is completely read, clear root
            if event == 'end' and elem.tag == 'vespaadd':
                root.clear()

    logger.info("{} questions read".format(total_size))

    val_size = int(dev_proportion * total_size)
    test_size = int(test_proportion * total_size)
    logger.info("total_size: {}".format(total_size))
    logger.info("val_size: {}".format(val_size))
    logger.info("test_size: {}".format(test_size))

    logger.info("Creating dataset splits ...")
    indices = list(range(total_size))
    random.shuffle(indices)
    val_indices = set(indices[:val_size])
    test_indices = set(indices[val_size:val_size + test_size])

    with open(all_questions_file_path) as qf, open(all_topics_file_path) as tf, \
            open(val_questions_file_path, 'w') as qfval, open(val_topics_file_path, 'w') as tfval, \
            open(test_questions_file_path, 'w') as qftest, open(test_topics_file_path, 'w') as tftest, \
            open(train_questions_file_path, 'w') as qftrain, open(train_topics_file_path, 'w') as tftrain:
        for index, (question, topic) in enumerate(zip(qf, tf)):
            if index in val_indices:
                qfile = qfval
                tfile = tfval
            elif index in test_indices:
                qfile = qftest
                tfile = tftest
            else:
                qfile = qftrain
                tfile = tftrain
            qfile.write("{}".format(question))
            tfile.write("{}".format(topic))

    logger.info("Finished data preprocessing")
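# main() above depends on helpers that are not part of this listing. The
# following is a minimal sketch of what they might look like; both function
# bodies are assumptions made for illustration, not the original code.
def is_valid(question, topic):
    # keep only instances where both the question and its topic are present
    return question is not None and topic is not None and question.strip() != '' and topic != ''

def clean_text(text):
    # collapse internal whitespace/newlines so each question fits on one line
    return ' '.join(text.split())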
def __init__(self, handle):
    self.xml_iter = iter(ElementTree.iterparse(handle, events=('start', 'end')))
    self._meta, self._fallback = self._parse_preamble()
def read_doc_input(inputxml, inputparsed, outputfile):
    '''
    input: input document xml file and Stanford CoreNLP output
    output: 1. a new xml file of split sentences
            2. a txt file with one sentence per line for word segmentation in the later step
    '''
    # read the input xml file and store documents in a dictionary.
    # the key of the dictionary is the text part of the document; the value is the
    # information about the document, e.g. date, id
    docdict = {}
    doctexts = []
    output = []
    tree = ET.iterparse(inputxml)
    for event, elem in tree:
        if event == "end" and elem.tag == "Article":
            story = elem
            # check to make sure all the proper XML attributes are included
            attribute_check = [key in story.attrib
                               for key in ['date', 'id', 'mongoId', 'sentence', 'source']]
            if not all(attribute_check):  # original tested the list itself, which is always truthy
                print('Need to properly format your XML...')
                break
            entry_id = story.attrib['id']
            mongoid = story.attrib['mongoId']
            date = story.attrib['date']
            date = date[0:date.find("T")].replace("-", "")
            sentence = story.attrib['sentence']
            source = story.attrib['source']
            text = story.find('Text').text
            if text is None:
                text = ""
            else:
                text = text.replace('\n', ' ').strip()
            if entry_id in docdict:
                print('id must be unique, this article is in document dictionary :' + entry_id)
                break
            docdict[text] = {'id': entry_id, 'date': date, 'mongoid': mongoid,
                             'sentence': sentence, 'source': source, 'text': text}
            doctexts.append(text)
            elem.clear()

    # read the Stanford CoreNLP parsed file and join wrapped sentences into single lines
    parsed = open(inputparsed)
    parsedfile = parsed.readlines()
    parsedlines = []
    i = 0
    while i < len(parsedfile):
        line = parsedfile[i]
        if "Sentence #" in line:
            i = i + 1
            continue
        elif not line.startswith('['):
            temp = line
            i = i + 1
            line = parsedfile[i]
            while not line.startswith('['):
                temp = temp + line
                i = i + 1
                line = parsedfile[i]
            parsedlines.append(temp.replace('\n', ' ').strip())
        i = i + 1

    # match the CoreNLP parsed sentences back to the documents in the input xml file
    # (note: doc.encode('UTF-8').find(line) assumes Python 2 string semantics)
    sents_dict = {}
    sents = []
    sentidx = 1
    processed = 0
    for line in parsedlines:
        doc = doctexts[0]
        # undo the XML entity escaping so the sentence can be located in the raw text
        line = line.replace("&gt;", ">").replace("&lt;", "<").replace("&amp;", "&")
        if doc.encode('UTF-8').find(line) == -1:
            doctexts.remove(doc)
            sentidx = 1
            doc = doctexts[0]
        if doc.encode('UTF-8').find(line) != -1:
            key = docdict[doc]['id'] + "#" + line
            sents.append(key)
            output.append(line + "\n")
            sents_dict[key] = {}
            sents_dict[key]['sentence_id'] = str(sentidx)
            sents_dict[key].update(docdict[doc])
            sentidx = sentidx + 1
        processed = processed + 1

    create_sentence_xml(sents, sents_dict, inputxml + "-sent.xml")

    ofile = open(outputfile, 'w')
    for line in output:
        ofile.write(line)
    ofile.close()
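# create_sentence_xml() is called above but not included in this listing. A
# minimal sketch of what it could look like, inferred from how sents and
# sents_dict are built; the element and attribute names are assumptions made
# purely for illustration.
def create_sentence_xml(sents, sents_dict, outputxml):
    root = ET.Element('Sentences')
    for key in sents:
        info = sents_dict[key]
        sent_elem = ET.SubElement(root, 'Sentence', {
            'id': info['id'],
            'sentence_id': info['sentence_id'],
            'date': info['date'],
            'source': info['source'],
        })
        # the sentence text is everything after the "docid#" prefix of the key
        sent_elem.text = key.split('#', 1)[1]
    ET.ElementTree(root).write(outputxml, encoding='utf-8')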
def iterparse(source, events=('end', ), remove_comments=True, **kw):
    """Thin wrapper around ElementTree.iterparse"""
    # note: remove_comments is accepted but not used by this wrapper, and passing
    # an explicit parser to ElementTree.iterparse has been deprecated since
    # Python 3.4, so this call targets older releases
    return ElementTree.iterparse(source, events, SourceLineParser(), **kw)
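# Usage sketch for the wrapper above. 'records.xml' is a hypothetical file, and
# SourceLineParser is assumed to be defined elsewhere in the same module.
for event, elem in iterparse('records.xml'):
    # default events=('end',), so each element arrives once it is fully parsed
    print(elem.tag)
    elem.clear()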
def process_file_with_policy(file_name, policy_id, original_name):
    count = 0
    datadict = {}
    specialdict = {}
    valuedict = {}
    highdict = {}
    lowdict = {}
    newst = ''
    new_key = ''

    policy = Policy.objects.get(id=policy_id)
    operations = Operation.objects.filter(policy=policy)
    for op in operations:
        if op.op_name == 'average':
            datadict[op.signal_name] = 0
        elif op.op_name == 'exceeds':
            specialdict[op.signal_name] = 0
            valuedict[op.signal_name] = op.cut_off_number
        else:
            new_key = op.signal_name + "-" + op.second_signal_name
            datadict[new_key] = 0
            highdict[op.signal_name] = 0
            lowdict[op.second_signal_name] = 0

    yhigh = 0
    ylow = 0
    for event, elem in ET.iterparse(file_name, events=('start', 'end')):
        if event == 'start':
            if elem.tag == 'frame':
                count += 1
        if event == 'end':
            key = elem.get("key")
            if key is not None:
                # read the value once so every operation below can use it
                value = elem.get("value")
                if key in datadict:
                    datadict[key] += float(value)
                if key in specialdict and float(value) > valuedict[key]:
                    specialdict[key] += 1
                if key in highdict:
                    yhigh = float(value)
                if key in lowdict:
                    ylow = float(value)
                    diff = abs(yhigh - ylow)
                    datadict[new_key] += diff
            elem.clear()

    result_name = original_name + ".xml"
    current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    current_time = datetime.strptime(current_time_str, "%Y-%m-%d %H:%M:%S")
    new_result = Result(filename=result_name,
                        policy_id=policy_id,
                        policy_name=policy.policy_name,
                        processed_time=current_time,
                        task_id=1,
                        status=True)
    new_result.save()
    result = Result.objects.get(filename=result_name, processed_time=current_time_str)

    for k in datadict.keys():
        v = datadict[k]
        ave = v / count
        new_row = Row(result=result,
                      signal_name=k,
                      result_number=ave,
                      op_name='average')
        new_row.save()

    for k, v in specialdict.items():
        new_row = Row(result=result,
                      signal_name=k,
                      result_number=v,
                      op_name='exceeded (out of {0} frames)'.format(count),
                      cut_off_number=valuedict[k])
        new_row.save()

    return result
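# A self-contained sketch of the per-frame averaging performed above, with the
# Django models stripped out so it can be run against a plain XML file. The
# file name 'frames.xml' and the <frame> layout with key/value attributes are
# assumptions made only for illustration; ET is the same ElementTree alias used
# throughout this listing.
def average_signals(file_name, signal_names):
    totals = {name: 0.0 for name in signal_names}
    frame_count = 0
    for event, elem in ET.iterparse(file_name, events=('start', 'end')):
        if event == 'start' and elem.tag == 'frame':
            frame_count += 1
        if event == 'end':
            key = elem.get('key')
            if key in totals:
                totals[key] += float(elem.get('value'))
            elem.clear()
    if frame_count == 0:
        return {}
    return {name: total / frame_count for name, total in totals.items()}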