def fix_broken_xml(path=AUTOSAVE_FILE):
    """
    Strip unwanted characters from the xml file and parse it again.

    The file at ``path`` is read as UTF-8, passed through
    ``remove_control_characters`` and written back in place. If the cleaned
    file still cannot be parsed, the original content is handed to
    ``backup_autosave_file``, ``create_empty_autosave`` is called and
    ``path`` is parsed a second time.

    :param path: location of the xml file (defaults to ``AUTOSAVE_FILE``).
    :return: the parsed ``ElementTree``.
    """
    with io.open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    safe_string = remove_control_characters(content)

    # Write with the same explicit encoding that was used for reading;
    # the platform default may not be UTF-8 and would corrupt or reject
    # non-ASCII characters.
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(safe_string)

    try:
        parser = ETree.parse(path, ETree.XMLParser(encoding="utf-8"))
    except ETree.ParseError:
        print('Fatal Error with xml structure. A backup of your autosave has been made.')
        backup_autosave_file(path, content)
        create_empty_autosave()
        # Use a fresh parser instance for the second attempt.
        parser = ETree.parse(path, ETree.XMLParser(encoding="utf-8"))

    print(parser)
    return parser
def request_from_api(self, page_number=1):
    """Request a page from the API and, for XML responses, merge all pages.

    The request itself is issued by ``__make_request``. For an ``'xml'``
    response the raw response is parsed and its root stored in
    ``__xml_object``. When the document contains ``t:pagination`` elements,
    the remaining pages (2..total_pages) are requested as well and the
    children of each page's payload element are appended to a deep copy of
    the first page's payload element, which then replaces ``__xml_object``.

    :param page_number: page to request first (default 1).
    :return: ``True`` for XML, ``'binary'``, ``'png'`` and ``'csv'``
        responses; falls through (implicit ``None``) for any other
        response type.
    """
    try:
        self.__make_request(page_number)
    except:
        # Re-raises unchanged; the try/except does not alter behavior.
        raise
    if self.__response_type == 'xml':
        # An empty or missing body is treated as success with nothing to parse.
        if self.__raw_response == '' or self.__raw_response is None or len(
                self.__raw_response) == 0:
            return True
        utf8_parser = etree.XMLParser(encoding='UTF-8')
        sio = BytesIO(self.__raw_response)
        xml = etree.parse(sio, parser=utf8_parser)
        # Set the XML object to the first returned. Will be replaced if there is pagination
        self.__xml_object = xml.getroot()

        for pagination in xml.findall('.//t:pagination', namespaces=self.ns_map):
            # page_number = int(pagination.get('pageNumber'))
            page_size = int(pagination.get('pageSize'))
            total_available = int(pagination.get('totalAvailable'))
            # Number of pages needed to cover all available items.
            total_pages = int(
                math.ceil(float(total_available) / float(page_size)))
            full_xml_obj = None
            # Keep the last child of the root whose tag is not 'pagination'.
            # NOTE(review): the lookup above uses the namespaced
            # 't:pagination' while this compares against the bare string
            # 'pagination' - confirm the tags seen here are un-namespaced.
            for obj in xml.getroot():
                if obj.tag != 'pagination':
                    full_xml_obj = obj
            combined_xml_obj = copy.deepcopy(full_xml_obj)

            if total_pages > 1:
                for i in range(2, total_pages + 1):
                    self.__make_request(i)  # Get next page
                    utf8_parser2 = etree.XMLParser(encoding='utf-8')
                    xml = etree.parse(BytesIO(self.__raw_response), parser=utf8_parser2)
                    for obj in xml.getroot():
                        if obj.tag != 'pagination':
                            full_xml_obj = obj
                    # This is the actual element, now need to append a copy to the big one
                    # NOTE(review): the elements themselves are appended,
                    # not copies - confirm that is intended.
                    for e in full_xml_obj:
                        combined_xml_obj.append(e)

            self.__xml_object = combined_xml_obj
        self.log_debug("Logging the combined xml object")
        self.log_debug(
            etree.tostring(self.__xml_object, encoding='utf-8').decode('utf-8'))
        self.log("Request succeeded")
        return True
    elif self.__response_type in ['binary', 'png', 'csv']:
        self.log('Non XML response')
        return True
def parse_summary(folder_path, num_files, allowed_keys):
    """Collect the text of ``<SUM>`` entries from ``perdocs`` files.

    Walks ``folder_path`` and, for every directory containing a file named
    ``perdocs``, wraps the file content in a synthetic ``<root>`` element,
    parses it and records the text of each ``<SUM>`` element whose
    ``DOCREF`` attribute is in ``allowed_keys``.

    :param folder_path: directory tree to search.
    :param num_files: unused; kept for interface compatibility.
    :param allowed_keys: iterable of DOCREF values to keep.
    :return: dict mapping DOCREF to the ``<SUM>`` element text.
    """
    summary_file = "perdocs"
    # Normalise to a set so the completeness check below works whatever
    # container type the caller passed (comparing a keys view with a list
    # is never equal).
    wanted = set(allowed_keys)
    raw_texts = {}

    for dirpath, _dirs, files in os.walk(folder_path):
        if summary_file not in files:
            continue
        with open(os.path.join(dirpath, summary_file)) as f:
            # The file holds a sequence of sibling elements; add a wrapper
            # so that it forms a single well-formed document.
            chunks = itertools.chain(['<root>'], f, ['</root>'])
            parser = ET.XMLParser(encoding="us-ascii")
            document = ET.fromstringlist(chunks, parser=parser)

        for text_tag in document.findall("SUM"):
            docref = text_tag.get("DOCREF")
            if docref in wanted:
                raw_texts[docref] = text_tag.text

        # Stop walking once every requested key has been found.
        if wanted.issubset(raw_texts):
            break

    return raw_texts
def downloadXMLTV():
    """Download the XMLTV data and store a re-serialised copy for epgimport.

    Old ``jmx.<name>.xmltv.xml`` / ``jmx.<name>.xmltv2.xml`` files for both
    the current and the previous name are purged first. If
    ``downloads.checkGZIP`` returns data, it is written to
    ``jmx.<name>.xmltv.xml``, parsed, and written again as
    ``jmx.<name>.xmltv2.xml`` with an XML declaration.
    """
    # Replace characters that are unsafe in file names and collapse runs
    # of underscores, for both the current and the previous name.
    current = re.sub(r'[\<\>\:\"\/\\\|\?\*]', '_', str(jglob.name))
    current = re.sub(r'_+', '_', re.sub(r' ', '_', current))

    previous = re.sub(r'[\<\>\:\"\/\\\|\?\* ]', '_', str(jglob.old_name))
    previous = re.sub(r'_+', '_', re.sub(r' ', '_', previous))

    for suffix in ('.xmltv.xml', '.xmltv2.xml'):
        for name in (current, previous):
            jfunc.purge('/etc/epgimport', 'jmx.' + str(name) + suffix)

    epgpath = '/etc/epgimport/' + 'jmx.' + str(current) + '.xmltv.xml'

    response = downloads.checkGZIP(jglob.xmltv_address)
    if response is None:
        return

    with open(epgpath, 'w') as f:
        f.write(response)

    with open(epgpath, 'r') as f:
        tree = ET.parse(f, parser=ET.XMLParser(encoding='utf-8'))

    tree.write('/etc/epgimport/' + 'jmx.' + str(current) + '.xmltv2.xml',
               encoding='utf-8', xml_declaration=True)
def _get_herd_email(self, herd):
    """Look up the email address of a herd.

    The herds file is parsed on first use and cached in
    ``self._herdstree``.

    @type herd: str
    @param herd: herd whose email you want
    @rtype: str or None
    @return: email address, or None if the herd is a special unlisted
        herd, is not found, or the herds file could not be loaded
        (ImportError, IOError and SyntaxError are caught)
    """
    if self._herdstree is None:
        try:
            self._herdstree = etree.parse(
                _unicode_encode(self._herds_path,
                                encoding=_encodings['fs'], errors='strict'),
                parser=etree.XMLParser(target=_MetadataTreeBuilder()))
        except (ImportError, IOError, SyntaxError):
            return None

    # Some special herds are not listed in herds.xml
    unlisted = ('no-herd', 'maintainer-wanted', 'maintainer-needed')
    if herd in unlisted:
        return None

    herds_tree = self._herdstree
    try:
        # Python 2.7 or >=3.2
        walk = herds_tree.iter
    except AttributeError:
        walk = herds_tree.getiterator

    for herd_node in walk('herd'):
        if node_matches(herd_node, herd) if False else herd_node.findtext('name') == herd:
            return herd_node.findtext('email')
def load_instances(f):
    '''
    Read the WSD cases from an XML file and split them into two dicts.

    Texts whose id starts with 'd001' go to the first (dev) dict, all other
    texts to the second (test) dict. Keys are instance ids, values are
    WSDInstance objects built from the id, the lemma, the lemmas of the
    whole sentence and the position of the instance in the sentence.
    '''
    root = ET.parse(f, ET.XMLParser(encoding="utf-8")).getroot()
    dev_instances, test_instances = {}, {}

    for text in root:
        is_dev = text.attrib['id'].startswith('d001')
        bucket = dev_instances if is_dev else test_instances

        for sentence in text:
            # lemmas of every element in the sentence form the context
            context = [token.attrib['lemma'] for token in sentence]
            for position, token in enumerate(sentence):
                if token.tag != 'instance':
                    continue
                my_id = token.attrib['id']
                lemma = token.attrib['lemma']
                bucket[my_id] = WSDInstance(my_id, lemma, context, position)

    return dev_instances, test_instances
def run(self, test_run_factory, **kwargs):
    r"""
    Entry point to start parsing the stream with which the parser was
    initialized.

    :param test_run_factory: A class from which to instantiate a "test_run"
        object whose add\*/set\* methods will be called as elements are
        found in the stream
    :returns: a SubmissionResult instance. This is not really used and
        seems redundant, as the data will be processed and stored by the
        TestRun instance (which is, however, also not returned anywhere).
    :raises AssertionError: if the root element is not ``<system>``.
    """
    # The docstring is a raw string so that the reST-style "\*" escapes are
    # not treated as (invalid) Python string escapes.
    parser = etree.XMLParser()
    tree = etree.parse(self.file, parser=parser)
    root = tree.getroot()
    if root.tag != "system":
        raise AssertionError(
            "Unexpected tag <%s> at root, expected <system>" % root.tag)
    result = SubmissionResult(test_run_factory, **kwargs)
    self.parseRoot(result, root)
    return result
def obfuscate(self, obfuscation_info: Obfuscation):
    """Rewrite the manifest file in place.

    The manifest is parsed, passed through ``remove_xml_duplicates``,
    ``scramble_xml_element`` and ``indent_xml`` (in that order) and written
    back. Any exception is logged and re-raised. The obfuscator's class
    name is always appended to ``obfuscation_info.used_obfuscators``.
    """
    obfuscator_name = self.__class__.__name__
    self.logger.info('Running "{0}" obfuscator'.format(obfuscator_name))

    try:
        # Change default namespace.
        Xml.register_namespace(
            "obfuscation", "http://schemas.android.com/apk/res/android")

        utf8_parser = Xml.XMLParser(encoding="utf-8")
        tree = Xml.parse(obfuscation_info.get_manifest_file(),
                         parser=utf8_parser)
        root = tree.getroot()

        self.remove_xml_duplicates(root)
        self.scramble_xml_element(root)
        self.indent_xml(root)

        # Write the changes into the manifest file.
        tree.write(obfuscation_info.get_manifest_file(), encoding="utf-8")
    except Exception as e:
        self.logger.error(
            'Error during execution of "{0}" obfuscator: {1}'.format(
                obfuscator_name, e))
        raise
    finally:
        obfuscation_info.used_obfuscators.append(obfuscator_name)
def fromstring(self, string):
    '''Replace the root of the wrapped tree with the parsed XML string.

    Returns self so that calls can be chained, e.g.
    ParseTree().fromstring(..).
    '''
    xml_parser = ElementTreeModule.XMLParser()
    xml_parser.feed(string)
    self._etree._setroot(xml_parser.close())
    return self
def initfromfile(self):
    """Parse the file named in ``ta_info`` and build ``self.root`` from it.

    Sets ``ta_info['attributemarker']`` to ``'__'``, parses the file with
    ElementTree and converts the resulting root via ``_etree2botstree``.
    """
    source_path = botslib.abspathdata(self.ta_info['filename'])
    self.ta_info['attributemarker'] = '__'

    # ElementTree: lexes, parses, makes etree; etree is quite similar to
    # bots-node trees but conversion is needed
    xml_parser = ET.XMLParser()
    parsed_root = ET.ElementTree().parse(source_path, xml_parser)

    # convert etree to bots-nodes-tree
    self.root = self._etree2botstree(parsed_root)
def svgMerge(box, inkscape, output):
    """Merge the top-level elements of one SVG into another and write it.

    Every child of the root of ``box`` is appended to the root of
    ``inkscape``. Children whose tag ends with ``"g"`` get a ``transform``
    matrix that rescales them from the source's ticks-per-mm to the
    destination's and offsets them based on the source view box and the
    destination height.

    :param box: source SVG (anything ``ElementTree.parse`` accepts).
    :param inkscape: destination SVG to merge into.
    :param output: where the merged document is written.
    """
    parser = ElementTree.XMLParser(remove_blank_text=True)

    src_tree = ElementTree.parse(box, parser)
    dest_tree = ElementTree.parse(inkscape, parser)
    dest_root = dest_tree.getroot()

    _dest_width, dest_height = getSizeInMM(dest_tree)

    src_scale_x, src_scale_y = ticksPerMM(src_tree)
    dest_scale_x, dest_scale_y = ticksPerMM(dest_tree)

    scale_x = dest_scale_x / src_scale_x
    scale_y = dest_scale_y / src_scale_y

    src_view = getViewBox(src_tree)

    off_x = src_view[0] * -scale_x
    off_y = (src_view[1] + src_view[3]) * -scale_y + dest_height * scale_y

    # The transform is identical for every element, so build it once.
    transform = "matrix(%f,0,0,%f, %f, %f)" % (
        scale_x, scale_y, off_x, off_y)

    # Iterate over a snapshot: appending a child to another tree may move
    # it out of the source root while that root is being iterated.
    for el in list(src_tree.getroot()):
        dest_root.append(el)
        if el.tag.endswith("g"):
            el.set("transform", transform)

    # write the xml file
    ElementTree.ElementTree(dest_root).write(
        output, pretty_print=True, encoding='utf-8', xml_declaration=True)
def load_booking_reviews():
    """
    Load the reviews stored in ``booking.xml`` inside ``DATA_DIR``.

    :return: list of ``Review`` objects; text of ``Positivereview``
        elements is marked ``REVIEW_MARK.POSITIVE`` and text of
        ``Negativereview`` elements ``REVIEW_MARK.NEGATIVE``. Elements
        without text are skipped.
    """
    import xml.etree.cElementTree as ET

    BOOKING_XML_FILE = os.path.join(DATA_DIR, "booking.xml")
    utf8_parser = ET.XMLParser(encoding="utf-8")
    root = ET.parse(BOOKING_XML_FILE, parser=utf8_parser).getroot()

    mark_for_tag = {
        "Positivereview": REVIEW_MARK.POSITIVE,
        "Negativereview": REVIEW_MARK.NEGATIVE,
    }

    reviews = []
    for item in root:
        for child in item:
            if child.tag not in mark_for_tag:
                continue
            review_text = child.text
            if review_text is None:
                continue
            # make sure the text is a unicode object
            if not isinstance(review_text, unicode):
                review_text = review_text.decode('utf-8')
            reviews.append(Review(review_text, mark_for_tag[child.tag]))
    return reviews
def _getXmlDocumentItem(self, fileName):
    """Return the (cached) ``XmlDocItem`` for ``<dbFolder>/<fileName>.xml``.

    The file is parsed with each candidate encoding in turn until one
    succeeds. Successful results are stored in ``self.xmlCache``; if every
    attempt fails, an uncached ``XmlDocItem(None)`` is returned.

    :param fileName: file name without the ``.xml`` extension.
    :return: an ``XmlDocItem`` wrapping the parsed document or ``None``.
    """
    filePath = '%s/%s.xml' % (self._dbFolder, fileName)

    cached = self.xmlCache.get(filePath)
    if cached:
        return cached

    doc = None
    # try different encoding and configurations
    for docEncoding in ('utf-8', 'iso-8859-5'):
        try:
            doc = ET.parse(filePath, parser=ET.XMLParser(encoding=docEncoding))
        except Exception:
            # Was a bare ``except:``, which also swallowed
            # KeyboardInterrupt and SystemExit.
            print('parse %s failed. encoding = %s' % (fileName, docEncoding))
            continue
        print('parse %s success. encoding = %s' % (fileName, docEncoding))
        break

    if doc is None:
        print('parse %s failed' % (fileName, ))
        return XmlDocItem(None)

    xmlDoc = XmlDocItem(doc)
    self.xmlCache[filePath] = xmlDoc
    return xmlDoc
def read_news_from_xml_file():
    """Return a generator over the item descriptions in files/newsafr.xml.

    :return: generator yielding the text of every
        ``channel/item/description`` element (``None`` for empty elements).
    """
    # xml.etree.cElementTree was removed in Python 3.9; fall back to the
    # regular module, which exposes the same API.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    parser = ET.XMLParser(encoding='utf-8')
    tree = ET.parse("files/newsafr.xml", parser)
    root = tree.getroot()
    news_description_generator = (
        x.text for x in root.findall('channel/item/description'))
    return news_description_generator
def extract_single(xml_path, html_path, md_path):
    """Process one XML file: print some diagnostics, then extract its text.

    Prints the file path, the tag of the first child of the root and every
    title returned by ``extract_titles(html_path)``, then calls
    ``recursively_extract_text`` with the root, the titles and ``md_path``.
    """
    print('File: ', xml_path)

    utf8_parser = ET.XMLParser(encoding='utf-8')
    root = ET.parse(xml_path, parser=utf8_parser).getroot()
    print(root[0].tag)

    titles = extract_titles(html_path)
    for key, title in titles.items():
        print(key, ' ', title)

    recursively_extract_text(root, titles, md_path)
def _xml_get(self, url, **params):
    """GET ``endpoint + url`` asking for XML and return the parsed root.

    ``params`` are passed as query parameters. ``raise_for_status`` is
    called on the response before its text is parsed.
    """
    full_url = '%s%s' % (self.endpoint, url)
    response = self.session.get(full_url,
                                headers={'Accept': 'application/xml'},
                                params=params)
    response.raise_for_status()

    xml_parser = etree.XMLParser(encoding='utf-8')
    payload = response.text.encode('utf-8')
    xml_parser.feed(payload)
    return xml_parser.close()
def fromstringlist(sequence, parser=None):
    """
    Parse an XML document given as a sequence of string fragments.

    Taken from Python2.7 source. Each fragment is fed to ``parser`` (a
    default tree-building parser is created when none is given) and the
    result of closing the parser is returned.
    """
    active_parser = parser if parser else ET.XMLParser(target=ET.TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()
def pretty_xmlfile(fname):
    """Re-write the XML file ``fname`` in place, pretty-printed.

    The file is parsed with lxml (entities resolved, CDATA stripped, blank
    text kept) and serialised back as UTF-8 with an XML declaration.

    :param fname: path of the XML file to reformat.
    """
    from lxml import etree as ET

    parser = ET.XMLParser(
        remove_blank_text=False, resolve_entities=True, strip_cdata=True)
    xmlfile = ET.parse(fname, parser)
    pretty_xml = ET.tostring(
        xmlfile, encoding='UTF-8', xml_declaration=True, pretty_print=True)

    # tostring() with an explicit encoding returns bytes, so the file has
    # to be opened in binary mode; the context manager closes it even if
    # the write fails.
    with open(fname, "wb") as out:
        out.write(pretty_xml)
def parse_indexdirs(filein):
    """Parse ``filein`` and return a dict of Scan objects, indexed by pfx.

    One Scan is built from each child of the root element; a later child
    with the same pfx replaces an earlier one.
    """
    utf8_parser = ET.XMLParser(encoding="utf-8")
    root = ET.parse(filein, parser=utf8_parser).getroot()
    scans = (Scan(entry) for entry in root)
    return dict((rec.pfx, rec) for rec in scans)
def load_file():
    """Return the item descriptions found in files/newsafr.xml.

    :return: list with the text of every ``channel/item/description``
        element (``None`` for empty elements).
    """
    # xml.etree.cElementTree was removed in Python 3.9; fall back to the
    # regular module, which exposes the same API.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    parser = ET.XMLParser(encoding='utf-8')
    tree = ET.parse("files/newsafr.xml", parser)
    root = tree.getroot()
    news_data = [x.text for x in root.findall('channel/item/description')]
    return news_data
def parse_xml(path):
    """Parse ``path``, retrying with alternate encodings on parse errors.

    The first attempt uses no explicit encoding; the entries of
    ``ALT_ENCODINGS`` are tried afterwards. Returns the parsed tree, or
    None when every attempt raises ``ET.ParseError``.
    """
    candidates = [None] + ALT_ENCODINGS[:]
    for candidate in candidates:
        attempt_parser = ET.XMLParser(encoding=candidate)
        try:
            tree = ET.parse(path, parser=attempt_parser)
        except ET.ParseError:
            continue
        return tree
    return None
def recvXML(self):
    """Read one xml object from the connection and return its handler.

    Data is read in chunks of up to 1024 bytes and fed to an incremental
    parser whose target is a fresh ``self.XMLHandler()``. Reading stops as
    soon as the handler reports ``done_parsing``, or when the connection
    yields no more data.

    :return: the handler (parser target) that received the parse events.
    :raises ET.ParseError: from ``parser.close()`` when the data received
        so far does not form a complete document.
    """
    target = self.XMLHandler()
    parser = ET.XMLParser(target=target)
    while not target.done_parsing:
        chunk = self.connection.recv(1024)
        if not chunk:
            # No data: assumes a socket-like connection, where an empty
            # read means the peer closed it. Without this check the loop
            # would spin forever.
            break
        parser.feed(chunk)
    parser.close()
    if target.error != 0:
        print("Got error number {:} from WaveLabs software: {:}".format(target.error, target.error_message))
    return target
def autosave_can_be_parsed():
    """
    Return True if ``AUTOSAVE_FILE`` can be parsed as xml, False if the
    attempt raises any exception.
    """
    utf8_parser = ETree.XMLParser(encoding="utf-8")
    try:
        ETree.parse(AUTOSAVE_FILE, utf8_parser)
    except Exception:
        return False
    return True
def xml_to_text(file_path):
    """Return the long words found in the item descriptions of an XML file.

    :param file_path: path or file object of the XML document.
    :return: list of lower-cased words longer than six characters, taken
        in document order from every ``channel/item/description`` element.
    """
    parser = ET.XMLParser(encoding="utf-8")
    root = ET.parse(file_path, parser).getroot()
    return [
        word
        for description in root.findall("channel/item/description")
        # An empty <description/> has text None; treat it as no words.
        for word in (description.text or "").lower().split()
        if len(word) > 6
    ]
def run(self, test_run_factory, **kwargs):
    """Parse ``self.file`` and hand its ``<system>`` root to ``parseRoot``.

    :param test_run_factory: passed on to ``SubmissionResult``.
    :returns: the ``SubmissionResult`` that was given to ``parseRoot``.
    :raises AssertionError: if the root element is not ``<system>``.
    """
    root = etree.parse(self.file, parser=etree.XMLParser()).getroot()

    if root.tag == "system":
        submission = SubmissionResult(test_run_factory, **kwargs)
        self.parseRoot(submission, root)
        return submission

    raise AssertionError(
        "Unexpected tag <%s> at root, expected <system>" % root.tag)
def encrypt_string_resources(self, string_resources_xml_file: str,
                             string_names_to_encrypt: Set[str]):
    """Encrypt selected ``<string>`` values of a resource file in place.

    Every ``<string>`` element that has a non-empty name contained in
    ``string_names_to_encrypt`` and a non-empty text gets its text replaced
    by ``self.encrypt_string(text)``. The file is then written back as
    UTF-8.
    """
    tree = Xml.parse(string_resources_xml_file,
                     parser=Xml.XMLParser(encoding='utf-8'))

    for node in tree.iter('string'):
        name = node.get('name', None)
        value = node.text
        if not (name and value and name in string_names_to_encrypt):
            continue
        node.text = self.encrypt_string(value)

    tree.write(string_resources_xml_file, encoding='utf-8')
def get_elements(self):
    """
    Get and parse the xml named by ``rss_rules['uri']``.

    A uri starting with ``http`` is opened with ``urlopen``; anything else
    is handed to the parser directly (e.g. a local path).

    :return: list of all elements matching ``rule['elements']`` anywhere
        in the document
    """
    uri = self.rss_rules.get('uri')
    parser = etree.XMLParser(encoding='utf-8')

    if uri.startswith('http'):
        response = urlopen(uri)
        try:
            tree = etree.parse(response, parser)
        finally:
            # Release the connection; it was previously left open.
            response.close()
    else:
        tree = etree.parse(uri, parser)

    element = ".//{0}".format(self.rule.get('elements'))
    return tree.getroot().findall(element)
def importminidump(filename, cur):
    """Stream a bz2-compressed XML dump through a ``MinidumpCB`` parser.

    The file is decompressed and fed to the parser in 64 KiB chunks.

    :param filename: path of the bz2-compressed dump.
    :param cur: passed to ``MinidumpCB``, the parser's target.
    """
    printupdate("Importing minidump")
    parser = ET.XMLParser(target=MinidumpCB(cur))
    fp = bz2.BZ2File(filename)
    try:
        while True:
            data = fp.read(65536)
            if not data:
                break
            parser.feed(data)
        parser.close()
    finally:
        # Close the file even when reading or parsing raises.
        fp.close()
def encrypt_string_array_resources(self, string_array_resources_xml_file: str,
                                   string_array_names_to_encrypt: Set[str]):
    """Encrypt the items of selected ``<string-array>`` elements in place.

    For every ``<string-array>`` whose non-empty name is contained in
    ``string_array_names_to_encrypt``, the text of each ``<item>`` with
    non-empty text is replaced by ``self.encrypt_string(text)``. The file
    is then written back as UTF-8.
    """
    tree = Xml.parse(string_array_resources_xml_file,
                     parser=Xml.XMLParser(encoding='utf-8'))

    for array_node in tree.iter('string-array'):
        array_name = array_node.get('name', None)
        if not array_name or array_name not in string_array_names_to_encrypt:
            continue
        for entry in array_node.iter('item'):
            if entry.text:
                entry.text = self.encrypt_string(entry.text)

    tree.write(string_array_resources_xml_file, encoding='utf-8')
def parseheader(s):
    """Fill a Scan object with metadata read from its TEI header file.

    The header is read from
    ``../../<pfx>Scan/<year>/downloads/<pfx lowercased>header.xml``
    (relative to the current directory). The following attributes are set
    on ``s``: ``title``, ``dsize`` (size of digitization), ``dyear`` (year
    of digitization, copied from ``s.year``), ``authors`` (comma-joined
    non-blank author names), ``textdate`` and ``textpages``.

    :param s: a Scan object providing ``pfx`` and ``year``.
    :raises AttributeError: if one of the required elements is missing
        from the header.
    """
    pfx = s.pfx
    headerfile = "../../%sScan/%s/downloads/%sheader.xml" % (
        pfx, s.year, pfx.lower())

    # namespaces make finding harder with ET.
    # So, read the file as a string, remove namespace, then parse as string
    with codecs.open(headerfile, "r", "utf-8") as f:
        xmlstring = f.read()
    # Literal replacement: the declaration contains '.' characters that
    # must not be interpreted as regex wildcards.
    xmlstring = xmlstring.replace('xmlns="//www.tei-c.org/ns/1.0"', '')

    # Parse from memory instead of round-tripping through a shared
    # "temp.xml" in the working directory.
    parser = ET.XMLParser(encoding="utf-8")
    root = ET.fromstring(xmlstring.encode("utf-8"), parser=parser)

    s.title = root.find("teiHeader/fileDesc/titleStmt/title").text
    s.dsize = root.find("teiHeader/fileDesc/extent").text  # size of digitization
    s.dyear = s.year  # year of digitization

    author_nodes = root.findall(
        "teiHeader/fileDesc/sourceDesc/biblStruct/monogr/author")
    names = [node.text for node in author_nodes if node.text is not None]
    s.authors = ','.join([name for name in names if name.strip() != ''])

    s.textdate = root.find(
        'teiHeader/fileDesc/sourceDesc/biblStruct/monogr/imprint/date').text
    s.textpages = root.find(
        'teiHeader/fileDesc/sourceDesc/biblStruct/monogr/extent').text