Пример #1
0
def fix_broken_xml(path=AUTOSAVE_FILE):
    """Sanitise the XML file at ``path`` in place and return its parsed tree.

    The file is read as UTF-8, passed through ``remove_control_characters``
    and written back. It is then parsed; if that raises ``ETree.ParseError``
    the original (unsanitised) content is handed to ``backup_autosave_file``,
    ``create_empty_autosave`` is called and ``path`` is parsed once more.

    :param path: location of the XML file to repair.
    :return: the object returned by ``ETree.parse`` for ``path``.
    """
    with io.open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    safe_string = remove_control_characters(content)

    # Write back with the same explicit encoding that was used for reading
    # (and that is forced on the parser below). Relying on the platform
    # default encoding could corrupt or reject non-ASCII text.
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(safe_string)

    try:
        tree = ETree.parse(path, ETree.XMLParser(encoding="utf-8"))
    except ETree.ParseError:
        print('Fatal Error with xml structure. A backup of your autosave has been made.')
        backup_autosave_file(path, content)
        create_empty_autosave()
        # A parser instance cannot be reused after a failed parse.
        tree = ETree.parse(path, ETree.XMLParser(encoding="utf-8"))
        print(tree)
    return tree
    def request_from_api(self, page_number=1):
        """Request a page from the API and store the parsed result on the instance.

        The request itself is delegated to ``self.__make_request``; any
        exception it raises propagates unchanged.

        For an ``'xml'`` response type:

        * an empty or ``None`` raw response returns ``True`` immediately;
        * otherwise the response is parsed and its root element is stored in
          ``self.__xml_object``;
        * if the document contains a ``t:pagination`` element, the remaining
          pages (2..total_pages) are requested as well and the children of
          each page's selected element are appended to a deep copy of the
          first page's selected element, which then replaces
          ``self.__xml_object``.

        :param page_number: page to request first (defaults to 1).
        :return: ``True`` for an empty XML response, for a paginated XML
            response, and for ``'binary'``/``'png'``/``'csv'`` response types.
            Falls through and returns ``None`` for an XML response without a
            pagination element and for any other response type.
        """
        try:
            self.__make_request(page_number)
        except:
            # Re-raised unchanged; this handler does not alter the exception.
            raise
        if self.__response_type == 'xml':
            # Nothing to parse: treat an empty body as success.
            if self.__raw_response == '' or self.__raw_response is None or len(
                    self.__raw_response) == 0:
                return True
            utf8_parser = etree.XMLParser(encoding='UTF-8')
            sio = BytesIO(self.__raw_response)
            xml = etree.parse(sio, parser=utf8_parser)
            # Set the XML object to the first returned. Will be replaced if there is pagination
            self.__xml_object = xml.getroot()

            # The body of this loop ends with ``return True``, so only the
            # first pagination element found is ever processed.
            for pagination in xml.findall('.//t:pagination',
                                          namespaces=self.ns_map):

                # page_number = int(pagination.get('pageNumber'))
                page_size = int(pagination.get('pageSize'))
                total_available = int(pagination.get('totalAvailable'))
                # Round up so that a partially filled last page is counted.
                total_pages = int(
                    math.ceil(float(total_available) / float(page_size)))

                # Keep the LAST child of the root whose tag is not the bare
                # string 'pagination'.
                # NOTE(review): the lookup above uses the ``t:`` namespace
                # prefix while this compares against the un-namespaced tag;
                # confirm whether a namespaced tag can ever equal it.
                full_xml_obj = None
                for obj in xml.getroot():
                    if obj.tag != 'pagination':
                        full_xml_obj = obj
                # Copy so that page 1's parsed tree is not the one extended below.
                combined_xml_obj = copy.deepcopy(full_xml_obj)

                if total_pages > 1:
                    for i in range(2, total_pages + 1):

                        self.__make_request(i)  # Get next page
                        # A fresh parser is created for every page.
                        utf8_parser2 = etree.XMLParser(encoding='utf-8')
                        xml = etree.parse(BytesIO(self.__raw_response),
                                          parser=utf8_parser2)
                        for obj in xml.getroot():
                            if obj.tag != 'pagination':
                                full_xml_obj = obj
                        # This is the actual element, now need to append a copy to the big one
                        # NOTE(review): the child element itself (not a copy)
                        # is appended here.
                        for e in full_xml_obj:
                            combined_xml_obj.append(e)

                self.__xml_object = combined_xml_obj
                self.log_debug("Logging the combined xml object")
                self.log_debug(
                    etree.tostring(self.__xml_object,
                                   encoding='utf-8').decode('utf-8'))
                self.log("Request succeeded")
                return True
        elif self.__response_type in ['binary', 'png', 'csv']:
            # Non-XML payloads are not parsed here.
            self.log('Non XML response')
            return True
Пример #3
0
def parse_summary(folder_path, num_files, allowed_keys):
    """Collect summary texts from every ``perdocs`` file under a folder.

    Each ``perdocs`` file found while walking ``folder_path`` is wrapped in a
    synthetic ``<root>`` element and parsed. The text of every top-level
    ``SUM`` element whose ``DOCREF`` attribute is in ``allowed_keys`` is kept.

    :param folder_path: directory tree to search.
    :param num_files: accepted for interface compatibility; not used.
    :param allowed_keys: container of DOCREF values to keep.
    :return: dict mapping DOCREF to the text of its ``SUM`` element.
    """
    summary_name = "perdocs"
    collected = {}

    for dirpath, _subdirs, filenames in os.walk(folder_path):
        if summary_name not in filenames:
            continue

        with open(os.path.join(dirpath, summary_name)) as handle:
            # Wrap the file's lines in a synthetic <root> element before parsing.
            wrapped = itertools.chain('<root>', handle, '</root>')
            ascii_parser = ET.XMLParser(encoding="us-ascii")
            document = ET.fromstringlist(wrapped, parser=ascii_parser)

            for summary in document.findall("SUM"):
                doc_ref = summary.get("DOCREF")
                if doc_ref in allowed_keys:
                    collected[doc_ref] = summary.text

                # Stop scanning this file once the collected keys compare
                # equal to allowed_keys.
                if collected.keys() == allowed_keys:
                    break

    return collected
def downloadXMLTV():
    """Download the XMLTV data and store a re-serialised UTF-8 copy.

    The current and previous names from ``jglob`` are sanitised into
    file-name-safe strings, the existing ``jmx.<name>.xmltv.xml`` and
    ``jmx.<name>.xmltv2.xml`` entries for both are purged from
    ``/etc/epgimport``, and the payload returned by
    ``downloads.checkGZIP(jglob.xmltv_address)`` is written to
    ``jmx.<name>.xmltv.xml``. That file is then parsed and written back out as
    ``jmx.<name>.xmltv2.xml`` with an XML declaration.

    Nothing is written when ``checkGZIP`` returns ``None``.
    """
    # Each pattern in turn is replaced by an underscore.
    cleanName = str(jglob.name)
    for pattern in (r'[\<\>\:\"\/\\\|\?\*]', r' ', r'_+'):
        cleanName = re.sub(pattern, '_', cleanName)

    cleanNameOld = str(jglob.old_name)
    for pattern in (r'[\<\>\:\"\/\\\|\?\* ]', r' ', r'_+'):
        cleanNameOld = re.sub(pattern, '_', cleanNameOld)

    # Remove stale files for both the current and the previous name.
    for suffix in ('.xmltv.xml', '.xmltv2.xml'):
        for name in (cleanName, cleanNameOld):
            jfunc.purge('/etc/epgimport', 'jmx.' + str(name) + suffix)

    epgpath = '/etc/epgimport/' + 'jmx.' + str(cleanName) + '.xmltv.xml'
    response = downloads.checkGZIP(jglob.xmltv_address)
    if response is None:
        return

    with open(epgpath, 'w') as raw_out:
        raw_out.write(response)

    with open(epgpath, 'r') as raw_in:
        tree = ET.parse(raw_in, parser=ET.XMLParser(encoding='utf-8'))

    tree.write('/etc/epgimport/' + 'jmx.' + str(cleanName) + '.xmltv2.xml', encoding='utf-8', xml_declaration=True)
Пример #5
0
	def _get_herd_email(self, herd):
		"""Get a herd's email address.

		@type herd: str
		@param herd: herd whose email you want
		@rtype: str or None
		@return: email address or None if herd is not in herds.xml
		@raise IOError: if $PORTDIR/metadata/herds.xml can not be read
		"""

		if self._herdstree is None:
			try:
				self._herdstree = etree.parse(_unicode_encode(self._herds_path,
					encoding=_encodings['fs'], errors='strict'),
					parser=etree.XMLParser(target=_MetadataTreeBuilder()))
			except (ImportError, IOError, SyntaxError):
				return None

		# Some special herds are not listed in herds.xml
		if herd in ('no-herd', 'maintainer-wanted', 'maintainer-needed'):
			return None

		try:
			# Python 2.7 or >=3.2
			iterate = self._herdstree.iter
		except AttributeError:
			iterate = self._herdstree.getiterator

		for node in iterate('herd'):
			if node.findtext('name') == herd:
				return node.findtext('email')
Пример #6
0
def load_instances(f):
    '''
    Parse a WSD corpus file into development and test instances.

    Texts whose ``id`` attribute starts with ``d001`` feed the development
    dict; every other text feeds the test dict. For each ``instance`` element
    a WSDInstance is built from its id, its lemma, the lemmas of all elements
    in the same sentence, and its position in that sentence.

    Returns a tuple ``(dev_instances, test_instances)`` of dicts keyed by
    instance id.
    '''
    corpus = ET.parse(f, ET.XMLParser(encoding="utf-8")).getroot()
    dev_instances, test_instances = {}, {}

    for text in corpus:
        is_dev = text.attrib['id'].startswith('d001')
        bucket = dev_instances if is_dev else test_instances
        for sentence in text:
            # the context is the lemma of every element in the sentence
            lemmas = [token.attrib['lemma'] for token in sentence]
            for position, token in enumerate(sentence):
                if token.tag != 'instance':
                    continue
                instance_id = token.attrib['id']
                instance_lemma = token.attrib['lemma']
                bucket[instance_id] = WSDInstance(
                    instance_id, instance_lemma, lemmas, position)
    return dev_instances, test_instances
Пример #7
0
    def run(self, test_run_factory, **kwargs):
        """
        Entry point: parse the stream this parser was initialized with.

        :param test_run_factory: a class from which to instantiate a
        "test_run" object whose add*/set* methods will be called as elements
        are found in the stream. It is forwarded, together with ``kwargs``,
        to ``SubmissionResult``.

        :returns: the SubmissionResult instance that was handed to
        ``parseRoot``.

        :raises AssertionError: if the root element is not ``<system>``.
        """
        root = etree.parse(self.file, parser=etree.XMLParser()).getroot()

        if root.tag == "system":
            result = SubmissionResult(test_run_factory, **kwargs)
            self.parseRoot(result, root)
            return result

        raise AssertionError(
            "Unexpected tag <%s> at root, expected <system>" % root.tag)
Пример #8
0
    def obfuscate(self, obfuscation_info: Obfuscation):
        """Rewrite the manifest file in place after transforming its XML tree.

        The manifest returned by ``obfuscation_info.get_manifest_file()`` is
        parsed, passed through ``remove_xml_duplicates``,
        ``scramble_xml_element`` and ``indent_xml``, and written back to the
        same file. Any exception is logged and re-raised. The obfuscator's
        class name is appended to ``obfuscation_info.used_obfuscators``
        whether or not the run succeeded.
        """
        obfuscator_name = self.__class__.__name__
        self.logger.info('Running "{0}" obfuscator'.format(obfuscator_name))

        try:
            # Change default namespace.
            Xml.register_namespace(
                "obfuscation", "http://schemas.android.com/apk/res/android")

            utf8_parser = Xml.XMLParser(encoding="utf-8")
            manifest_tree = Xml.parse(
                obfuscation_info.get_manifest_file(), parser=utf8_parser)
            manifest_root = manifest_tree.getroot()

            self.remove_xml_duplicates(manifest_root)
            self.scramble_xml_element(manifest_root)
            self.indent_xml(manifest_root)

            # Write the changes into the manifest file.
            manifest_tree.write(
                obfuscation_info.get_manifest_file(), encoding="utf-8")

        except Exception as e:
            message = 'Error during execution of "{0}" obfuscator: {1}'.format(
                obfuscator_name, e)
            self.logger.error(message)
            raise

        finally:
            obfuscation_info.used_obfuscators.append(obfuscator_name)
Пример #9
0
	def fromstring(self, string):
		'''Replace the root of this tree with the element parsed from
		the XML text in C{string} and return C{self}, so that calls such
		as C{ParseTree().fromstring(..)} can be chained.
		'''
		builder = ElementTreeModule.XMLParser()
		builder.feed(string)
		self._etree._setroot(builder.close())
		return self
Пример #10
0
 def initfromfile(self):
     """Parse the XML file named in ``self.ta_info['filename']`` and store
     the converted tree in ``self.root``.

     Also sets ``self.ta_info['attributemarker']`` to ``'__'``.
     """
     xml_path = botslib.abspathdata(self.ta_info['filename'])
     self.ta_info['attributemarker'] = '__'
     # ElementTree lexes, parses and builds the etree; parse() returns its
     # root element. The etree resembles a bots-node tree but must be converted.
     document_root = ET.ElementTree().parse(xml_path, ET.XMLParser())
     self.root = self._etree2botstree(document_root)
Пример #11
0
def svgMerge(box, inkscape, output):
    """Merge the top-level elements of one SVG into another and write the result.

    Every child of the ``box`` document's root is appended to the root of the
    ``inkscape`` document. Children whose tag ends with ``"g"`` receive a
    ``transform`` matrix that rescales them from the source's ticks-per-mm to
    the destination's and offsets them using the source view box and the
    destination height.

    :param box: source SVG (anything ``ElementTree.parse`` accepts).
    :param inkscape: destination SVG that receives the source's elements.
    :param output: where the merged document is written.
    """
    # NOTE: remove_blank_text / pretty_print are lxml options, so
    # ``ElementTree`` is presumably lxml.etree here.
    parser = ElementTree.XMLParser(remove_blank_text=True)

    src_tree = ElementTree.parse(box, parser)
    dest_tree = ElementTree.parse(inkscape, parser)
    dest_root = dest_tree.getroot()

    # Only the destination height takes part in the offset computation.
    _, dest_height = getSizeInMM(dest_tree)

    src_scale_x, src_scale_y = ticksPerMM(src_tree)
    dest_scale_x, dest_scale_y = ticksPerMM(dest_tree)

    scale_x = dest_scale_x / src_scale_x
    scale_y = dest_scale_y / src_scale_y

    src_view = getViewBox(src_tree)

    off_x = src_view[0] * -scale_x
    off_y = (src_view[1] + src_view[3]) * -scale_y + dest_height * scale_y

    # The matrix is the same for every element, so build it once.
    transform = "matrix(%f,0,0,%f, %f, %f)" % (
        scale_x, scale_y, off_x, off_y)

    # Iterate over a snapshot: appending an element to another parent can
    # re-parent it, and the source root must not change while it is walked.
    for el in list(src_tree.getroot()):
        dest_root.append(el)
        # Comments and processing instructions expose a callable as their
        # tag, which has no endswith(); they are moved but not transformed.
        if not callable(el.tag) and el.tag.endswith("g"):
            el.set("transform", transform)

    # write the xml file
    ElementTree.ElementTree(dest_root).write(
        output, pretty_print=True, encoding='utf-8', xml_declaration=True)
Пример #12
0
def load_booking_reviews():
    """
    Load the positive and negative review texts from ``booking.xml``.

    The file is looked up in ``DATA_DIR``. For every ``Positivereview`` and
    ``Negativereview`` element found two levels below the root, a ``Review``
    is created from its (unicode) text and the matching ``REVIEW_MARK``.
    Elements without text are skipped.

    :return: list of ``Review`` objects in document order
    """
    import xml.etree.cElementTree as ET

    BOOKING_XML_FILE = os.path.join(DATA_DIR, "booking.xml")
    utf8_parser = ET.XMLParser(encoding="utf-8")
    root = ET.parse(BOOKING_XML_FILE, parser=utf8_parser).getroot()

    reviews = []
    for item in root:
        for child in item:
            if child.tag not in ("Positivereview", "Negativereview"):
                continue

            review_text = child.text
            if review_text is None:
                continue
            if not isinstance(review_text, unicode):
                review_text = review_text.decode('utf-8')

            if child.tag == "Positivereview":
                mark = REVIEW_MARK.POSITIVE
            else:
                mark = REVIEW_MARK.NEGATIVE
            reviews.append(Review(review_text, mark))
    return reviews
Пример #13
0
    def _getXmlDocumentItem(self, fileName):
        filePath = '%s/%s.xml' % (self._dbFolder, fileName)
        xmlDoc = self.xmlCache.get(filePath)
        if xmlDoc:
            return xmlDoc
        doc = None

        # try different encoding and configurations
        encodingArray = ['utf-8', 'iso-8859-5']
        for docEncoding in encodingArray:
            try:
                doc = ET.parse(filePath,
                               parser=ET.XMLParser(encoding=docEncoding))
                if doc is not None:
                    print('parse %s success. encoding = %s' %
                          (fileName, docEncoding))
                    break
            except:
                print('parse %s failed. encoding = %s' %
                      (fileName, docEncoding))
                # traceback.print_exc()
                continue
        if doc is None:
            print('parse %s failed' % (fileName, ))
            return XmlDocItem(None)

        xmlDoc = XmlDocItem(doc)
        self.xmlCache[filePath] = xmlDoc
        return xmlDoc
Пример #14
0
def read_news_from_xml_file(path="files/newsafr.xml"):
    """Return a generator over the description text of every news item.

    :param path: file name or file object of the feed to read; defaults to
        the previously hard-coded ``files/newsafr.xml``.
    :return: generator yielding the ``text`` of each
        ``channel/item/description`` element, in document order.
    """
    # xml.etree.cElementTree was removed in Python 3.9; the plain module
    # uses the C accelerator automatically where it is available.
    import xml.etree.ElementTree as ET
    parser = ET.XMLParser(encoding='utf-8')
    root = ET.parse(path, parser).getroot()
    news_description_generator = (
        x.text for x in root.findall('channel/item/description'))
    return news_description_generator
Пример #15
0
def extract_single(xml_path, html_path, md_path):
    """Parse one XML file and run the recursive text extraction on it.

    Prints the XML path, the tag of the root's first child and every entry
    of the titles obtained from ``extract_titles(html_path)``, then calls
    ``recursively_extract_text`` with the root, the titles and ``md_path``.
    """
    print('File:  ', xml_path)
    utf8_parser = ET.XMLParser(encoding='utf-8')
    root = ET.parse(xml_path, parser=utf8_parser).getroot()
    print(root[0].tag)

    titles = extract_titles(html_path)
    for key, title in titles.items():
        print(key, ' ', title)

    recursively_extract_text(root, titles, md_path)
Пример #16
0
    def _xml_get(self, url, **params):
        response = self.session.get('%s%s' % (self.endpoint, url),
                                    headers={'Accept': 'application/xml'},
                                    params=params)
        response.raise_for_status()

        parser = etree.XMLParser(encoding='utf-8')
        parser.feed(response.text.encode('utf-8'))
        return parser.close()
Пример #17
0
def fromstringlist(sequence, parser=None):
    """
    Build an element from a sequence of XML text fragments.

    Taken from Python2.7 source. Each fragment is fed to ``parser`` in
    order; when no parser is supplied, a default ``XMLParser`` with a
    ``TreeBuilder`` target is used. Returns whatever the parser's
    ``close()`` returns.
    """
    active_parser = parser or ET.XMLParser(target=ET.TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()
Пример #18
0
def pretty_xmlfile(fname):
    """Re-serialise the XML file ``fname`` in place with pretty-printing.

    The file is parsed with lxml and written back as UTF-8 with an XML
    declaration.

    :param fname: path of the XML file to rewrite.
    """
    from lxml import etree as ET
    parser = ET.XMLParser(
        remove_blank_text=False, resolve_entities=True, strip_cdata=True)
    xmlfile = ET.parse(fname, parser)
    pretty_xml = ET.tostring(
        xmlfile, encoding='UTF-8', xml_declaration=True, pretty_print=True)
    # tostring() with an explicit encoding returns bytes, so the file must be
    # opened in binary mode; the context manager closes it even if the write
    # fails.
    with open(fname, "wb") as out:
        out.write(pretty_xml)
Пример #19
0
def parse_indexdirs(filein):
    """Parse ``filein`` and return a dict of Scan objects, indexed by pfx.

    One ``Scan`` is built from each child of the document root; a later
    entry with the same ``pfx`` replaces an earlier one.
    """
    utf8_parser = ET.XMLParser(encoding="utf-8")
    root = ET.parse(filein, parser=utf8_parser).getroot()
    scans = (Scan(entry) for entry in root)
    return {scan.pfx: scan for scan in scans}
Пример #20
0
def load_file(path="files/newsafr.xml"):
    """Return the description text of every news item as a list.

    :param path: file name or file object of the feed to read; defaults to
        the previously hard-coded ``files/newsafr.xml``.
    :return: list with the ``text`` of each ``channel/item/description``
        element, in document order.
    """
    # xml.etree.cElementTree was removed in Python 3.9; the plain module
    # uses the C accelerator automatically where it is available.
    import xml.etree.ElementTree as ET
    parser = ET.XMLParser(encoding='utf-8')
    root = ET.parse(path, parser).getroot()
    news_data = [
        x.text
        for x in root.findall('channel/item/description')
    ]
    return news_data
Пример #21
0
def parse_xml(path):
    """Parse ``path``, falling back to alternate encodings on parse errors.

    The first attempt passes no encoding to the parser; each entry of
    ``ALT_ENCODINGS`` is then tried in order. Returns the first tree that
    parses, or ``None`` when every attempt raised ``ET.ParseError``.
    """
    for candidate in [None] + ALT_ENCODINGS[:]:
        candidate_parser = ET.XMLParser(encoding=candidate)
        try:
            tree = ET.parse(path, parser=candidate_parser)
        except ET.ParseError:
            continue
        return tree
    return None
Пример #22
0
 def recvXML(self):
   """Read one XML object from the connection and return its handler.

   Data is received in chunks of up to 1024 bytes and fed to a parser
   whose target is a fresh ``self.XMLHandler()``, until the handler
   reports ``done_parsing``. If the connection delivers no more data
   before that, reading stops and closing the parser reports the
   incomplete document. A non-zero ``error`` on the handler is printed.

   Returns the handler instance.
   """
   target = self.XMLHandler()
   parser = ET.XMLParser(target=target)
   while not target.done_parsing:
     chunk = self.connection.recv(1024)
     if not chunk:
       # An empty read means no more data will arrive (for a socket: the
       # peer closed the connection). Stop polling instead of spinning
       # forever on empty reads.
       break
     parser.feed(chunk)
   parser.close()
   if target.error != 0:
     print("Got error number {:} from WaveLabs software: {:}".format(target.error, target.error_message))
   return target
Пример #23
0
def autosave_can_be_parsed():
    """ Return True if the autosave file can be
    parsed as XML, False otherwise. Any exception
    raised while parsing counts as a failure.
    """
    utf8_parser = ETree.XMLParser(encoding="utf-8")
    try:
        ETree.parse(AUTOSAVE_FILE, utf8_parser)
    except Exception:
        return False
    else:
        return True
Пример #24
0
def xml_to_text(file_path):
    """Return the lower-cased words longer than six characters from a feed.

    Words are taken from the text of every ``channel/item/description``
    element, split on whitespace, in document order.

    :param file_path: file name or file object of the XML feed.
    :return: list of lower-cased words with more than six characters.
    """
    parser = ET.XMLParser(encoding="utf-8")
    root = ET.parse(file_path, parser).getroot()

    all_words_list = []
    for description in root.findall("channel/item/description"):
        # An empty <description/> has text None; treat it as no words
        # instead of failing on None.lower().
        text = description.text or ""
        all_words_list.extend(
            word for word in text.lower().split() if len(word) > 6)
    return all_words_list
Пример #25
0
    def run(self, test_run_factory, **kwargs):
        """
        Parse ``self.file`` and process its ``<system>`` root element.

        :param test_run_factory: forwarded, together with ``kwargs``, to
        ``SubmissionResult``.

        :returns: the SubmissionResult instance that was handed to
        ``parseRoot``.

        :raises AssertionError: if the root element is not ``<system>``.
        """
        root = etree.parse(self.file, parser=etree.XMLParser()).getroot()

        if root.tag == "system":
            result = SubmissionResult(test_run_factory, **kwargs)
            self.parseRoot(result, root)
            return result

        raise AssertionError(
            "Unexpected tag <%s> at root, expected <system>" % root.tag)
Пример #26
0
    def encrypt_string_resources(self, string_resources_xml_file: str, string_names_to_encrypt: Set[str]):
        """Encrypt selected ``<string>`` values of a resource file in place.

        Every ``string`` element that has a non-empty ``name`` listed in
        ``string_names_to_encrypt`` and non-empty text gets its text replaced
        by ``self.encrypt_string(text)``. The file is then written back as
        UTF-8.
        """
        utf8_parser = Xml.XMLParser(encoding='utf-8')
        xml_tree = Xml.parse(string_resources_xml_file, parser=utf8_parser)

        for string_element in xml_tree.iter('string'):
            name = string_element.get('name', None)
            plain_text = string_element.text
            # Skip unnamed, empty or unselected strings.
            if not name or not plain_text or name not in string_names_to_encrypt:
                continue
            string_element.text = self.encrypt_string(plain_text)

        xml_tree.write(string_resources_xml_file, encoding='utf-8')
Пример #27
0
    def get_elements(self):
        """
        Get and parse the XML named by the ``uri`` rule.

        A ``uri`` starting with ``http`` is opened with ``urlopen``; anything
        else is handed to the parser as-is. The element name to search for is
        taken from ``self.rule['elements']``.

        :return:  list of all matching elements anywhere below the root
        """
        uri = self.rss_rules.get('uri')
        parser = etree.XMLParser(encoding='utf-8')

        if uri.startswith('http'):
            response = urlopen(uri)
            try:
                tree = etree.parse(response, parser)
            finally:
                # Release the connection even when parsing fails.
                response.close()
        else:
            tree = etree.parse(uri, parser)

        root = tree.getroot()
        element = ".//{0}".format(self.rule.get('elements'))
        return root.findall(element)
Пример #28
0
def importminidump(filename, cur):
    """Stream a bz2-compressed XML file through a ``MinidumpCB`` parser target.

    The file is decompressed and fed to the parser in 64 KiB chunks, so it is
    never held in memory as a whole.

    :param filename: path of the bz2-compressed XML file.
    :param cur: passed to ``MinidumpCB``.
    """
    printupdate("Importing minidump")

    # The context manager closes the file even if parsing raises.
    with bz2.BZ2File(filename) as fp:
        parser = ET.XMLParser(target=MinidumpCB(cur))

        # read() returns an empty bytes object at end of file.
        for data in iter(lambda: fp.read(65536), b''):
            parser.feed(data)

        parser.close()
Пример #29
0
    def encrypt_string_array_resources(self, string_array_resources_xml_file: str,
                                       string_array_names_to_encrypt: Set[str]):
        """Encrypt the items of selected ``<string-array>`` elements in place.

        For every ``string-array`` element whose non-empty ``name`` is listed
        in ``string_array_names_to_encrypt``, the text of each ``item`` below
        it that has non-empty text is replaced by
        ``self.encrypt_string(text)``. The file is then written back as UTF-8.
        """
        utf8_parser = Xml.XMLParser(encoding='utf-8')
        xml_tree = Xml.parse(string_array_resources_xml_file, parser=utf8_parser)

        for array_element in xml_tree.iter('string-array'):
            array_name = array_element.get('name', None)
            # Skip unnamed or unselected arrays.
            if not array_name or array_name not in string_array_names_to_encrypt:
                continue
            for item in array_element.iter('item'):
                if item.text:
                    item.text = self.encrypt_string(item.text)

        xml_tree.write(string_array_resources_xml_file, encoding='utf-8')
Пример #30
0
def parseheader(s):
    """Fill a Scan object's metadata from its header XML file.

    ``s`` is a Scan object. Its ``pfx`` and ``year`` attributes locate the
    header at ``../../<pfx>Scan/<year>/downloads/<pfx lower-cased>header.xml``
    (relative to the current directory). The following attributes are set:

    * ``s.title``     - text of ``titleStmt/title``
    * ``s.dsize``     - text of ``fileDesc/extent`` (size of digitization)
    * ``s.dyear``     - copy of ``s.year`` (year of digitization)
    * ``s.authors``   - comma-joined, non-blank ``monogr/author`` texts
    * ``s.textdate``  - text of ``monogr/imprint/date``
    * ``s.textpages`` - text of ``monogr/extent``

    An ``AttributeError`` results if one of the single-valued elements is
    missing from the header.
    """
    headerfile = "../../%sScan/%s/downloads/%sheader.xml" % (
        s.pfx, s.year, s.pfx.lower())

    # Namespaces make finding harder with ET, so the default namespace
    # declaration is removed from the text before parsing. A plain
    # replace() matches the literal only (no regex wildcards).
    with codecs.open(headerfile, "r", "utf-8") as f:
        xmlstring = f.read().replace('xmlns="//www.tei-c.org/ns/1.0"', '')

    # Parse from memory. Going through a shared "temp.xml" in the current
    # directory left a stray file behind and could be clobbered by another
    # run. Bytes are passed so the parser's explicit encoding applies.
    parser = ET.XMLParser(encoding="utf-8")
    root = ET.fromstring(xmlstring.encode("utf-8"), parser=parser)

    filedesc = "teiHeader/fileDesc"
    monogr = filedesc + "/sourceDesc/biblStruct/monogr"

    s.title = root.find(filedesc + "/titleStmt/title").text
    s.dsize = root.find(filedesc + "/extent").text  # size of digitization
    s.dyear = s.year  # year of digitization

    author_texts = [author.text for author in root.findall(monogr + "/author")
                    if author.text is not None]
    s.authors = ','.join(name for name in author_texts if name.strip() != '')

    s.textdate = root.find(monogr + "/imprint/date").text
    s.textpages = root.find(monogr + "/extent").text