def danmakuParser(xml: str) -> list:
    """
    Parse XML returned by the `danmaku` method.

    Each dict in the returned list contains the keys "msg", "time",
    "type", "fontsize", "color", "date", "timestamp", "pool",
    "userid" and "danmuid".

    :param xml: str, the raw danmaku XML document
    :return: list of dicts, one per <d> element
    """
    # fromstring parses the document in one call; fromstringlist on a
    # plain str would feed it to the parser one character at a time.
    tree = ET.fromstring(xml)
    danmakulist = []
    for tag in tree:
        if tag.tag == 'd':
            info = {}
            info['msg'] = tag.text
            # The 'p' attribute packs all metadata as a comma-separated list.
            params = tag.get('p').split(',')
            info['time'] = float(params[0])
            info['type'] = int(params[1])
            info['fontsize'] = int(params[2])
            info['color'] = hex(int(params[3]))
            info['date'] = Time.strftime('%Y-%m-%d %H:%M:%S',
                                         Time.localtime(int(params[4])))
            info['timestamp'] = int(params[4])
            info['pool'] = params[5]
            info['userid'] = params[6]
            info['danmuid'] = params[7]
            danmakulist.append(info)
    return danmakulist
def parse_summary(folder_path, num_files, allowed_keys):
    """Collect per-document summaries from 'perdocs' files under folder_path.

    Walks the directory tree looking for files named "perdocs", parses each
    as XML (wrapped in a synthetic <root> because the file has no single
    root element), and records the text of every <SUM> tag whose DOCREF
    attribute is in allowed_keys.

    :param folder_path: directory to walk recursively
    :param num_files: unused; kept for interface compatibility
    :param allowed_keys: set of DOCREF values to accept
    :return: dict mapping DOCREF -> summary text
    """
    raw_texts = {}
    summary_file = "perdocs"
    for dirpath, dirs, files in os.walk(folder_path):
        if summary_file in files:
            with open(os.path.join(dirpath, summary_file)) as f:
                # perdocs files hold a flat sequence of <SUM> elements, so
                # wrap them before parsing.
                it = itertools.chain('<root>', f, '</root>')
                parser = ET.XMLParser(encoding="us-ascii")
                doc_root = ET.fromstringlist(it, parser=parser)
            for text_tag in doc_root.findall("SUM"):
                if text_tag.get("DOCREF") in allowed_keys:
                    raw_texts[text_tag.get("DOCREF")] = text_tag.text
            # Stop early once every requested document has been found.
            if raw_texts.keys() == allowed_keys:
                break
    return raw_texts
def origin_metadata_get(self, project, package):
    """Return the (project, name) attributes from a package's meta.

    Falls back to (None, None) when no meta element is available.
    """
    meta_xml = osc.core.show_package_meta(self.apiurl, project, package)
    meta = ET.fromstringlist(meta_xml)
    if meta is None:
        return None, None
    return meta.get('project'), meta.get('name')
def analysis2list(xml_txt):
    """Parse issue XML text into a list of Issue objects.

    Every <item> element is scanned for the fields listed in the
    module-level `key_dic`; missing fields are simply skipped.

    :param xml_txt: XML document as a string
    :return: list of Issue, or None when parsing fails
    """
    # cElementTree was removed in Python 3.9; fall back to the pure-Python
    # module. Catch only ImportError, not everything.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET
    try:
        # fromstring parses the str in one call (fromstringlist would feed
        # it character by character).
        root = ET.fromstring(xml_txt)
        items = []
        for item in root.iter('item'):
            # Avoid shadowing the builtin `dict`.
            fields = {}
            for key in key_dic:
                iss_item = item.find(key)
                if iss_item is not None:
                    fields[key] = iss_item.text
            items.append(fields)
        return [Issue(issue_dic) for issue_dic in items]
    except Exception:
        # Preserve the original best-effort contract: report and return None.
        print('analysis xml failure.')
        return None
def xml_to_dict(str_xml):
    """Convert a single-level XML document to a dict.

    Only flat documents are supported, e.g.
    <xml><name>wukai</name><age>29</age></xml> -> {'name': 'wukai', 'age': '29'}

    :param str_xml: XML document as a string
    :return: dict mapping each child tag name to its text
    """
    # Avoid shadowing the builtin `dict`.
    result = {}
    # fromstring parses the whole string at once; fromstringlist on a str
    # feeds it char by char. Element.getchildren() was removed in Python
    # 3.9, so iterate the element directly.
    tree = ET.fromstring(str_xml)
    for ele in tree:
        result[ele.tag] = ele.text
    return result
def list_of_file(filename):
    """Read a gzipped XML corpus file into (headline, texts) pairs.

    The file has no single root element, so it is wrapped in a synthetic
    <root> before parsing. For each document element, the HEADLINE text and
    cleaned TEXT paragraphs are collected; when a document has no headline
    its "id" attribute is used as the key instead.
    """
    with gzip.open(filename, "r") as f:
        # custom_xml_parser is defined elsewhere in this module.
        parser = custom_xml_parser(encoding='utf-8')
        l = f.readlines()
        # gzip yields bytes; decode each line before feeding the parser.
        it = itertools.chain('<root>', [i.decode('utf-8') for i in l], '</root>')
        root = ET.fromstringlist(it, parser=parser)
    lists = []
    doc_id = ""
    for element in root:
        headline = ""
        list1 = []
        doc_id = element.attrib["id"]
        # print(element.attrib["id"])
        for e in element:
            if (e.tag == 'HEADLINE'):
                headline = e.text.strip()
            # NOTE(review): the filter tests the outer `e.tag`, not `i.tag`,
            # so it keeps/drops whole child lists per element — confirm this
            # was not meant to be `i.tag == 'TEXT'`.
            list2 = [clean_text(i.text) for i in e if e.tag == 'TEXT']
            if len(list2) > 0:
                # lists.append(doc_id, list2)
                # Prefer the headline as the key; fall back to the doc id.
                lists.append(
                    (headline, list2)) if len(headline) > 0 else lists.append(
                        (doc_id, list2))
        # if len(list1) > 0:
        #     lists.append((headline, list1)) if len(headline) > 0 else lists.append((doc_id, list1))
    return lists
def get_build_succeeded_packages(self, project):
    """Get the build succeeded packages from `from_prj` project. """
    f = osc.core.show_prj_results_meta(self.apiurl, project)
    root = ET.fromstringlist(f)
    #print ET.dump(root)
    failed_multibuild_pacs = []
    pacs = []
    for node in root.findall('result'):
        # Only the standard/x86_64 repository results are considered.
        if node.get('repository') == 'standard' and node.get('arch') == 'x86_64':
            for pacnode in node.findall('status'):
                if ':' in pacnode.get('package'):
                    # Multibuild flavor ("main:flavor"): one failed flavor
                    # disqualifies the whole main package.
                    mainpac = pacnode.get('package').split(':')[0]
                    if pacnode.get('code') not in ['succeeded', 'excluded']:
                        failed_multibuild_pacs.append(pacnode.get('package'))
                        if mainpac not in failed_multibuild_pacs:
                            failed_multibuild_pacs.append(mainpac)
                        if mainpac in pacs:
                            pacs.remove(mainpac)
                    else:
                        # Succeeded/excluded flavor: record it as failed if
                        # the main package already failed, otherwise count
                        # the main package once.
                        if mainpac in failed_multibuild_pacs:
                            failed_multibuild_pacs.append(pacnode.get('package'))
                        elif mainpac not in pacs:
                            pacs.append(mainpac)
                    continue
                # Plain (non-multibuild) package.
                if pacnode.get('code') == 'succeeded':
                    pacs.append(pacnode.get('package'))
        else:
            logging.error("Can not find standard/x86_64 results")
    return pacs
def sle_workarounds_unneeded_check(self, package):
    """Request deletion of the SLE-workarounds copy of `package` when stale.

    Applies when the package exists in the SLE-workarounds project but the
    current submission was not sourced from it. Removal is skipped when the
    package was committed within the last 24 hours or when a submit request
    already involves it.
    """
    # If SLE-workarounds project and package was not sourced from
    # SLE-workarounds, but it does exist in SLE-workarounds.
    if (self.sle_workarounds and not self.sle_workarounds_sourced
            and package in self.packages[self.sle_workarounds]):
        # Determine how recently the package was updated.
        root = ET.fromstringlist(
            get_commitlog(self.apiurl, self.sle_workarounds, package,
                          None, format='xml'))
        updated_last = date_parse(root.find('logentry/date').text)
        # NOTE(review): datetime.now() is naive local time — confirm the
        # commitlog timestamps use the same timezone.
        age = datetime.now() - updated_last
        if age.total_seconds() < 3600 * 24:
            logger.debug(
                'skip removal of {}/{} since updated within 24 hours'.
                format(self.sle_workarounds, package))
            return
        requests = get_request_list(self.apiurl, self.sle_workarounds,
                                    package, req_type='submit')
        if len(requests):
            logger.debug('existing submit request involving {}/{}'.format(
                self.sle_workarounds, package))
            return
        self.delete_request(
            self.sle_workarounds, package,
            'sourced from {}'.format(self.lookup.get(package)))
def project_meta_revision(apiurl, project):
    """Return the latest revision number of a project's _project meta."""
    log = get_commitlog(apiurl, project, '_project', None,
                        format='xml', meta=True)
    entry = ET.fromstringlist(log).find('logentry')
    return int(entry.get('revision'))
def _parsexml(cls, response):
    """Convert an XML response into an ElementTree element.

    :raises LoonError: when the response is not well-formed XML
    """
    try:
        element = ElementTree.fromstringlist(response)
    except ParseError as err:
        message = "Unable to parse response: {0}".format(err)
        raise LoonError(message)
    return element
def devel_project_get(apiurl, target_project, target_package):
    """Look up the devel project/package for a package.

    Returns the (project, package) attributes of the <devel> element of
    the package meta, or (None, None) when there is no devel element or
    the package does not exist (HTTP 404). Other HTTP errors propagate.
    """
    try:
        meta = ET.fromstringlist(
            show_package_meta(apiurl, target_project, target_package))
    except HTTPError as e:
        if e.code != 404:
            raise e
        return None, None
    devel = meta.find('devel')
    if devel is None:
        return None, None
    return devel.get('project'), devel.get('package')
def repository_path_expand(apiurl, project, repo):
    """Recursively list underlying projects."""
    result = [[project, repo]]
    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    paths = meta.findall('.//repository[@name="{}"]/path'.format(repo))
    if not paths:
        return result
    # All but the final <path> entry are taken verbatim; the final one is
    # expanded recursively.
    for entry in paths[:-1]:
        result.append([entry.get('project', project), entry.get('repository')])
    last = paths[-1]
    result += repository_path_expand(apiurl, last.get('project', project),
                                     last.get('repository'))
    return result
def _repository_path_expand(apiurl, project, repo):
    """Recursively list underlying projects."""
    repos = OrderedDict()
    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    for path in meta.findall('.//repository[@name="{}"]/path'.format(repo)):
        rp = repository_path_expand(apiurl, path.get('project', project),
                                    path.get('repository'))
        # NOTE(review): this inner loop rebinds `project` and `repo`, so any
        # later <path> element without a "project" attribute falls back to
        # the last expanded project instead of the function argument —
        # confirm this is intended.
        for project, repo in rp:
            # only the last repo for a project is remembered by OBS
            if project in repos:
                del repos[project]
            repos[project] = repo
    return repos
def get_tree(self, path):
    """Parse the HTML file at `path` into an element tree, with caching.

    Raises FailedCheck when the file is missing and RuntimeError when it
    cannot be parsed.
    """
    path = self.resolve_path(path)
    cached = self.trees.get(path)
    if cached is not None:
        return cached
    abspath = os.path.join(self.root, path)
    # isfile() is False for missing paths, so it covers the existence check.
    if not os.path.isfile(abspath):
        raise FailedCheck('File does not exist {!r}'.format(path))
    with io.open(abspath, encoding='utf-8') as f:
        lines = f.readlines()
    try:
        tree = ET.fromstringlist(lines, CustomHTMLParser())
    except Exception as e:
        raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
    self.trees[path] = tree
    return tree
def readdata(path):
    """Read all SGML/XML documents under `path` into (ID, text) pairs.

    Each file is wrapped in a synthetic <root> element (files may contain
    multiple documents), and stray backticks/quotes/ampersands are stripped
    so unescaped entities do not break the parser. Documents whose TEXT
    serialization has two or fewer tokens are skipped.

    :param path: directory containing the document files
    :return: (Alldata, N) where Alldata is a list of
             (DOCNO text, serialized TEXT bytes) and N is the count kept
    """
    Alldata = []
    N = 0
    for file in listdir(path):
        with open(path + '/' + file) as inputfile:
            File = inputfile.read()
        File = '<root>' + File + '</root>'
        # Remove characters that commonly break the parser in this corpus.
        File = re.sub("[`&\"\']+", '', File)
        # fromstring parses the str in one call (fromstringlist would feed
        # it char by char).
        root = ET.fromstring(File)
        # Element.getchildren() was removed in Python 3.9; iterate directly.
        for Doc in root:
            ID = Doc.find('DOCNO').text
            data = ET.tostring(Doc.find('TEXT'))
            # Skip near-empty documents.
            if len(data.split()) <= 2:
                continue
            Alldata.append((ID, data))
            N = N + 1
    return Alldata, N
def __init__(self, xml_file=None, xml_str=None):
    """Initialize from an XML string (preferred) or an XML file.

    :param xml_file: path to an XML file
    :param xml_str: XML content, either a single-line str or a list of
                    string fragments
    """
    if xml_str is not None:
        # Handle the list case first: re.match would raise TypeError on a
        # list, which previously made the fromstringlist branch unreachable.
        if type(xml_str) is list:
            super(XMLEtAnalyzer, self).__init__(element=fromstringlist(xml_str))
        else:
            # Prepend an XML declaration when the string lacks one.
            if re.match('^<\?xml', xml_str) is None:
                xml_str = ''.join(
                    ('<?xml version="1.0" encoding="utf-8" ?>', xml_str))
            super(XMLEtAnalyzer, self).__init__(element=fromstring(xml_str))
    else:
        super(XMLEtAnalyzer, self).__init__(file=xml_file)
def repository_path_expand(apiurl, project, repo, repos=None):
    """Recursively list underlying projects."""
    if repos is None:
        # Create a fresh list per top-level call; a mutable default would
        # be shared across calls and grow even for a new project.
        repos = []
    pair = [project, repo]
    if pair in repos:
        # Some devel projects (e.g. graphics) list the same path twice for
        # openSUSE:Factory/snapshot; skipping duplicates keeps the result
        # clean and avoids redundant expansion.
        return repos
    repos.append(pair)
    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    query = './/repository[@name="{}"]/path'.format(repo)
    for path in meta.findall(query):
        repository_path_expand(apiurl, path.get('project', project),
                               path.get('repository'), repos)
    return repos
def load_data(input_file):
    """Load spelling-correction records from a SIGHAN-style XML file.

    Accumulates the lines of each <SENTENCE> block, parses the block when
    its closing tag arrives, and emits {"text", "mistakes"} dicts for
    sentences that have at least one real (wrong != correct) mistake.
    """
    contents = []
    passages = []
    reader = open(input_file)
    text = ''
    line = reader.readline()
    while line:
        line = line.strip()
        if line.startswith("</SENTENCE>"):
            passages.append(line)
            # The accumulated lines form one complete <SENTENCE> element.
            sentence = ET.fromstringlist(passages)
            if not text:
                # NOTE(review): this `continue` skips the readline at the
                # bottom of the loop, so `line` is unchanged and the branch
                # re-enters with an empty `passages` — likely an infinite
                # loop / parse error path; confirm intent.
                passages.clear()
                continue
            # text = sentence.findtext("TEXT")
            content = {"text": text, "mistakes": []}
            for mistake in sentence.iter("MISTAKE"):
                wrong = mistake.findtext("WRONG")
                correct = mistake.findtext("CORRECTION")
                # A "mistake" whose correction equals the original is noise.
                if wrong == correct:
                    continue
                reform = {
                    "wrong": wrong,
                    "correct": correct,
                    "loc": mistake.findtext("LOCATION")
                }
                content["mistakes"].append(reform)
            if len(content["mistakes"]) > 0:
                contents.append(content)
            passages = []
            text = ''
        elif line.startswith("<TEXT>"):
            # Strip the surrounding <TEXT>...</TEXT> tags.
            text = line[len('<TEXT>'):-len('</TEXT>')]
        elif line:
            passages.append(line)
        line = reader.readline()
    reader.close()
    num = len(contents)
    print(f'{input_file} has loaded, total {num} records')
    return contents
def sle_workarounds_unneeded_check(self, package):
    """Request deletion of the SLE-workarounds copy of `package` when stale.

    Applies when the package exists in the SLE-workarounds project but the
    current submission was not sourced from it. Removal is skipped when the
    package was committed within the last 24 hours or when a submit request
    already involves it.
    """
    # If SLE-workarounds project and package was not sourced from
    # SLE-workarounds, but it does exist in SLE-workarounds.
    if (self.sle_workarounds and not self.sle_workarounds_sourced
            and package in self.packages[self.sle_workarounds]):
        # Determine how recently the package was updated.
        root = ET.fromstringlist(
            get_commitlog(self.apiurl, self.sle_workarounds, package,
                          None, format='xml'))
        updated_last = date_parse(root.find('logentry/date').text)
        # NOTE(review): datetime.now() is naive local time — confirm the
        # commitlog timestamps use the same timezone.
        age = datetime.now() - updated_last
        if age.total_seconds() < 3600 * 24:
            logger.debug('skip removal of {}/{} since updated within 24 hours'.format(
                self.sle_workarounds, package))
            return
        requests = get_request_list(self.apiurl, self.sle_workarounds,
                                    package, req_type='submit')
        if len(requests):
            logger.debug('existing submit request involving {}/{}'.format(self.sle_workarounds, package))
            return
        self.delete_request(self.sle_workarounds, package,
                            'sourced from {}'.format(self.lookup.get(package)))
def get_build_succeeded_packages(self, project):
    """Get the build succeeded packages from `from_prj` project. """
    f = osc.core.show_prj_results_meta(self.apiurl, project)
    root = ET.fromstringlist(f)
    # print ET.dump(root)
    failed_multibuild_pacs = []
    pacs = []
    for node in root.findall('result'):
        # Only the standard/x86_64 repository results are considered.
        if node.get('repository') == 'standard' and node.get(
                'arch') == 'x86_64':
            for pacnode in node.findall('status'):
                if ':' in pacnode.get('package'):
                    # Multibuild flavor ("main:flavor"): one failed flavor
                    # disqualifies the whole main package.
                    mainpac = pacnode.get('package').split(':')[0]
                    if pacnode.get('code') not in [
                            'succeeded', 'excluded'
                    ]:
                        failed_multibuild_pacs.append(
                            pacnode.get('package'))
                        if mainpac not in failed_multibuild_pacs:
                            failed_multibuild_pacs.append(mainpac)
                        if mainpac in pacs:
                            pacs.remove(mainpac)
                    else:
                        # Succeeded/excluded flavor: record it as failed if
                        # the main package already failed, otherwise count
                        # the main package once.
                        if mainpac in failed_multibuild_pacs:
                            failed_multibuild_pacs.append(
                                pacnode.get('package'))
                        elif mainpac not in pacs:
                            pacs.append(mainpac)
                    continue
                # Plain (non-multibuild) package.
                if pacnode.get('code') == 'succeeded':
                    pacs.append(pacnode.get('package'))
        else:
            logging.error("Can not find standard/x86_64 results")
    return pacs
'no-cache', 'cookie': cookie, } r = requests.get(xml_url, headers=headers, params=params) xml_txt = r.text # print(xml_txt) try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET root = ET.fromstringlist(xml_txt) res = [] # Catch all issue about me for item in root.iter('item'): dict = {} for key in key_dic: iss_item = item.find(key) if iss_item is None: continue else: dict[key] = iss_item.text res.append(dict) # print(res)
def read_file(path, parse_headline=True, parse_dateline=True, parse_coreferences=True, parse_sentences=True, parse_text=True):
    """Yield Document objects from a gzipped annotated-corpus file.

    Documents are delimited by </DOC>; each one is wrapped in <xml> tags
    and parsed individually.
    """
    with gzip.open(path) as source:
        # NOTE(review): gzip.open defaults to binary mode, so `line` is
        # bytes and never equals the str '</DOC>' on Python 3 — confirm
        # this runs on Python 2 or should use mode 'rt'.
        source.readline()
        # file_line = source.readline() + "</FILE>"
        # file_tag = etree.fromstring(file_line)
        # file_id = file_tag.attrib['id']
        lines = []
        for line in source:
            lines.append(line)
            if line.strip() == '</DOC>':
                lines = ['<xml>'] + lines
                lines.append('</xml>')
                xml = etree.fromstringlist(lines).find('DOC')
                doc_id = xml.attrib['id']
                # The document date is encoded in the id suffix.
                date_str = doc_id.split('_')[-1].split('.')[0]
                date = parse_ymd(date_str)
                headline_xml = xml.find('HEADLINE')
                if headline_xml is not None and parse_headline:
                    headline = parse_lisp(headline_xml.text.strip())
                else:
                    headline = None
                dateline_xml = xml.find('DATELINE')
                if dateline_xml is not None and parse_dateline:
                    dateline = parse_lisp(dateline_xml.text.strip())
                else:
                    dateline = None
                coreferences = xml.find('coreferences')
                if coreferences is not None and parse_coreferences:
                    coreferences = [[parse_mention(m) for m in x]
                                    for x in coreferences]
                else:
                    coreferences = []
                sentences = xml.find('sentences')
                if sentences is not None and parse_sentences:
                    sentences = [parse_sentence(x) for x in xml.find('sentences')]
                else:
                    sentences = []
                text = xml.find('TEXT')
                if text is not None and parse_text:
                    # NOTE(review): the parameter `parse_text` shadows the
                    # module-level helper of the same name, so this calls the
                    # boolean True(...) and raises TypeError — the parameter
                    # likely needs renaming (cf. the p_text variant).
                    text = parse_text(text)
                else:
                    text = None
                yield Document(
                    id=xml.attrib['id'],
                    date=date,
                    type=xml.attrib['type'],
                    headline=headline,
                    dateline=dateline,
                    text=text,
                    sentences=sentences,
                    coreferences=coreferences)
                lines = []
def read_file(path, parse_headline=True, parse_dateline=True, parse_coreferences=True, parse_sentences=True, parse_text=True, simple_token=True):
    """Yield Document objects from a gzipped annotated-corpus file.

    Documents are delimited by </DOC>; each one is wrapped in <xml> tags
    and parsed individually. The file is opened in text mode ('rt') so the
    str comparisons below are valid.
    """
    with gzip.open(path, 'rt') as source:
        source.readline()
        # file_line = source.readline() + "</FILE>"
        # file_tag = etree.fromstring(file_line)
        # file_id = file_tag.attrib['id']
        lines = []
        for line in source:
            lines.append(line)
            if line.strip() == '</DOC>':
                lines = ['<xml>'] + lines
                lines.append('</xml>')
                xml = etree.fromstringlist(lines).find('DOC')
                doc_id = xml.attrib['id']
                # The document date is encoded in the id suffix.
                date_str = doc_id.split('_')[-1].split('.')[0]
                date = _parse_ymd(date_str)
                headline_xml = xml.find('HEADLINE')
                # NOTE(review): truthiness of an Element is False when it
                # has no children, so a HEADLINE/DATELINE that carries only
                # text is silently skipped here — `is not None` is likely
                # intended (cf. the sibling read_file variant).
                if headline_xml and parse_headline:
                    headline = _parse_lisp(headline_xml.text.strip())
                else:
                    headline = None
                dateline_xml = xml.find('DATELINE')
                if dateline_xml and parse_dateline:
                    dateline = _parse_lisp(dateline_xml.text.strip())
                else:
                    dateline = None
                coreferences = xml.find('coreferences')
                if coreferences and parse_coreferences:
                    coreferences = [[_parse_mention(m) for m in x]
                                    for x in coreferences]
                else:
                    coreferences = []
                sentences = xml.find('sentences')
                if sentences and parse_sentences:
                    sentences = [_parse_sentence(x, simple_token)
                                 for x in xml.find('sentences')]
                else:
                    sentences = []
                text = xml.find('TEXT')
                if text and parse_text:
                    text = _parse_text(text)
                else:
                    text = None
                yield Document(
                    id=xml.attrib['id'],
                    date=date,
                    type=xml.attrib['type'],
                    headline=headline,
                    dateline=dateline,
                    text=text,
                    sentences=sentences,
                    coreferences=coreferences)
                lines = []
def project_locked(apiurl, project):
    """Return True when the project's meta contains <lock><enable/>."""
    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    lock = meta.find('lock/enable')
    return lock is not None
def project_meta_revision(apiurl, project):
    """Return the current revision of the project's _project meta commit log."""
    commitlog = get_commitlog(apiurl, project, '_project', None,
                              format='xml', meta=True)
    root = ET.fromstringlist(commitlog)
    revision = root.find('logentry').get('revision')
    return int(revision)
def fetch(self, xml=None, sequence=None, **kwargs):
    """Get Blast record from url or file.

    :arg sequence: an object with an associated sequence string
        or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg xml: blast search results in XML format or an XML file that
        contains the results or a filename for saving the results or None
    :type xml: str

    :arg timeout: amount of time until the query times out in seconds
        default value is 120
    :type timeout: int
    """
    # Short-circuit: a successful record is never re-fetched.
    if self.isSuccess:
        LOGGER.warn("The record already exists so not further search is performed")
        return True
    if sequence is None:
        sequence = self._sequence
    if xml is None:
        xml = self._xml
    # NOTE(review): xml.etree.cElementTree was removed in Python 3.9 —
    # confirm this still runs, or switch to xml.etree.ElementTree.
    import xml.etree.cElementTree as ET
    have_xml = False
    filename = None
    if xml is not None:
        if len(xml) < 100:
            # xml likely contains a filename
            if os.path.isfile(xml):
                # read the contents
                try:
                    xml = ET.parse(xml)
                    root = xml.getroot()
                    have_xml = True
                except:
                    # NOTE(review): bare except hides the real parse error.
                    raise ValueError('could not parse xml from xml file')
            else:
                # xml contains a filename for writing
                filename = xml
        else:
            # Long strings/lists are treated as raw XML payloads.
            try:
                if isinstance(xml, list):
                    root = ET.fromstringlist(xml)
                elif isinstance(xml, str):
                    root = ET.fromstring(xml)
            except:
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
            else:
                have_xml = True
    if have_xml is False:
        # we still need to run a blast
        headers = {'User-agent': 'ProDy'}
        query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
                 ('PROGRAM', 'blastp'), ]
        expect = float(kwargs.pop('expect', 10e-10))
        if expect <= 0:
            raise ValueError('expect must be a positive number')
        query.append(('EXPECT', expect))
        hitlist_size = int(kwargs.pop('hitlist_size', 250))
        if hitlist_size <= 0:
            # NOTE(review): message says "expect" but validates hitlist_size.
            raise ValueError('expect must be a positive integer')
        query.append(('HITLIST_SIZE', hitlist_size))
        query.append(('QUERY', sequence))
        query.append(('CMD', 'Put'))
        sleep = float(kwargs.pop('sleep', 2))
        timeout = float(kwargs.pop('timeout', self._timeout))
        self._timeout = timeout
        # Python 3 needs the form data as bytes; Python 2 uses the legacy
        # urllib.urlencode.
        try:
            import urllib.parse
            urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
        except ImportError:
            from urllib import urlencode
        url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'
        data = urlencode(query)
        LOGGER.timeit('_prody_blast')
        LOGGER.info(
            'Blast searching NCBI PDB database for "{0}..."'.format(
                sequence[:5]))
        handle = openURL(url, data=data, headers=headers)
        html = handle.read()
        # NCBI replies with an HTML page containing the request id (RID).
        index = html.find(b'RID =')
        if index == -1:
            raise Exception('NCBI did not return expected response.')
        else:
            last = html.find(b'\n', index)
            rid = html[index + len('RID ='):last].strip()
        query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
                 ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
        data = urlencode(query)
        # Poll NCBI with exponential backoff until the search is READY or
        # the configured timeout elapses.
        while True:
            LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.')
            LOGGER.write('Connecting to NCBI for search results...')
            handle = openURL(url, data=data, headers=headers)
            results = handle.read()
            index = results.find(b'Status=')
            LOGGER.clear()
            if index < 0:
                break
            last = results.index(b'\n', index)
            status = results[index + len('Status='):last].strip()
            if status.upper() == b'READY':
                break
            sleep = int(sleep * 1.5)
            if LOGGER.timing('_prody_blast') > timeout:
                LOGGER.warn('Blast search time out.')
                return False
        LOGGER.clear()
        LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')
        root = ET.XML(results)
        # When a writable filename was supplied, persist the raw XML.
        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            # filename is None: nothing to save.
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            out = open(filename, 'w')
            if PY3K:
                out.write(results.decode())
            else:
                out.write(results)
            out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
    # From here on, `root` holds the parsed BlastOutput regardless of how
    # it was obtained (file, string, or live search).
    root = dictElement(root, 'BlastOutput_')
    if root['db'] != 'pdb':
        raise ValueError('blast search database in xml must be "pdb"')
    if root['program'] != 'blastp':
        raise ValueError('blast search program in xml must be "blastp"')
    self._param = dictElement(root['param'][0], 'Parameters_')
    query_len = int(root['query-len'])
    if sequence and len(sequence) != query_len:
        raise ValueError('query-len and the length of the sequence do not '
                         'match, xml data may not be for given sequence')
    hits = []
    for iteration in root['iterations']:
        for hit in dictElement(iteration, 'Iteration_')['hits']:
            hit = dictElement(hit, 'Hit_')
            data = dictElement(hit['hsps'][0], 'Hsp_')
            # Normalize numeric fields to int/float for downstream math.
            for key in ['align-len', 'gaps', 'hit-frame', 'hit-from',
                        'hit-to', 'identity', 'positive', 'query-frame',
                        'query-from', 'query-to']:
                data[key] = int(data[key])
            data['query-len'] = query_len
            for key in ['evalue', 'bit-score', 'score']:
                data[key] = float(data[key])
            p_identity = 100.0 * data['identity'] / (
                data['query-to'] - data['query-from'] + 1)
            data['percent_identity'] = p_identity
            p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                         query_len)
            data['percent_coverage'] = p_overlap
            # One hit may describe several PDB chains; emit one record per
            # ">gi"-separated entry.
            for item in (hit['id'] + hit['def']).split('>gi'):
                head, title = item.split(None, 1)
                head = head.split('|')
                pdb_id = head[-2].lower()
                chain_id = head[-1][:1]
                pdbch = dict(data)
                pdbch['pdb_id'] = pdb_id
                pdbch['chain_id'] = chain_id
                pdbch['title'] = (head[-1][1:] + title).strip()
                hits.append((p_identity, p_overlap, pdbch))
    # Best identity first.
    hits.sort(key=lambda hit: hit[0], reverse=True)
    self._hits = hits
    return True
def read_file(path, p_headline=True, p_dateline=True, p_coreferences=True, p_sentences=True, p_text=True):
    """Yield Document objects from a gzipped annotated-corpus file.

    Documents are delimited by </DOC>; each one is wrapped in <xml> tags
    and parsed individually.
    """
    # NOTE(review): both substitutions below replace '&' with '&' — as
    # written they are no-ops (possibly mangled from '&amp;' handling);
    # confirm the intended entity fix-up.
    amp = re.compile(r'&', re.IGNORECASE)
    bamp = re.compile(r'&')
    with gzip.open(path) as source:
        # NOTE(review): gzip.open defaults to binary mode, so each `line`
        # is bytes while the regex patterns are str — on Python 3 the sub()
        # calls raise TypeError; confirm Python 2 or use mode 'rt'.
        # source.readline()
        # file_line = source.readline() + "</FILE>"
        # file_tag = etree.fromstring(file_line)
        # file_id = file_tag.attrib['id']
        lines = []
        for line in source:
            # fix ampersand escape
            lines.append(bamp.sub('&', amp.sub('&', line)))
            # lines.append(line)
            if line.strip() == '</DOC>':
                lines = ['<xml>'] + lines
                lines.append('</xml>')
                # print 80 * '='
                # for ln in lines:
                #     print ln
                # print 80 * '='
                xml = etree.fromstringlist(lines).find('DOC')
                doc_id = xml.attrib['id']
                # The document date is encoded in the id suffix.
                date_str = doc_id.split('_')[-1].split('.')[0]
                date = parse_ymd(date_str)
                headline_xml = xml.find('HEADLINE')
                if headline_xml is not None and p_headline:
                    headline = headline_xml.text.strip()
                else:
                    headline = None
                dateline_xml = xml.find('DATELINE')
                if dateline_xml is not None and p_dateline:
                    dateline = dateline_xml.text.strip()
                else:
                    dateline = None
                coreferences = xml.find('coreferences')
                if coreferences is not None and p_coreferences:
                    coreferences = [[parse_mention(m) for m in x]
                                    for x in coreferences]
                else:
                    coreferences = []
                sentences = xml.find('sentences')
                if sentences is not None and p_sentences:
                    sentences = [parse_sentence(x) for x in xml.find('sentences')]
                else:
                    sentences = []
                text = xml.find('TEXT')
                if text is not None and p_text:
                    text = parse_text(text)
                else:
                    text = None
                yield Document(
                    id=xml.attrib['id'],
                    date=date,
                    type=xml.attrib['type'],
                    headline=headline,
                    dateline=dateline,
                    text=text,
                    sentences=sentences,
                    coreferences=coreferences)
                lines = []
# xml.etree.cElementTree was removed in Python 3.9; the plain ElementTree
# module is the supported implementation (and equally fast since 3.3).
import xml.etree.ElementTree as ET

# Sample component-layout document used to demonstrate parsing.
data = '''
<ComponentRoot name="ComponentLayout">
    <Component name="RSBBStartEndCaseButton" style="StartEndCaseButtonStyle">
        <Size name = "Default">
            <Item name = "Group">
                <Width>85.5</Width>
                <Height>50</Height>
                <Radius>2</Radius>
            </Item>
            <Item name="ButtonText" type = "text" translationProp = "Default">
                <width>85.5</width>
                <Height>50</Height>
                <LeftMargin>0</LeftMargin>
                <TopMargin>0</TopMargin>
                <NumberLines>1</NumberLines>
                <FontSize>10</FontSize>
                <FontType>Normal</FontType>
            </Item>
        </Size>
    </Component>
</ComponentRoot>
'''

# fromstring parses the whole document in one call; fromstringlist on a
# plain str would feed the parser one character at a time.
myroot = ET.fromstring(data)
print(myroot.tag)