def _parseXSEED(self, data):
    """
    Parse an XML-SEED string.

    :type data: file-like object (file pointer or StringIO object).
    """
    data.seek(0)
    root = xmlparse(data).getroot()
    xseed_version = root.get('version')
    headers = root.getchildren()
    # Set all temporary attributes.
    self.temp = {'volume': [], 'abbreviations': [], 'stations': []}
    # Parse the volume, which is assumed to be the first header. Only
    # parse blockette 10 and discard the rest.
    self.temp['volume'].append(
        self._parseXMLBlockette(headers[0].getchildren()[0], 'V',
                                xseed_version))
    # Append all abbreviations.
    for blkt in headers[1].getchildren():
        self.temp['abbreviations'].append(
            self._parseXMLBlockette(blkt, 'A', xseed_version))
    # Append all stations.
    for control_header in headers[2:]:
        if control_header.tag != 'station_control_header':
            continue
        self.temp['stations'].append([])
        for blkt in control_header.getchildren():
            self.temp['stations'][-1].append(
                self._parseXMLBlockette(blkt, 'S', xseed_version))
    # Update internal values.
    self._updateInternalSEEDStructure()
def iter_parse(path):
    """Yield one dict per SenticNet concept found in the RDF/XML file."""
    root = xmlparse(path).getroot()
    nsmap = {
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'api': 'http://sentic.net/api/',
    }
    for concept in root.iterfind('.//rdf:Description', nsmap):
        text = concept.find('./api:text', nsmap).text
        polarity = concept.find('./api:polarity', nsmap).text
        yield {'text': _normalize_term(text), 'polarity': float(polarity)}
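# A minimal usage sketch for iter_parse(), assuming a local SenticNet
# RDF/XML dump; the file name 'senticnet.rdf.xml' and the tab-separated
# output are hypothetical. iter_parse() is a generator, so concepts are
# streamed rather than collected into a list.
if __name__ == '__main__':
    for concept in iter_parse('senticnet.rdf.xml'):
        print('%s\t%.3f' % (concept['text'], concept['polarity']))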
def readFile(self):
    """
    Reads a file and writes everything it finds to self.channel_lists.
    """
    # Parse the file. Return if it could not be read.
    try:
        xml = xmlparse(self.file)
    except Exception:
        return
    # Add some error handling.
    try:
        root = xml.getroot()
    except Exception:
        return
    # This is the last check. Otherwise the file is just assumed to be
    # correct XML.
    if root.tag != 'channel_lists':
        return
    # Get the lists.
    lists = root.getchildren()
    # If no lists are there, return.
    if len(lists) == 0:
        return
    # Loop over each channel list.
    for channel in lists:
        channels = channel.getchildren()
        # If there are no channels in the list, skip it.
        if len(channels) == 0:
            continue
        list_name = channel.attrib['name']
        channel_list = []
        for item in channels:
            channel_list.append(item.text)
        # Now that all information is gathered, write it to the dictionary.
        _i = 1
        # Limit the while loop to 100 iterations for safety reasons.
        while _i < 100:
            if _i == 1:
                cur_name = list_name
            else:
                cur_name = '%s_%i' % (list_name, _i)
            # If the name is already in the dictionary, increment the
            # number and try again.
            if cur_name in self.channel_lists:
                _i += 1
                continue
            self.channel_lists[cur_name] = channel_list
            break
# Imports assumed by this snippet (not shown in the original excerpt);
# lxml's parse() accepts a URL directly.
import scraperwiki
from lxml.etree import parse as xmlparse


def get_mapa():
    base_url = "http://www.rotadareciclagem.com.br/site.html?method=carregaEntidades&"
    options = "latMax=27.293068543847625&lngMax=84.66230031250007&latMin=-50.46827383595759&lngMin=-161.851566875&zoomAtual=14"
    # options = "latMax=-15.76194355063211&lngMax=-47.86326041431886&latMin=-15.80281903113728&lngMin=-47.98362851353151&zoomAtual=14"
    a = xmlparse(base_url + options).getroot()
    markers = a.findall('marker')
    for m in markers:
        data = {}
        data['lat'] = m.get('lat')
        data['lng'] = m.get('lng')
        data['id'] = m.get('id')
        data['type'] = m.get('prefixo')
        data['nome'] = m.text
        scraperwiki.sqlite.save(['id'], data)
def normalize(iterable):
    log.info('Starting MOTI data normalization')
    file_stream = iterable_to_stream(iterable)
    et = xmlparse(file_stream)
    et = transform(et)
    obs_series = et.xpath("//observation-series")
    for series in obs_series:
        try:
            stn_id = series.xpath(
                "./origin/id[@type='client']")[0].text.strip()
        except IndexError as e:
            log.error("Could not detect the station id: xpath search "
                      "'//observation-series/origin/id[@type='client']' "
                      "returned no results", extra={'exception': e})
            continue
        members = series.xpath('./observation', namespaces=ns)
        for member in members:
            # Get the time and convert it to a datetime.
            time = member.get('valid-time')
            if not time:
                log.warning("Could not find a valid-time attribute for "
                            "this observation")
                continue
            tz = pytz.timezone('Canada/Pacific')
            try:
                date = dateparse(time).replace(tzinfo=tz)
            except ValueError as e:
                log.warning('Unable to convert value to datetime',
                            extra={'time': time})
                continue
            for obs in member.iterchildren():
                variable_name = obs.get('type')
                if variable_name is None:
                    continue
                try:
                    value_element = obs.xpath('./value')[0]
                except IndexError as e:
                    log.warning("Could not find the actual value for "
                                "observation. xpath search './value' "
                                "returned no results",
                                extra={'variable_name': variable_name})
                    continue
                try:
                    value = float(value_element.text)
                except ValueError:
                    log.error("Could not convert value to a number. "
                              "Skipping this observation.",
                              extra={'value': value_element})
                    continue
                yield Row(time=date, val=value,
                          variable_name=variable_name,
                          unit=value_element.get('units'),
                          network_name='MoTIe',
                          station_id=stn_id,
                          lat=None, lon=None)
# Standard module
import pytz
import logging

# Installed libraries
from pkg_resources import resource_filename
from lxml.etree import XSLT, parse as xmlparse
from dateutil.parser import parse as dateparse

# Local
from crmprtd import Row, iterable_to_stream

xsl = resource_filename('crmprtd', 'data/moti.xsl')
transform = XSLT(xmlparse(xsl))

ns = {
    'xsi': "http://www.w3.org/2001/XMLSchema-instance"
}

log = logging.getLogger(__name__)


def normalize(iterable):
    log.info('Starting MOTI data normalization')
    file_stream = iterable_to_stream(iterable)
    et = xmlparse(file_stream)
    et = transform(et)
    obs_series = et.xpath("//observation-series")
    for series in obs_series:
        try:
def prepare(self):
    self.doc = xmlparse(self.path)
    self.select()
    super(XMLFileTransaction, self).prepare()
def normalize(file_stream):
    log.info("Starting MOTI data normalization")
    et = xmlparse(file_stream)
    et = transform(et)
    obs_series = et.xpath("//observation-series")
    for series in obs_series:
        if not len(series):
            log.warning("Empty observation series: xpath search "
                        "'//observation-series' returned no results")
            continue
        try:
            stn_id = series.xpath(
                "./origin/id[@type='client']")[0].text.strip()
        except IndexError as e:
            log.error(
                "Could not detect the station id: xpath search "
                "'//observation-series/origin/id[@type='client']' "
                "returned no results",
                extra={"exception": e},
            )
            continue
        members = series.xpath("./observation", namespaces=ns)
        for member in members:
            # Get the time and convert it to a datetime.
            time = member.get("valid-time")
            if not time:
                log.warning("Could not find a valid-time attribute for "
                            "this observation")
                continue
            try:
                # MoTI gives us an ISO formatted time string with
                # timezone info attached, so it should be sufficient to
                # simply parse it and display it as UTC.
                date = dateparse(time).astimezone(pytz.utc)
            except ValueError as e:
                log.warning("Unable to convert value to datetime",
                            extra={"time": time})
                continue
            for obs in member.iterchildren():
                variable_name = obs.get("type")
                if variable_name is None:
                    continue
                try:
                    value_element = obs.xpath("./value")[0]
                except IndexError as e:
                    log.warning(
                        "Could not find the actual value for "
                        "observation. xpath search './value' "
                        "returned no results",
                        extra={"variable_name": variable_name},
                    )
                    continue
                try:
                    value = float(value_element.text)
                except ValueError:
                    log.error(
                        "Could not convert value to a number. "
                        "Skipping this observation.",
                        extra={"value": value_element},
                    )
                    continue
                yield Row(
                    time=date,
                    val=value,
                    variable_name=variable_name,
                    unit=value_element.get("units"),
                    network_name="MoTIe",
                    station_id=stn_id,
                    lat=None,
                    lon=None,
                )
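# A minimal usage sketch for normalize() above, assuming the MoTI XML
# response has been saved to disk; the file name "moti_response.xml" is
# hypothetical, and crmprtd.Row is assumed to expose its fields as
# attributes (e.g. a namedtuple), matching the keyword arguments used in
# the yield. normalize() is a generator, so Rows are produced lazily
# while the transformed tree is walked.
if __name__ == "__main__":
    with open("moti_response.xml", "rb") as f:
        for row in normalize(f):
            print(row.station_id, row.time, row.variable_name,
                  row.val, row.unit)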
#!/usr/bin/env python

# Standard module
import pytz
import logging

# Installed libraries
from pkg_resources import resource_filename
from lxml.etree import XSLT, parse as xmlparse
from dateutil.parser import parse as dateparse

# Local
from crmprtd import Row

xsl = resource_filename("crmprtd", "data/moti.xsl")
transform = XSLT(xmlparse(xsl))

ns = {"xsi": "http://www.w3.org/2001/XMLSchema-instance"}

log = logging.getLogger(__name__)


def normalize(file_stream):
    log.info("Starting MOTI data normalization")
    et = xmlparse(file_stream)
    et = transform(et)
    obs_series = et.xpath("//observation-series")
    for series in obs_series:
        if not len(series):
            log.warning("Empty observation series: xpath search "
                        "'//observation-series' returned no results")
            continue
        try: