def export(self, meta):
    """Save the indicator described by ``meta`` and its extracted values.

    One ``Indicator`` is saved from ``meta``; then one ``Value`` is saved
    for every row yielded by ``self.extract(meta['fieldname'])`` that has
    both a region and a truthy value.

    Args:
        meta: mapping with the keys ``indID``, ``fieldname`` and ``unit``.
            ``period`` and ``is_number`` are optional: an absent or empty
            ``period`` falls back to ``get_period(meta['fieldname'])`` and
            an absent ``is_number`` defaults to True.
    """
    indicator = {
        'indID': meta['indID'],
        'name': self.name_for_fieldname(meta['fieldname']),
        'units': meta['unit'],
    }
    Indicator(**indicator).save()

    # Only substitute the default when the flag is absent: ``x or True``
    # would silently turn an explicit False into True.
    is_number = meta.get('is_number')
    if is_number is None:
        is_number = True

    for item in self.extract(meta['fieldname']):
        if not item.get('region'):
            logging.warning("No region in {}".format(meta))
            continue
        value = {
            'dsID': dsID,
            'region': item['region'],
            'period': meta.get('period') or get_period(meta['fieldname']),
            'value': item['value'],
            'indID': meta['indID'],
            'source': self.url,
            'is_number': is_number,
        }
        # NOTE(review): a numeric 0 is falsy and is skipped here as well as
        # empty values -- confirm that this is intended.
        if value['value']:
            print(value)
            Value(**value).save()
# NOTE(review): ``table``, ``indname`` and ``value`` are not defined in this
# excerpt; they are assumed to be set up by the code that precedes it.
code = table.filter(equal_to('Indicator Code'))
years = code.fill(xypath.RIGHT)
junction = indname.junction(years)

for ind_cell, year_cell, value_cell in junction:
    # Start from the shared template and fill in the per-cell fields.
    vdict = dict(value)
    vdict['indID'] = ind_cell.value
    vdict['period'] = year_cell.value
    vdict['value'] = value_cell.value

    # Split an identifier of the form "name (units)" into its two parts;
    # anything else is used whole as the name with the placeholder unit.
    indicator = {'indID': vdict['indID']}
    nameunits = re.search(r'(.*)\((.*)\)', vdict['indID'])
    if nameunits:
        indicator['name'], indicator['units'] = nameunits.groups()
    else:
        indicator['name'] = vdict['indID']
        indicator['units'] = 'uno'
    Indicator(**indicator).save()

    v = Value(**vdict)
    if not v.is_blank():
        v.save()

# Report how many 'World Bank' values the session sees before committing.
print(len(session.query(Value).filter(Value.dsID == 'World Bank').all()))
session.commit()

for country in getcountrylist():
    try:
        getcountry(country)
    except Exception as e:
        # Say which country failed, then let the original error propagate.
        print("%s %s" % (country, e))
        raise
# Dataset record for this scraper; ``last_updated`` is stored as an empty
# string and ``last_scraped`` is taken at import time.
dataset_data = dict(
    dsID='unterm',
    last_updated="",
    last_scraped=orm.now(),
    name='unterm',
)
DataSet(**dataset_data).save()

# One indicator per entry of ``indicators``, with the ID prefixed "unterm:".
indicator_data = [
    dict(indID='unterm:' + i, name=i, units='')
    for i in indicators
]
for db_row in indicator_data:
    Indicator(**db_row).save()

# Field reminder for the three record types used by this scraper.
"""Value: dsID, region, indID, period, value, source, is_number
DataSet: dsID, last_updated, last_scraped, name
Indicator: indID, name, units
"""

# Fields shared by every Value built from this source.
value_static = dict(dsID='unterm', period='', is_number=False)


def country_urls():
    """Post the stored search form and parse the resulting HTML page.

    The form fields come from ``rawformdata``: one ``key:value`` pair per
    line, split at the first colon.  Links in the parsed document are made
    absolute against the search URL.

    NOTE(review): no return or yield is visible in this excerpt; the
    function may continue beyond it.
    """
    url = 'http://unterm.un.org/DGAACS/unterm.nsf/0/$searchForm?SearchView=&Seq=1'

    formdata = {}
    for line in rawformdata.split('\n'):
        key, _, val = line.partition(':')
        formdata[key] = val

    response = requests.post(url, data=formdata)
    html = response.content
    root = lxml.html.fromstring(html)
    root.make_links_absolute(url)
def parse_rank(socrata_id, countries, period=2012):
    """Yield one HDI-rank value record for each country row that has a rank.

    Args:
        socrata_id: identifier formatted into ``data_url`` to build the
            ``source`` field of every record.
        countries: iterable of mappings; rows without an ``hdi_rank`` key
            are skipped, the others must also provide ``country``.
        period: period stored on every record.  Defaults to 2012, the
            value that used to be hard-coded.
            TODO: the year the rank pertains to is still only a default.

    Yields:
        dict with the keys ``dsID``, ``region``, ``period``, ``value``,
        ``indID``, ``source`` and ``is_number``, ready for ``Value(**d)``.

    Raises:
        ValueError: if an ``hdi_rank`` cannot be converted with ``int``.
    """
    source = data_url.format(socrata_id)  # identical for every row
    for country in countries:
        if 'hdi_rank' not in country:
            continue
        yield {
            "dsID": dsID,
            "region": country['country'],
            "period": period,
            "value": int(country['hdi_rank']),
            "indID": "PSE220",
            "source": source,
            "is_number": True,
        }


DataSet(**dataset).save()
maxdate = None

# Store the metadata and all numbers for every indicator in ``lookup``.
for socrata_code in lookup:
    ind = get_metadata(socrata_code)
    Indicator(**ind).save()
    for value in get_numbers(socrata_code):
        Value(**value).save()

# The HDI rank is stored as a separate indicator from its own table.
print("rank")
ind = {"indID": "PSE220", "name": "HDI Rank", "units": "rank"}
Indicator(**ind).save()
for rank in get_rank("u2dx-y6wx"):
    Value(**rank).save()
def getindicator(ind="100106", overridefunction=None):
    """Scrape one HDRStats indicator page and store its contents.

    Saves the 'HDR:HDI Rank' indicator, the 'HDRStats' dataset, the
    indicator named in the page heading, one HDI-rank value per country
    and one value per country/year cell, then commits the session.
    Cells whose text is '..' are skipped.

    Args:
        ind: indicator id; substituted into the download URL and used as
            the ``HDR:<ind>`` indicator ID.
        overridefunction: optional callable returning ``(html, baseurl)``.
            When given, it replaces the HTTP download.

    Raises:
        ValueError: if the page does not contain exactly one non-empty
            text after a ``<br>``, or that text does not contain exactly
            one ``Accessed:...from`` match.
    """
    if overridefunction:
        html, baseurl = overridefunction()
    else:
        baseurl = ('http://hdrstats.undp.org/en/indicators/'
                   'display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind)
        html = requests.get(baseurl).content

    # Template copied for every Value saved below.
    value = {
        'dsID': 'HDRStats',
        'indID': "HDR:" + ind,
        'source': baseurl,
        'is_number': True,
    }
    dataset = {
        'dsID': 'HDRStats',
        'last_scraped': orm.now(),
        'name': 'Human Development Indicators, UNDP',
    }
    indicator = {'indID': "HDR:" + ind}
    hdi_indicator = {
        'indID': 'HDR:HDI Rank',
        'name': 'Human Development Index rank',
        'units': '',
    }
    Indicator(**hdi_indicator).save()

    htmlio = StringIO.StringIO(html)
    messy = messytables.html.HTMLTableSet(htmlio)
    table = xypath.Table.from_messy(list(messy.tables)[0])
    root = lxml.html.fromstring(html)

    # Get the indicator name/units and the update time from the page text.
    indicator_text = root.xpath("//h2/text()")[-1]
    print(indicator_text)
    try:
        # Heading of the form "name (units)"; the single-element unpacking
        # raises ValueError unless there is exactly one match.
        indicator_split, = re.findall(r"(.*)\(([^\(\)]+)\)", indicator_text)
    except ValueError:
        indicator_split = [indicator_text, ""]
    indicator['name'], indicator['units'] = indicator_split
    indicator['name'] = indicator['name'].strip()

    # Exactly one <br> is expected to be followed by non-blank text.
    # ``x.tail`` is None when nothing follows the element; testing it
    # directly avoids str() failing on non-ASCII text under Python 2.
    access_text, = [
        x.tail.strip() for x in root.xpath("//br")
        if x.tail and x.tail.strip()
    ]
    access_date_raw, = re.findall('Accessed:(.*)from', access_text)
    dataset['last_updated'] = dateutil.parser.parse(
        access_date_raw).isoformat()
    # Save the dataset only once ``last_updated`` is known so the field is
    # actually stored with the record.
    DataSet(**dataset).save()
    print("%s %s * %s" % (dataset['last_updated'], indicator['name'],
                          indicator['units']))
    Indicator(**indicator).save()

    country_cell = table.filter("Country").assert_one()
    years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '')
    countries = country_cell.fill(xypath.DOWN)
    hdi_rank = table.filter("HDI Rank").assert_one()

    # One HDI-rank value per country row.
    for i in countries.junction(hdi_rank):
        newvalue = dict(value)
        newvalue['indID'] = "HDR:HDI Rank"
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        # TODO Hard coded for now because year it pertains to is not clear
        newvalue['period'] = 2012
        if newvalue['value'] != '..':
            Value(**newvalue).save()

    # One value per country/year cell of the indicator itself.
    for i in countries.junction(years):
        newvalue = dict(value)
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        newvalue['period'] = i[1].value.strip()
        if newvalue['value'] != '..':
            Value(**newvalue).save()
            print(newvalue)

    session.commit()