def do_jobs(): print "jobs" indID = "reliefweb_jobs" indicator = { 'indID': indID, 'name': "Number of jobs on ReliefWeb at specified time", 'units': 'uno' } orm.Indicator(**indicator).save() for country in countries: url = "http://api.rwlabs.org/v0/job/list" r = requests.get(url, data=get_job_query(country)) if 'data' not in r.json(): print r.json() print country continue value = { 'region': country, 'period': orm.now()[:10], # we don't need sub-day precision. 'value': r.json()['data']['total'], 'dsID': dsID, 'indID': indID, 'source': url, 'is_number': True } orm.Value(**value).save()
def save_dataset():
    """Record this scrape's dataset metadata row.

    Uses module globals `DSID` and `orm`.
    """
    orm.DataSet(
        dsID=DSID,
        last_updated=None,
        last_scraped=orm.now(),
        name="United Nations Office on Drugs and Crime",
    ).save()
def main(): for sheet in spreadsheets: print sheet shortname = sheet.split('/')[-1].split('.')[0] dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0] year_text, = re.findall('\d{4}', dsID) dataset = { "dsID": dsID, "last_updated": year_text, "last_scraped": orm.now(), "name": "esa-unpd" } orm.DataSet(**dataset).save() indicator = {"indID": shortname, "name": shortname, "units": ''} # we replace the indicator name, so not saving now. # orm.Indicator(**indicator).save() value_template = {"dsID": dsID, "is_number": True, "source": sheet} raw = dl.grab(sheet) mtables = messytables.any.any_tableset(raw) names = [x.name for x in mtables.tables] if 'ESTIMATES' in names: mt = mtables['ESTIMATES'] else: mt = mtables['PROPORTION-URBAN'] table = xypath.Table.from_messy(mt) filestring = table.filter( re.compile("File[^:]*:.*")).assert_one().value indicator['name'], indicator['units'] = parse_file_string(filestring) print indicator['name'] orm.Indicator(**indicator).save() region_header = table.filter( re.compile("Major area, region, country or area.*")).assert_one() ccode_header = table.filter(re.compile("Country.code")).assert_one() regions = region_header.fill(xypath.DOWN) years = ccode_header.fill(xypath.RIGHT) for region_cell, year_cell, value_cell in regions.junction(years): value = dict(value_template) value['indID'] = indicator['indID'] value['region'] = region_cell.value year_value = year_cell.value if isinstance(year_value, basestring) and '-' in year_value: year1, _, year2 = year_value.partition('-') year_count = int(year2) - int(year1) assert year_count == 5 year_value = "%s/P%dY" % (year1, year_count) value['period'] = year_value value['value'] = value_cell.value orm.Value(**value).save() #print value orm.session.commit()
def main(): for sheet in spreadsheets: shortname = sheet.split('/')[-1].split('.')[0] dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0] year_text, = re.findall('\d{4}', dsID) dataset = {"dsID": dsID, "last_updated": year_text, "last_scraped": orm.now(), "name": "esa-unpd"} orm.DataSet(**dataset).save() indicator = {"indID": shortname, "name": shortname, "units": '' } # we replace the indicator name, so not saving now. # orm.Indicator(**indicator).save() value_template = {"dsID": dsID, "is_number": True, "source": sheet} raw = dl.grab(sheet) mtables = messytables.any.any_tableset(raw) names = [x.name for x in mtables.tables] if 'ESTIMATES' in names: mt = mtables['ESTIMATES'] else: mt = mtables['PROPORTION-URBAN'] table = xypath.Table.from_messy(mt) filestring = table.filter(re.compile("File[^:]*:.*")).assert_one().value indicator['name'], indicator['units'] = parse_file_string(filestring) print indicator['name'] orm.Indicator(**indicator).save() region_header = table.filter(re.compile("Major area, region, country or area.*")).assert_one() ccode_header = table.filter(re.compile("Country.code")).assert_one() regions = region_header.fill(xypath.DOWN) years = ccode_header.fill(xypath.RIGHT) for region_cell, year_cell, value_cell in regions.junction(years): value = dict(value_template) value['indID'] = indicator['indID'] value['region'] = region_cell.value year_value = year_cell.value if isinstance(year_value, basestring) and '-' in year_value: year1, _, year2 = year_value.partition('-') year_count = int(year2) - int(year1) assert year_count == 5 year_value = "%s/P%dY" % (year1, year_count) value['period'] = year_value value['value'] = value_cell.value orm.Value(**value).save() #print value orm.session.commit()
def doit(targets, names, year): # country_cells: we used to assert_one(), but sometimes there's two! country_cells = table.filter('iso').fill(xypath.DOWN) country_cells = country_cells - country_cells.filter('iso') # remove other if not country_cells: print "no countries" country_year_filter = country_cells.filter( lambda b: b.shift(xypath.RIGHT).value == year) if not country_year_filter: print "no countries for ", year target_cells = table.filter(lambda b: b.value in targets) if not target_cells: print "didn't find ", targets value = { 'dsID': 'emdat', 'period': "%s/P1Y" % (year), 'source': url, 'is_number': True } dataset = { 'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT' } orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_dam': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for cname, one_country_cells in itertools.groupby(country_year_filter, lambda b: b.value): value['region'] = cname one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname) for target_cell in target_cells: j = one_country_bag.junction(target_cell) value['indID'] = 'emdat:%s' % target_cell.value value['value'] = sum(int(x[2].value) for x in j) orm.Value(**value).save() print value orm.session.commit()
def do_jobs(): print "jobs" indID = "reliefweb_jobs" indicator = {'indID': indID, 'name': "Number of jobs on ReliefWeb at specified time", 'units': 'uno'} orm.Indicator(**indicator).save() for country in countries: url = "http://api.rwlabs.org/v0/job/list" r = requests.get(url, data=get_job_query(country)) if 'data' not in r.json(): print r.json() print country continue value = {'region': country, 'period': orm.now()[:10], # we don't need sub-day precision. 'value': r.json()['data']['total'], 'dsID': dsID, 'indID': indID, 'source': url, 'is_number': True} orm.Value(**value).save()
def doit(): # country_cells: we used to assert_one(), but sometimes there's two! dataset = { 'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT' } orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_damage': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for country in country_list(): # TODO country_list print country raw = dl.grab(url.format(country)) m_tables = messytables.any.any_tableset(raw) mt, = m_tables.tables table = xypath.Table.from_messy(mt) yr = table.filter('year').assert_one() years = yr.fill(xypath.DOWN) cats = yr.fill(xypath.RIGHT) for year, cat, value in years.junction(cats): value = { 'dsID': 'emdat', 'region': country, 'indID': 'emdat:{}'.format(cat.value), 'period': '{}/P1Y'.format(year.value), 'value': value.value, 'source': url, 'is_number': True } orm.Value(**value).save() orm.session.commit()
def doit(targets, names, year): # country_cells: we used to assert_one(), but sometimes there's two! country_cells = table.filter('iso').fill(xypath.DOWN) country_cells = country_cells - country_cells.filter('iso') # remove other if not country_cells: print "no countries" country_year_filter = country_cells.filter(lambda b: b.shift(xypath.RIGHT).value == year) if not country_year_filter: print "no countries for ", year target_cells = table.filter(lambda b: b.value in targets) if not target_cells: print "didn't find ", targets value = {'dsID': 'emdat', 'period': "%s/P1Y" % (year), 'source': url, 'is_number': True} dataset = {'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT'} orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_dam': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for cname, one_country_cells in itertools.groupby(country_year_filter, lambda b: b.value): value['region'] = cname one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname) for target_cell in target_cells: j = one_country_bag.junction(target_cell) value['indID'] = 'emdat:%s' % target_cell.value value['value'] = sum(int(x[2].value) for x in j) orm.Value(**value).save() print value orm.session.commit()
def doit(): # country_cells: we used to assert_one(), but sometimes there's two! dataset = {'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT'} orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_damage': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for country in country_list(): # TODO country_list print country raw = dl.grab(url.format(country)) m_tables = messytables.any.any_tableset(raw) mt, = m_tables.tables table = xypath.Table.from_messy(mt) yr = table.filter('year').assert_one() years = yr.fill(xypath.DOWN) cats = yr.fill(xypath.RIGHT) for year, cat, value in years.junction(cats): value = {'dsID': 'emdat', 'region': country, 'indID': 'emdat:{}'.format(cat.value), 'period': '{}/P1Y'.format(year.value), 'value': value.value, 'source': url, 'is_number': True} orm.Value(**value).save() orm.session.commit()
"665", # improved water "668", # improved sanitation "553", # maternal mortality "561", # under 5 mortality "589", # primary education ratio "559", # severely underweight "755", # } "756", # } telecoms x3 "605", # } "640", # energy consumption ] dataset = { "dsID": "mdgs", "last_updated": None, "last_scraped": orm.now(), "name": "Millennium Development Goals" } value_template = {"dsID": "mdgs", "is_number": True} def do_indicator(ind="566"): baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s" url = baseurl % ind value_template['source'] = url handle = dl.grab(url) mt, = messytables.any.any_tableset(handle).tables table = xypath.Table.from_messy(mt) country_anchor = table.filter("Country").assert_one() years = country_anchor.fill(xypath.RIGHT).filter(re.compile("\d\d\d\d"))
Economy
Transport
Education
Demographics
Religion
""".strip().lower().split('\n')
# NOTE(review): the lines above are the tail of an open triple-quoted
# string whose start is outside this chunk; the one-header-per-line
# layout is reconstructed to satisfy the .split('\n') — confirm against
# the original file.

import orm

"""Value: dsID, region, indID, period, value, source, is_number
DataSet: dsID, last_updated, last_scraped, name
Indicator: indID, name, units
"""

dataset = {'dsID': 'wikipedia',
           'last_updated': None,  # TODO
           'last_scraped': orm.now(),
           'name': 'Wikipedia'}
orm.DataSet(**dataset).save()

# One URL-valued indicator per Wikipedia section header.
for h in headers:
    indicator = {'indID': 'wikipedia:' + h,
                 'name': 'Wikipedia: ' + h,
                 'units': 'url'}
    orm.Indicator(**indicator).save()

# Template for value rows; region/indID/value/source filled in later.
value_template = {'dsID': 'wikipedia', 'period': None, 'is_number': False}
import re
import datetime
import requests
from orm import session, Value, DataSet, Indicator
import orm

"""Value: dsID, region, indID, period, value, source, is_number
DataSet: dsID, last_updated, last_scraped, name
Indicator: indID, name, units
"""

dsID = "data.undp.org"
dataset = {"dsID": dsID,
           "last_updated": None,  # TODO max(pubdate)
           "last_scraped": orm.now(),
           "name": "UNDP Open Data"}

# Socrata endpoints: metadata view and row-data resource, keyed by 4x4 id.
metadata_url = "https://data.undp.org/api/views/{}/rows.json?accessType=DOWNLOAD"
data_url = "http://data.undp.org/resource/{}.json"

# Socrata dataset id -> internal indicator id.
lookup = {"u2dx-y6wx": "PSE110",  # GNI per capita in PPP terms (constant 2005 international $)
          "bkr7-unqh": "PVE010",  # Public expenditure on education (% of GDP) (%)
          "m67k-vi5c": "PVE110",  # Mean years of schooling (of adults)|years
          "jbhn-xkjv": "PVE120",  # Combined gross enrolment in education (both sexes)
          "ehe9-pgud": "PSE160",  # MPI: Population living below $1.25 PPP per day (%)
          "a4ay-qce2": "PVH120",  # Under-five mortality
          "bh77-rzbn": "HDR:68606",  # GII: Gender Inequality Index, value
          "qnam-f624": "PVE030",  # Expected Year of Schooling (of children)
          "4gkx-mq89": "PVH180",  # Maternal mortality ratio
          "x22y-8m6h": "PVE040",  # Adult literacy rate, both sexes (% aged 15 and above)
          # "---------": "------",  # Impact of natural disasters: number of deaths
          # NOTE(review): chunk truncated — the dict continues past this view.
def getindicator(ind="100106", overridefunction=None): if not overridefunction: baseurl = 'http://hdrstats.undp.org/en/indicators/display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind html = requests.get(baseurl).content else: html, baseurl = overridefunction() value = { 'dsID': 'HDRStats', 'indID': "HDR:" + ind, 'source': baseurl, 'is_number': True } dataset = { 'dsID': 'HDRStats', 'last_scraped': orm.now(), 'name': 'Human Development Indicators, UNDP' } indicator = {'indID': "HDR:" + ind} hdi_indicator = { 'indID': 'HDR:HDI Rank', 'name': 'Human Development Index rank', 'units': '' } Indicator(**hdi_indicator).save() DataSet(**dataset).save() print html exit(3) htmlio = StringIO.StringIO(html) messy = messytables.html.HTMLTableSet(htmlio) table = xypath.Table.from_messy(list(messy.tables)[0]) root = lxml.html.fromstring(html) "get odd indicator / update time" indicator_text = root.xpath("//h2/text()")[-1] print indicator_text try: indicator_split, = re.findall("(.*)\(([^\(\)]+)\)", indicator_text) except ValueError: indicator_split = [indicator_text, ""] indicator['name'], indicator['units'] = indicator_split indicator['name'] = indicator['name'].strip() access_text, = [ x.tail.strip() for x in root.xpath("//br") if str(x.tail) != "None" and x.tail.strip() ] access_date_raw, = re.findall('Accessed:(.*)from', access_text) dataset['last_updated'] = dateutil.parser.parse( access_date_raw).isoformat() print dataset['last_updated'], indicator['name'], "*", indicator['units'] Indicator(**indicator).save() country_cell = table.filter("Country").assert_one() years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '') countries = country_cell.fill(xypath.DOWN) hdi_rank = table.filter("HDI Rank").assert_one() max_year = max(year.value for year in years) for i in countries.junction(hdi_rank): newvalue = dict(value) newvalue['indID'] = "HDR:HDI Rank" newvalue['region'] = get_region(i[0]) newvalue['value'] = i[2].value.strip() newvalue[ 'period'] = 2012 # 
TODO Hard coded for now because year it pertains to is not clear if newvalue['value'].strip() != '..': Value(**newvalue).save() for i in countries.junction(years): newvalue = dict(value) newvalue['region'] = get_region(i[0]) newvalue['value'] = i[2].value.strip() newvalue['period'] = i[1].value.strip() if newvalue['value'].strip() != '..': Value(**newvalue).save() print newvalue session.commit()
def yeartotimestamp(year):
    # Unix timestamp for midnight on 1 January of `year`.
    # NOTE(review): '%s' is a non-portable strftime extension (glibc) and
    # uses the local timezone — confirm that is intended.
    d = datetime.datetime(year=year, month=1, day=1)
    return int(d.strftime('%s'))


def getcountrylist():
    # Yield the region code of every stored value for indicator CG060.
    for value in orm.session.query(
            orm.Value).filter(orm.Value.indID == "CG060").all():
        yield value.region


dsID = "reliefweb-api"
dataset = {
    'dsID': dsID,
    'last_updated': orm.now(),
    'last_scraped': orm.now(),
    'name': "ReliefWeb API"
}
orm.DataSet(**dataset).save()

# NOTE(review): chunk truncated — this product-name list (one per line)
# continues past this view; closing quotes are outside it.
ocha_products = """Situation Report
Humanitarian Bulletin
Humanitarian Dashboard
Humanitarian Snapshot
Key Messages
Press Release
Press Review
Statement/Speech
Other
# NOTE(review): fragment — the indented lines below are the body of a
# generator (presumably `def accuweather():`) whose def line is outside
# this chunk; indentation reconstructed accordingly — confirm against the
# original file.
    baseindexurl = "http://www.accuweather.com/ajax-service/getcountrylist?region=%s&languageID=1"
    baseleafurl = "http://www.accuweather.com/en/%s/%s-weather"
    regions = "afr ant arc asi cac eur mea nam ocn sam".split(" ")
    for reg in regions:
        j = requests.get(baseindexurl % reg).json()
        for country in j['Countries']:
            # One row per country: region code -> its AccuWeather page URL.
            yield {'region': country['Code'],
                   'value': baseleafurl % (country['Code'],
                                           country['OfficialName'])}


print list(accuweather())
orm.DataSet(dsID="accuweather",
            last_updated=None,
            last_scraped=orm.now(),
            name="Accuweather").save()
orm.Indicator(indID="accuweather_url",
              name="AccuWeather URL",
              units="").save()

# Template for value rows; region/value come from accuweather().
valuetemplate = {'dsID': 'accuweather',
                 'indID': 'accuweather_url',
                 'period': None,
                 'source': 'http://www.accuweather.com'}
for datarow in accuweather():
    olap_row = dict(valuetemplate)
    olap_row.update(datarow)
    orm.Value(**olap_row).save()
# NOTE(review): fragment — the indented loop below is the tail of a
# generator (presumably `def accuweather():`) whose header is outside this
# chunk; indentation reconstructed accordingly — confirm against the
# original file.
    for reg in regions:
        j = requests.get(baseindexurl % reg).json()
        for country in j['Countries']:
            # One row per country: region code -> its AccuWeather page URL.
            yield {
                'region': country['Code'],
                'value': baseleafurl % (country['Code'],
                                        country['OfficialName'])
            }


print list(accuweather())
orm.DataSet(dsID="accuweather",
            last_updated=None,
            last_scraped=orm.now(),
            name="Accuweather").save()
orm.Indicator(indID="accuweather_url",
              name="AccuWeather URL",
              units="").save()

# Template for value rows; region/value come from accuweather().
valuetemplate = {
    'dsID': 'accuweather',
    'indID': 'accuweather_url',
    'period': None,
    'source': 'http://www.accuweather.com'
}
for datarow in accuweather():
    olap_row = dict(valuetemplate)
    olap_row.update(datarow)
    orm.Value(**olap_row).save()
""" def yeartotimestamp(year): d = datetime.datetime(year=year, month=1, day=1) return int(d.strftime('%s')) def getcountrylist(): for value in orm.session.query(orm.Value).filter(orm.Value.indID == "CG060").all(): yield value.region dsID = "reliefweb-api" dataset = {'dsID': dsID, 'last_updated': orm.now(), 'last_scraped': orm.now(), 'name': "ReliefWeb API"} orm.DataSet(**dataset).save() ocha_products = """Situation Report Humanitarian Bulletin Humanitarian Dashboard Humanitarian Snapshot Key Messages Press Release Press Review Statement/Speech Other Thematic Map
def getindicator(ind="100106", overridefunction=None): if not overridefunction: baseurl = 'http://hdrstats.undp.org/en/indicators/display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind html = requests.get(baseurl).content else: html, baseurl = overridefunction() value = {'dsID': 'HDRStats', 'indID': "HDR:"+ind, 'source': baseurl, 'is_number': True} dataset = {'dsID': 'HDRStats', 'last_scraped': orm.now(), 'name': 'Human Development Indicators, UNDP'} indicator = {'indID': "HDR:"+ind} hdi_indicator = {'indID': 'HDR:HDI Rank', 'name': 'Human Development Index rank', 'units': ''} Indicator(**hdi_indicator).save() DataSet(**dataset).save() print html exit(3) htmlio = StringIO.StringIO(html) messy = messytables.html.HTMLTableSet(htmlio) table = xypath.Table.from_messy(list(messy.tables)[0]) root = lxml.html.fromstring(html) "get odd indicator / update time" indicator_text = root.xpath("//h2/text()")[-1] print indicator_text try: indicator_split, = re.findall("(.*)\(([^\(\)]+)\)", indicator_text) except ValueError: indicator_split = [indicator_text, ""] indicator['name'], indicator['units'] = indicator_split indicator['name'] = indicator['name'].strip() access_text, = [x.tail.strip() for x in root.xpath("//br") if str(x.tail) != "None" and x.tail.strip()] access_date_raw, = re.findall('Accessed:(.*)from', access_text) dataset['last_updated'] = dateutil.parser.parse(access_date_raw).isoformat() print dataset['last_updated'], indicator['name'], "*", indicator['units'] Indicator(**indicator).save() country_cell = table.filter("Country").assert_one() years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '') countries = country_cell.fill(xypath.DOWN) hdi_rank = table.filter("HDI Rank").assert_one() max_year = max(year.value for year in years) for i in countries.junction(hdi_rank): newvalue = dict(value) newvalue['indID'] = "HDR:HDI Rank" newvalue['region'] = get_region(i[0]) newvalue['value'] = i[2].value.strip() newvalue['period'] = 2012 # TODO Hard coded 
for now because year it pertains to is not clear if newvalue['value'].strip() != '..': Value(**newvalue).save() for i in countries.junction(years): newvalue = dict(value) newvalue['region'] = get_region(i[0]) newvalue['value'] = i[2].value.strip() newvalue['period'] =i[1].value.strip() if newvalue['value'].strip() != '..': Value(**newvalue).save() print newvalue session.commit()