def save_dataset(): dataset = { "dsID": DSID, "last_updated": None, "last_scraped": orm.now(), "name": "United Nations Office on Drugs and Crime" } orm.DataSet(**dataset).save()
def main(): for sheet in spreadsheets: print sheet shortname = sheet.split('/')[-1].split('.')[0] dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0] year_text, = re.findall('\d{4}', dsID) dataset = { "dsID": dsID, "last_updated": year_text, "last_scraped": orm.now(), "name": "esa-unpd" } orm.DataSet(**dataset).save() indicator = {"indID": shortname, "name": shortname, "units": ''} # we replace the indicator name, so not saving now. # orm.Indicator(**indicator).save() value_template = {"dsID": dsID, "is_number": True, "source": sheet} raw = dl.grab(sheet) mtables = messytables.any.any_tableset(raw) names = [x.name for x in mtables.tables] if 'ESTIMATES' in names: mt = mtables['ESTIMATES'] else: mt = mtables['PROPORTION-URBAN'] table = xypath.Table.from_messy(mt) filestring = table.filter( re.compile("File[^:]*:.*")).assert_one().value indicator['name'], indicator['units'] = parse_file_string(filestring) print indicator['name'] orm.Indicator(**indicator).save() region_header = table.filter( re.compile("Major area, region, country or area.*")).assert_one() ccode_header = table.filter(re.compile("Country.code")).assert_one() regions = region_header.fill(xypath.DOWN) years = ccode_header.fill(xypath.RIGHT) for region_cell, year_cell, value_cell in regions.junction(years): value = dict(value_template) value['indID'] = indicator['indID'] value['region'] = region_cell.value year_value = year_cell.value if isinstance(year_value, basestring) and '-' in year_value: year1, _, year2 = year_value.partition('-') year_count = int(year2) - int(year1) assert year_count == 5 year_value = "%s/P%dY" % (year1, year_count) value['period'] = year_value value['value'] = value_cell.value orm.Value(**value).save() #print value orm.session.commit()
def doit(targets, names, year): # country_cells: we used to assert_one(), but sometimes there's two! country_cells = table.filter('iso').fill(xypath.DOWN) country_cells = country_cells - country_cells.filter('iso') # remove other if not country_cells: print "no countries" country_year_filter = country_cells.filter( lambda b: b.shift(xypath.RIGHT).value == year) if not country_year_filter: print "no countries for ", year target_cells = table.filter(lambda b: b.value in targets) if not target_cells: print "didn't find ", targets value = { 'dsID': 'emdat', 'period': "%s/P1Y" % (year), 'source': url, 'is_number': True } dataset = { 'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT' } orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_dam': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for cname, one_country_cells in itertools.groupby(country_year_filter, lambda b: b.value): value['region'] = cname one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname) for target_cell in target_cells: j = one_country_bag.junction(target_cell) value['indID'] = 'emdat:%s' % target_cell.value value['value'] = sum(int(x[2].value) for x in j) orm.Value(**value).save() print value orm.session.commit()
def doit(): # country_cells: we used to assert_one(), but sometimes there's two! dataset = { 'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT' } orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_damage': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for country in country_list(): # TODO country_list print country raw = dl.grab(url.format(country)) m_tables = messytables.any.any_tableset(raw) mt, = m_tables.tables table = xypath.Table.from_messy(mt) yr = table.filter('year').assert_one() years = yr.fill(xypath.DOWN) cats = yr.fill(xypath.RIGHT) for year, cat, value in years.junction(cats): value = { 'dsID': 'emdat', 'region': country, 'indID': 'emdat:{}'.format(cat.value), 'period': '{}/P1Y'.format(year.value), 'value': value.value, 'source': url, 'is_number': True } orm.Value(**value).save() orm.session.commit()
mt, = messytables.any.any_tableset(handle).tables table = xypath.Table.from_messy(mt) country_anchor = table.filter("Country").assert_one() years = country_anchor.fill(xypath.RIGHT).filter(re.compile("\d\d\d\d")) countries = country_anchor.fill(xypath.DOWN) indicator = table.filter("Series").shift(xypath.DOWN).value SEPARATOR = ', ' if SEPARATOR in indicator: i_name = SEPARATOR.join(indicator.split(SEPARATOR)[:-1]) i_unit = indicator.split(SEPARATOR)[-1] else: i_name = indicator i_unit = '' value_template['indID'] = indicator assert i_name indicator = {'indID': indicator, 'name': i_name, 'units': i_unit} orm.Indicator(**indicator).save() # countries also gets some rubbish, but junction will ignore it. for c_cell, y_cell, v_cell in countries.junction(years): value = dict(value_template) value['region'] = c_cell.value value['period'] = y_cell.value value['value'] = v_cell.value orm.Value(**value).save() orm.DataSet(**dataset).save() for ind in indicators: do_indicator(ind) orm.session.commit()
'REGION (DISPLAY)': 'x_supregion', 'GHO (DISPLAY)': 'indname' } """Value: dsID, region, indID, period, value, source, is_number DataSet: dsID, last_updated, last_scraped, name Indicator: indID, name, units """ dataset_template = { "dsID": "athena-api", "last_updated": None, "last_scraped": orm.now(), "name": "WHO Athena API" } orm.DataSet(**dataset_template).save() indicators = list( "CM_01,CM_02,CM_03,DEVICES09,DEVICES22,HIV_0000000020,MALARIA001,MDG_0000000001,MDG_0000000003,MDG_0000000005,MDG_0000000005_AGE1519,MDG_0000000006,MDG_0000000006_AGE1519,MDG_0000000007,MDG_0000000010,MDG_0000000011,MDG_0000000013,MDG_0000000014,MDG_0000000017,MDG_0000000020,MDG_0000000023,MDG_0000000025,MDG_0000000025_AGE1519,MDG_0000000026,MDG_0000000029,MDG_0000000031,MDG_0000000033,MDG_0000000034,MH_17,MH_18,MH_6,SA_0000001688,TB_1,TB_tot_newrel,TOBACCO_0000000192,WHOSIS_000001,WHOSIS_000003,WHOSIS_000005,WHOSIS_000011,WHOSIS_000015,WHS10_1,WHS10_8,WHS10_9,WHS2_138,WHS2_162,WHS2_163,WHS2_164,WHS2_165,WHS2_166,WHS2_167,WHS2_168,WHS2_170,WHS2_171,WHS2_172,WHS2_173,WHS2_174,WHS2_3070_all,WHS2_3070_cancer,WHS2_3070_cdd,WHS2_3070_chronic,WHS2_513,WHS2_514,WHS2_515,WHS2_516,WHS2_523,WHS3_40,WHS3_41,WHS3_42,WHS3_43,WHS3_45,WHS3_46,WHS3_47,WHS3_48,WHS3_49,WHS3_50,WHS3_51,WHS3_52,WHS3_53,WHS3_55,WHS3_56,WHS3_57,WHS3_62,WHS4_100,WHS4_106,WHS4_107,WHS4_108,WHS4_111,WHS4_111_AGE1519,WHS4_115,WHS4_117,WHS4_124,WHS4_128,WHS4_129,WHS4_154,WHS4_2530,WHS6_101,WHS6_102,WHS6_116,WHS6_123,WHS6_125,WHS6_127,WHS6_136,WHS6_140,WHS6_144,WHS6_148,WHS6_150,WHS6_517,WHS6_518,WHS6_519,WHS6_520,WHS7_103,WHS7_104,WHS7_105,WHS7_108,WHS7_113,WHS7_120,WHS7_134,WHS7_139,WHS7_143,WHS7_147,WHS7_149,WHS7_156,WHS8_110,WHS9_85,WHS9_86,WHS9_88,WHS9_89,WHS9_90,WHS9_91,WHS9_92,WHS9_93,WHS9_95,WHS9_96,WHS9_97,WHS9_CBR,WHS9_CDR,WHS9_CS,CM_01,CM_02,CM_03,DEVICES09,DEVICES22,HIV_0000000020,MALARIA001,MDG_0000000001,MDG_0000000003,MDG_0000000005,MDG_0000000005_AGE1519,MDG_0000000006,MDG_0000000006_AGE1519,MDG_0000000007,MDG_0000000010,MDG_0000000011,MDG_0000000013,MDG_0000000014,MDG_0000000017,MDG_0000000020,MDG_0000000023,MDG_0000000025,MDG_0000000025_AGE1519,MDG_0000000026,MDG_0000000029,MDG_0000000031,MDG_0000000033,MDG_0000000034,MH_17,MH_18,MH_6,SA_0000001688,TB_1,TB_tot_newrel,TOBACCO_0000000192,WHOSIS_000001,WHOSIS_000003,WHOSIS_000005,WHOSIS_000011,WHOSIS_000015,WHS10_1,WHS10_8,WHS10_9,WHS2_138,WHS2_162,WHS2_163,WHS2_164,WHS2_165,WHS2_166,WHS2_167,WHS2_168,WHS2_170,WHS2_171,WHS2_172,WHS2_173,WHS2_174,WHS2_3070_all,WHS2_3070_cancer,WHS2_3070_cdd,WHS2_3070_chronic,WHS2_513,WHS2_514,WHS2_515,WHS2_516,WHS2_523,WHS3_40,WHS3_41,WHS3_42,WHS3_43,WHS3_45,WHS3_46,WHS3_47,WHS3_48,WHS3_49,WHS3_50,WHS3_51,WHS3_52,WHS3_53,WHS3_55,WHS3_56,WHS3_57,WHS3_62,WHS4_100,WHS4_106,WHS4_107,WHS4_108,WHS4_111,WHS4_111_AGE1519,WHS4_115,WHS4_117,WHS4_124,WHS4_128,WHS4_129,WHS4_154,WHS4_2530,WHS6_101,WHS6_102,WHS6_116,WHS6_123,WHS6_125,WHS6_127,WHS6_136,WHS6_140,WHS6_144,WHS6_148,WHS6_150,WHS6_517,WHS6_518,WHS6_519,WHS6_520,WHS7_103,WHS7_104,WHS7_105,WHS7_108,WHS7_113,WHS7_120,WHS7_134,WHS7_139,WHS7_143,WHS7_147,WHS7_149,WHS7_156,WHS8_110,WHS9_85,WHS9_86,WHS9_88,WHS9_89,WHS9_90,WHS9_91,WHS9_92,WHS9_93,WHS9_95,WHS9_96,WHS9_97,WHS9_CBR,WHS9_CDR,WHS9_CS,WHS9_86,WHS9_88,WHS9_89,WHS9_92,WHS9_96,WHS9_97,WHS9_90,WHS3_48,MALARIA002,MALARIA001,MALARIA003,MDG_0000000013,MDG_0000000014,MENING_2,MENING_1,MENING_3,CHOLERA_0000000001,CHOLERA_0000000002,CHOLERA_0000000003,MDG_0000000027,WHOSIS_000009,NUTRITION_564,sba,sba3,sba5,anc1,anc13,MDG_0000000015,MDG_0000000021,anc4,anc43,anc45" .split(",")) indicators = reversed(indicators) baseurl = "http://apps.who.int/gho/athena/data/GHO/%s.csv?profile=verbose" def units(s): # print repr(s) z = re.findall("(.*)\(([^)]*)\)$", s) if z: assert len(z) == 1
regions = "afr ant arc asi cac eur mea nam ocn sam".split(" ") for reg in regions: j = requests.get(baseindexurl % reg).json() for country in j['Countries']: yield { 'region': country['Code'], 'value': baseleafurl % (country['Code'], country['OfficialName']) } print list(accuweather()) orm.DataSet(dsID="accuweather", last_updated=None, last_scraped=orm.now(), name="Accuweather").save() orm.Indicator(indID="accuweather_url", name="AccuWeather URL", units="").save() valuetemplate = { 'dsID': 'accuweather', 'indID': 'accuweather_url', 'period': None, 'source': 'http://www.accuweather.com' } for datarow in accuweather(): olap_row = dict(valuetemplate) olap_row.update(datarow) orm.Value(**olap_row).save()