Пример #1
0
def do_jobs():
    print "jobs"
    indID = "reliefweb_jobs"
    indicator = {
        'indID': indID,
        'name': "Number of jobs on ReliefWeb at specified time",
        'units': 'uno'
    }
    orm.Indicator(**indicator).save()
    for country in countries:
        url = "http://api.rwlabs.org/v0/job/list"
        r = requests.get(url, data=get_job_query(country))
        if 'data' not in r.json():
            print r.json()
            print country
            continue
        value = {
            'region': country,
            'period': orm.now()[:10],  # we don't need sub-day precision.
            'value': r.json()['data']['total'],
            'dsID': dsID,
            'indID': indID,
            'source': url,
            'is_number': True
        }
        orm.Value(**value).save()
Пример #2
0
def do_indicator(ind="566"):
    baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s"
    url = baseurl % ind
    value_template['source'] = url
    handle = dl.grab(url)
    mt, = messytables.any.any_tableset(handle).tables
    table = xypath.Table.from_messy(mt)
    country_anchor = table.filter("Country").assert_one()
    years = country_anchor.fill(xypath.RIGHT).filter(re.compile("\d\d\d\d"))
    countries = country_anchor.fill(xypath.DOWN)
    indicator = table.filter("Series").shift(xypath.DOWN).value
    SEPARATOR = ', '
    if SEPARATOR in indicator:
        i_name = SEPARATOR.join(indicator.split(SEPARATOR)[:-1])
        i_unit = indicator.split(SEPARATOR)[-1]
    else:
        i_name = indicator
        i_unit = ''
    value_template['indID'] = indicator
    assert i_name
    indicator = {'indID': indicator, 'name': i_name, 'units': i_unit}
    orm.Indicator(**indicator).save()
    # countries also gets some rubbish, but junction will ignore it.
    for c_cell, y_cell, v_cell in countries.junction(years):
        value = dict(value_template)
        value['region'] = c_cell.value
        value['period'] = y_cell.value
        value['value'] = v_cell.value
        orm.Value(**value).save()
def getstats(url, country="PLACEHOLDER"):
    handle = dl.grab(url)
    mts = messytables.any.any_tableset(handle)
    saves = 0
    for mt in mts.tables:
        table = xypath.Table.from_messy(mt)
        inds = table.filter(lambda b: b.x == 0 and "EPI" in b.value)
        if not inds:
            continue
        assert len(inds) == 1
        top, = table.filter(lambda b: 'to the top' in b.value)
        value, = inds.junction(top)
        for ind in inds:
            split = split_ind(ind.value)
            values_tosave = dict(value_template)
            values_tosave['source'] = url
            values_tosave['region'] = country
            values_tosave['value'] = value[2].value
            indicator = {
                'indID': split['indID'],
                'name': split['indID'],
                'units': split['units']
            }
            orm.Indicator(**indicator).save()
            values_tosave['indID'] = split['indID']
            orm.Value(**values_tosave).save()
            saves = saves + 1
    if saves != 1:
        print "huh, %d saves for %r" % (saves, url)
Пример #4
0
def main():
    for sheet in spreadsheets:
        print sheet
        shortname = sheet.split('/')[-1].split('.')[0]
        dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
        year_text, = re.findall('\d{4}', dsID)
        dataset = {
            "dsID": dsID,
            "last_updated": year_text,
            "last_scraped": orm.now(),
            "name": "esa-unpd"
        }

        orm.DataSet(**dataset).save()
        indicator = {"indID": shortname, "name": shortname, "units": ''}
        # we replace the indicator name, so not saving now.
        # orm.Indicator(**indicator).save()
        value_template = {"dsID": dsID, "is_number": True, "source": sheet}

        raw = dl.grab(sheet)
        mtables = messytables.any.any_tableset(raw)
        names = [x.name for x in mtables.tables]
        if 'ESTIMATES' in names:
            mt = mtables['ESTIMATES']
        else:
            mt = mtables['PROPORTION-URBAN']
        table = xypath.Table.from_messy(mt)

        filestring = table.filter(
            re.compile("File[^:]*:.*")).assert_one().value
        indicator['name'], indicator['units'] = parse_file_string(filestring)
        print indicator['name']
        orm.Indicator(**indicator).save()

        region_header = table.filter(
            re.compile("Major area, region, country or area.*")).assert_one()
        ccode_header = table.filter(re.compile("Country.code")).assert_one()
        regions = region_header.fill(xypath.DOWN)
        years = ccode_header.fill(xypath.RIGHT)
        for region_cell, year_cell, value_cell in regions.junction(years):
            value = dict(value_template)
            value['indID'] = indicator['indID']
            value['region'] = region_cell.value
            year_value = year_cell.value
            if isinstance(year_value, basestring) and '-' in year_value:
                year1, _, year2 = year_value.partition('-')
                year_count = int(year2) - int(year1)
                assert year_count == 5
                year_value = "%s/P%dY" % (year1, year_count)
            value['period'] = year_value
            value['value'] = value_cell.value
            orm.Value(**value).save()
            #print value
    orm.session.commit()
Пример #5
0
def do_indicator(ind):
    print "indicator:", ind
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    try:
        headers = mt_list[0]
    except IndexError:
        headers = []
    if len(headers) == 0:
        print "Error getting headers from ", ind
        raise RuntimeError("No header in {}".format(ind))
    logging.warn("headers {!r}".format(headers))
    rest = mt_list[1:]
    for row in rest:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {x[0].value: x[1].value for x in zip(headers, row)}
        try:
            name, unit = units(rowdict['GHO (DISPLAY)'])
        except Exception:
            fh.seek(0)
            print fh.read()
            raise
        indID = rowdict['GHO (CODE)']
        for lookup in [
                "SEX", "RESIDENCEAREATYPE", "EDUCATIONLEVEL", "WEALTHQUINTILE"
        ]:
            lookup_code = lookup + " (CODE)"
            lookup_name = lookup + " (DISPLAY)"
            if lookup_code in rowdict:  # header = "SEX (CODE)"
                if rowdict[lookup_code]:  # value != ""
                    indID = indID + "({}={})".format(lookup,
                                                     rowdict[lookup_code])
                    name = name + " - " + rowdict[lookup_name]

        value_dict = {
            "value": rowdict['Display Value'],
            "period": rowdict['YEAR (DISPLAY)'],
            "indID": indID,
            "region": rowdict["COUNTRY (CODE)"],
            "dsID": "athena-api",
            "source": baseurl % ind,
            "is_number": True
        }

        indicator_dict = {'indID': indID, 'name': name, 'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()
Пример #6
0
def doit(targets, names, year):
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells: print "no countries"
    country_year_filter = country_cells.filter(
        lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter: print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells: print "didn't find ", targets

    value = {
        'dsID': 'emdat',
        'period': "%s/P1Y" % (year),
        'source': url,
        'is_number': True
    }

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_dam':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()
    for cname, one_country_cells in itertools.groupby(country_year_filter,
                                                      lambda b: b.value):
        value['region'] = cname
        one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname)
        for target_cell in target_cells:
            j = one_country_bag.junction(target_cell)
            value['indID'] = 'emdat:%s' % target_cell.value
            value['value'] = sum(int(x[2].value) for x in j)
            orm.Value(**value).save()
            print value
    orm.session.commit()
Пример #7
0
def doit():
    # country_cells: we used to assert_one(), but sometimes there's two!

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_damage':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()

    for country in country_list():  # TODO country_list
        print country
        raw = dl.grab(url.format(country))
        m_tables = messytables.any.any_tableset(raw)
        mt, = m_tables.tables
        table = xypath.Table.from_messy(mt)
        yr = table.filter('year').assert_one()
        years = yr.fill(xypath.DOWN)
        cats = yr.fill(xypath.RIGHT)
        for year, cat, value in years.junction(cats):
            value = {
                'dsID': 'emdat',
                'region': country,
                'indID': 'emdat:{}'.format(cat.value),
                'period': '{}/P1Y'.format(year.value),
                'value': value.value,
                'source': url,
                'is_number': True
            }
            orm.Value(**value).save()
    orm.session.commit()
Пример #8
0
def do_products():
    for product in ocha_products:
        print product
        niceproduct = product.replace(" ", "_")
        indID = "reliefweb_" + niceproduct
        indicator = {
            'indID': indID,
            'name':
            "Number of ReliefWeb reports flagged with ocha_product: %s" %
            product,
            'units': 'uno'
        }
        orm.Indicator(**indicator).save()

        for country in countries:
            for year in range(1990, 2014):
                params = dict()
                params['PRODUCT'] = product
                params['COUNTRY'] = country
                params['FROM'] = 1000 * yeartotimestamp(year)
                params['TO'] = 1000 * yeartotimestamp(year + 1) - 1
                url = "http://api.rwlabs.org/v0/report/list"
                r = requests.get(url, data=get_product_query(**params))
                if 'data' not in r.json():
                    print r.json()
                    print country
                    continue
                value = {
                    'region': country,
                    'period': str(year),
                    'value': r.json()['data']['total'],
                    'dsID': dsID,
                    'indID': indID,
                    'source': url,
                    'is_number': True
                }
                orm.Value(**value).save()
Пример #9
0
def do_indicator(ind):
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    headers = mt_list[0]
    if len(headers) == 0:
        print ind
        exit()
    rest = mt_list[1:]
    for row in rest:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {x[0].value: x[1].value for x in zip(headers, row)}
        name, unit = units(rowdict['GHO (DISPLAY)'])
        if 'SEX (CODE)' in rowdict:
            indID = rowdict['GHO (CODE)'] + "-" + rowdict['SEX (CODE)']
            name = name + " - " + rowdict['SEX (DISPLAY)']
        else:
            indID = rowdict['GHO (CODE)']

        value_dict = {
            "value": rowdict['Display Value'],
            "period": rowdict['YEAR (DISPLAY)'],
            "indID": indID,
            "region": rowdict["COUNTRY (CODE)"],
            "dsID": "athena-api",
            "source": baseurl % ind,
            "is_number": True
        }

        indicator_dict = {'indID': indID, 'name': name, 'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()
        print value_dict
Пример #10
0
   DataSet: dsID, last_updated, last_scraped, name
      Indicator: indID, name, units
         """

dataset = {'dsID': 'wikipedia',
           'last_updated': None,  # TODO
           'last_scraped': orm.now(),
           'name': 'Wikipedia'}

orm.DataSet(**dataset).save()

for h in headers:
    indicator = {'indID': 'wikipedia:' + h,
                 'name': 'Wikipedia: ' + h,
                 'units': 'url'}
    orm.Indicator(**indicator).save()

value_template = {'dsID': 'wikipedia',
                  'period': None,
                  'is_number': False}


def exact_match(level, header, country):
    matches = [x.lower() for x in country[level] if x == header]
    if len(matches) > 1:
        print matches, "\n\n\n\n"
    if matches:
        return matches[0]


def partial_match(level, header, country):
Пример #11
0
def save_indicator():
    indicator = {"indID": INDID, "name": IND_NAME, "units": "count"}
    orm.Indicator(**indicator).save()
    for reg in regions:
        j = requests.get(baseindexurl % reg).json()
        for country in j['Countries']:
            yield {
                'region': country['Code'],
                'value':
                baseleafurl % (country['Code'], country['OfficialName'])
            }


print list(accuweather())

orm.DataSet(dsID="accuweather",
            last_updated=None,
            last_scraped=orm.now(),
            name="Accuweather").save()

orm.Indicator(indID="accuweather_url", name="AccuWeather URL", units="").save()

valuetemplate = {
    'dsID': 'accuweather',
    'indID': 'accuweather_url',
    'period': None,
    'source': 'http://www.accuweather.com'
}

for datarow in accuweather():
    olap_row = dict(valuetemplate)
    olap_row.update(datarow)
    orm.Value(**olap_row).save()
Пример #13
0
}

indicators = [{
    'indID': 'm49-name',
    'name': 'm49-name',
    'units': 'string'
}, {
    'indID': 'm49-num',
    'name': 'm49-num',
    'units': 'string'
}]

orm.DataSet(**dataset).save()

for x in indicators:
    orm.Indicator(**x).save()

mt = messytables.HTMLTableSet(fh)
mt_prune = [x for x in mt.tables if len(list(x)) > 200]
assert len(mt_prune) == 1

gb = False
for messy in mt_prune:
    table = xypath.Table.from_messy(messy)
    alpha_code_header = table.filter(
        contains_string("ISO ALPHA-3")).assert_one()
    country_header = table.filter(contains_string("or area name")).assert_one()
    num_code_header = table.filter(contains_string("Numerical")).assert_one()
    countries = country_header.fill(xypath.DOWN)
    num_code_header = countries.shift(x=-1)
    alpha_j = alpha_code_header.junction(countries)