def batch_read(query, full_url, max_rows=98360):
    """
    Run a large query as several smaller batches and combine the results.

    To stay within the query limit for row numbers, the dimension with the
    most selected values is split into consecutive slices and one request
    is posted per slice.

    Parameters
    ----------
    query : dict
        Query whose ``query['query']`` is a list of dimension entries, each
        holding its selected values under ``['selection']['values']``.
    full_url : str
        URL every batch is posted to.
    max_rows : int
        Row limit per request; only 95% of it is used as a safety margin.

    Returns
    -------
    pandas.DataFrame
        All batches stacked with a fresh index. Label-named columns come
        first, followed by the id-named columns prefixed with "_". An empty
        frame is returned when the query selects no rows.
    """
    dimensions = [len(q['selection']['values']) for q in query['query']]
    n_rows = functools.reduce(operator.mul, dimensions, 1)
    print("The table has: ", n_rows, "rows in total.")
    if n_rows == 0:
        # Nothing is selected: there is nothing to request (and nothing to
        # divide by when sizing the batches).
        return pd.DataFrame()

    max_dim = max(dimensions)
    i_max = dimensions.index(max_dim)
    # Use 95% of the maximum value to be safe.
    n_batches = math.ceil(n_rows / (max_rows * 0.95))
    # Round the batch size down so a batch never exceeds the limit, but never
    # below one value: a zero-sized batch would divide by zero below.
    batch_size = max(1, int(max_dim / n_batches))
    # Recalculate the real number of batches, since the size was rounded down.
    n_batches = math.ceil(max_dim / batch_size)

    all_values = query['query'][i_max]['selection']['values']
    frames = []
    for b in range(n_batches):
        print("Doing query:", b + 1, "/", n_batches)
        start = b * batch_size
        batch_values = all_values[start:start + batch_size]
        if not batch_values:
            # Avoid empty queries.
            continue
        query_ = copy.deepcopy(query)
        query_['query'][i_max]['selection']['values'] = batch_values
        data_ = requests.post(full_url, json=query_)
        dataj_ = data_.json(object_pairs_hook=OrderedDict)
        r = pyjstat.from_json_stat(dataj_)[0]
        r_ = pyjstat.from_json_stat(dataj_, naming='id')[0]
        r_.columns = ["_" + c for c in r_.columns]
        frames.append(pd.concat([r, r_], axis=1))

    if not frames:
        return pd.DataFrame()
    # One concat at the end replaces DataFrame.append, which was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)
def test_from_to_json_stat_no_loads(self):
    """Test pyjstat nested from-to json_stat using list of dicts as input."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    json_data = json.loads(pyjstat.to_json_stat(results),
                           object_pairs_hook=OrderedDict)
    data_df = pyjstat.from_json_stat(json_data)
    line_thirty = ["unemployment rate", "Belgium", "2009", 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets["oecd"], "label")
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(len(data_df), 2)
    self.assertEqual(set(data_df[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(data_df[0].iloc[30].values), set(line_thirty))
def test_from_to_json_stat_no_loads(self):
    """Test pyjstat nested from-to json_stat w list of dicts as input."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    json_data = json.loads(pyjstat.to_json_stat(results),
                           object_pairs_hook=OrderedDict)
    data_df = pyjstat.from_json_stat(json_data)
    line_thirty = ['unemployment rate', 'Belgium', '2009', 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets['oecd'],
                                        'label')
    # Use assertEqual so a failure shows the differing values.
    self.assertEqual(len(data_df), 2)
    self.assertEqual(set(data_df[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(data_df[0].iloc[30].values), set(line_thirty))
def test_to_json_stat_value(self):
    """Test pyjstat to_json_stat() with a custom value column."""
    results = pyjstat.from_json_stat(self.sample_dataset, value='measure')
    json_data = json.loads(pyjstat.to_json_stat(results, value='measure'),
                           object_pairs_hook=OrderedDict)
    # assertEqual shows the actual value on failure; assertTrue(a == b)
    # would only report "False is not true".
    self.assertEqual(json_data[0]["dataset1"]["measure"][0], 4729)
def test_to_json_stat_types(self):
    """Test the value types in pyjstat to_json_stat() output."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    json_data = json.loads(pyjstat.to_json_stat(results),
                           object_pairs_hook=OrderedDict)
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so the text
    # checks below run on both without naming the Python 2-only builtin.
    text_type = type(u"")
    dimension = json_data[0]["dataset1"]["dimension"]
    area = dimension["OECD countries,EU15 and total"]["category"]
    period = dimension["2003-2014"]["category"]

    self.assertEqual(area["index"]["Spain"], 28)
    self.assertIs(type(area["index"]["Spain"]), int)
    self.assertEqual(area["label"]["France"], "France")
    self.assertIs(type(area["label"]["France"]), text_type)
    self.assertEqual(period["index"]["2005"], 2)
    self.assertEqual(period["label"]["2005"], "2005")
    self.assertIs(type(period["index"]["2005"]), int)
    self.assertIs(type(period["label"]["2005"]), text_type)
def read_box(self, from_box):
    """
    Fetch the table selected in a widget container.

    Takes a widget container as input (where the user has selected
    variables), posts the resulting query to the URL stored in the
    container's fourth child, and decodes the response.

    Parameters
    ----------
    from_box : widget container
        Passed to ``self.get_json`` to build the query; its
        ``children[3].value`` is used as the request URL.

    Returns
    -------
    list or None
        ``[dataframe, label]`` where ``dataframe`` is the first decoded table
        and ``label`` is the response's ``['dataset']['label']``. ``None`` is
        returned (after printing a hint) when anything goes wrong.

    Example
    -------
    df = read_box(box)
    """
    try:
        query = self.get_json(from_box)
        url = from_box.children[3].value
        data = requests.post(url, json=query)
        # Decode the response body once and reuse it for label and frame.
        payload = data.json(object_pairs_hook=OrderedDict)
        label = payload['dataset']['label']
        results = pyjstat.from_json_stat(payload)
        return [results[0], label]
    except Exception:
        # Deliberately best-effort: every failure is reported with the same
        # hint. Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        print('You must make choices in the box!')
        return None
def read_premade(premade_id = None, language = 'en',
                 base_url = 'http://data.ssb.no/api/v0/dataset',
                 full_url = None, table_format = 'json'):
    """
    Return a pandas dataframe of the premade table indicated by the premade
    table id or the full_url.

    Note: The premade table id may be different from the normal table id.

    Parameters
    ----------
    premade_id : str or int
        Id of the premade table; only used when ``full_url`` is not given.
    language : str
        Value of the ``lang`` query parameter in the generated URL.
    base_url : str
        Prefix of the generated URL.
    full_url : str
        Complete URL to fetch; when None it is built from the other
        parameters as ``{base_url}/{premade_id}.{table_format}?lang={language}``.
    table_format : str
        'json' (response decoded with pyjstat) or 'csv' (read with pandas).

    Returns
    -------
    pandas.DataFrame or None
        The table, or None (after printing a message) when ``table_format``
        is neither 'json' nor 'csv'.
    """
    if full_url is None:
        full_url = '{base_url}/{premade_id}.{table_format}?lang={language}'.format(
            base_url = base_url,
            premade_id = str(premade_id),
            language = language,
            table_format = table_format)

    if table_format == 'json':
        data = requests.get(full_url)
        df = pyjstat.from_json_stat(data.json(object_pairs_hook=OrderedDict))
        df = df[0]
    elif table_format == 'csv':
        df = pd.read_csv(full_url)
    else:
        # The accepted value is 'json'; the message used to say 'json-stat'.
        print("Table_format is incorrectly specified. "
              "It must be 'json' or 'csv'")
        df = None
    return df
def post_query():
    """
    Post every generated query to the SSB API and combine the results.

    meta_filter() is run once on the result of calc_iterations() to get the
    filtered metadata variables. For each entry a query is built with
    build_query() and posted to ``ssb_table.metadata_url``. The JSON-stat
    response is decoded by pyjstat (dimensions named by id) into a pandas
    DataFrame, which is collected in a list. When the loop has finished, the
    frames are concatenated into one DataFrame.

    A request that does not return status 200 is reported, retried once
    after a three second pause, and raises if it fails again, so that an
    error response is never decoded as if it were data.

    Returns
    -------
    big_df : pandas.DataFrame
        All query results stacked with a fresh index.

    Raises
    ------
    requests.HTTPError
        If a query still fails after the retry.
    """
    dataframes = []
    meta_data = meta_filter(calc_iterations())
    for variables in meta_data:
        query = build_query(variables)
        data = requests.post(ssb_table.metadata_url, json=query)
        if data.status_code != 200:
            print("Feil! Status kode:", data.status_code)
            # Back off, then try the same query once more.
            time.sleep(3.0)
            data = requests.post(ssb_table.metadata_url, json=query)
            data.raise_for_status()
        results = pyjstat.from_json_stat(
            data.json(object_pairs_hook=OrderedDict), naming="id")
        dataframes.append(results[0])
    big_df = pd.concat(dataframes, ignore_index=True)
    return big_df
def test_convert_zeroes_not_null(self):
    """Test that the first value survives a from/to JSON-stat round trip."""
    results = pyjstat.from_json_stat(self.sweden_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    # assertEqual prints both values when they differ.
    self.assertEqual(self.sweden_dataset['dataset']['value'][0],
                     json_data['dataset1']['value'][0])
def test_to_json_stat_types(self):
    """Test the value types in pyjstat to_json_stat() output."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    json_data = json.loads(pyjstat.to_json_stat(results),
                           object_pairs_hook=OrderedDict)
    # Text type of the running interpreter (str on Python 3, unicode on 2).
    text_type = type(u"")
    dimension = json_data[0]["dataset1"]["dimension"]
    area = dimension["OECD countries, EU15 and total"]["category"]
    period = dimension["2003-2014"]["category"]

    self.assertEqual(area["index"]["Spain"], 28)
    self.assertIs(type(area["index"]["Spain"]), int)
    self.assertEqual(area["label"]["France"], "France")
    # The labels are checked directly: wrapping them in str() first made
    # the previous `type(str(x)) is str` assertions pass for any value.
    self.assertIsInstance(area["label"]["France"], text_type)
    self.assertEqual(period["index"]["2005"], 2)
    self.assertEqual(period["label"]["2005"], "2005")
    self.assertIs(type(period["index"]["2005"]), int)
    self.assertIsInstance(period["label"]["2005"], text_type)
def test_from_to_json_stat_as_dict(self):
    """Test pyjstat nested from-to json_stat using dict of dicts as input."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    data_df = pyjstat.from_json_stat(
        json.loads(json.dumps(json_data), object_pairs_hook=OrderedDict))
    line_thirty = ['unemployment rate', 'Belgium', '2009', 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets['oecd'], 'label')
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(len(data_df), 2)
    self.assertEqual(set(data_df[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(data_df[0].iloc[30].values), set(line_thirty))
def test_convert_zeroes_not_null(self):
    """Test pyjstat to_json_stat zero conversion."""
    results = pyjstat.from_json_stat(self.sweden_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    original_first = self.sweden_dataset['dataset']['value'][0]
    converted_first = json_data['dataset1']['value'][0]
    # Compare with assertEqual so a mismatch shows both values.
    self.assertEqual(original_first, converted_first)
def folkemengde():
    """
    Fetch table 06913 from the SSB API and return it as a data frame.

    The request selects region "0", the "Folkemengde" contents code and all
    values of "Tid". In the returned frame the 'år' column is converted to
    numbers and shifted back by one, and a numeric 'folkemengde' column is
    added from 'value'.
    """
    region = {"code": "Region",
              "selection": {"filter": "item", "values": ["0"]}}
    contents = {"code": "ContentsCode",
                "selection": {"filter": "item", "values": ["Folkemengde"]}}
    period = {"code": "Tid",
              "selection": {"filter": "all", "values": ["*"]}}
    payload = {"query": [region, contents, period],
               "response": {"format": "json-stat"}}

    response = requests.post('http://data.ssb.no/api/v0/no/table/06913',
                             json = payload)
    decoded = response.json(object_pairs_hook=OrderedDict)
    frame = pyjstat.from_json_stat(decoded)[0]

    years = pd.to_numeric(frame[u'år'])
    frame[u'år'] = years - 1
    frame[u'folkemengde'] = pd.to_numeric(frame[u'value'])
    return frame
def test_ons_index_sort_bug_index(self):
    """Test from_json_stat dimension sorting indexes instead of labels."""
    results = pyjstat.from_json_stat(self.ons_dataset, naming='id')
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    source_index = (self.ons_dataset['A02Level']['dimension']['CL_0000667']
                    ['category']['index']['CI_0018938'])
    converted_index = (json_data['dataset1']['dimension']['CL_0000667']
                       ['category']['index']['CI_0018938'])
    # assertEqual shows both index positions when they differ.
    self.assertEqual(source_index, converted_index)
def test_uk_dataset(self):
    """Test pyjstat using a different ONS dataset."""
    results = pyjstat.from_json_stat(self.uk_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output="dict"),
                           object_pairs_hook=OrderedDict)
    # assertEqual reports the actual sizes/values on failure.
    self.assertEqual(len(results[0].columns), 5)
    self.assertEqual(len(results[0].index), 3)
    self.assertEqual(self.uk_dataset["QS104EW"]["value"]["0"],
                     json_data["dataset1"]["value"][0])
    self.assertEqual(self.uk_dataset["QS104EW"]["value"]["2"],
                     json_data["dataset1"]["value"][2])
def test_from_json_stat_with_label(self):
    """Test pyjstat from_json_stat() using label as parameter."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    line_thirty = ["unemployment rate", "Belgium", "2009", 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets["oecd"], "label")
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(len(results), 2)
    self.assertEqual(set(results[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(results[0].iloc[30].values), set(line_thirty))
def test_ons_index_sort_bug(self):
    """Test pyjstat from_json_stat dimension sorting."""
    results = pyjstat.from_json_stat(self.ons_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    # Position of category CI_0018938 in the source dataset ...
    source_index = (self.ons_dataset['A02Level']['dimension']['CL_0000667']
                    ['category']['index']['CI_0018938'])
    # ... compared with the position of '16-17' in the label-named output.
    converted_index = (json_data['dataset1']['dimension']['Age']
                       ['category']['index']['16-17'])
    self.assertEqual(source_index, converted_index)
def test_from_json_stat_with_id(self):
    """Test pyjstat from_json_stat() using id as parameter."""
    results = pyjstat.from_json_stat(self.oecd_datasets, naming="id")
    line_thirty = [u"UNR", u"BE", u"2009", 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets["oecd"], "id")
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(len(results), 2)
    self.assertEqual(set(results[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(results[0].iloc[30].values), set(line_thirty))
def test_from_json_stat_with_id(self):
    """Test pyjstat from_json_stat() using id as parameter."""
    results = pyjstat.from_json_stat(self.oecd_datasets, naming='id')
    line_thirty = [u'UNR', u'BE', u'2009', 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets['oecd'], 'id')
    # Use assertEqual so a failure shows the differing values.
    self.assertEqual(len(results), 2)
    self.assertEqual(set(results[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(results[0].iloc[30].values), set(line_thirty))
def kjorelengde():
    """
    Fetch table 07301 from the SSB API and return it as a data frame.

    The request selects vehicle type "15", the "Kjorekm" contents code and
    all values of "Tid". In the returned frame the 'år' column is converted
    to numbers and a numeric 'koyrelengde' column is added from 'value'.
    """
    vehicle = {"code": "Kjoretoytype",
               "selection": {"filter": "item", "values": ["15"]}}
    contents = {"code": "ContentsCode",
                "selection": {"filter": "item", "values": ["Kjorekm"]}}
    period = {"code": "Tid",
              "selection": {"filter": "all", "values": ["*"]}}
    payload = {"query": [vehicle, contents, period],
               "response": {"format": "json-stat"}}

    response = requests.post('http://data.ssb.no/api/v0/no/table/07301',
                             json = payload)
    decoded = response.json(object_pairs_hook=OrderedDict)
    frame = pyjstat.from_json_stat(decoded)[0]

    frame[u'år'] = pd.to_numeric(frame[u'år'])
    frame[u'koyrelengde'] = pd.to_numeric(frame[u'value'])
    return frame
def test_ons_index_sort_bug(self):
    """Test pyjstat from_json_stat dimension sorting."""
    results = pyjstat.from_json_stat(self.ons_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    expected = (self.ons_dataset['A02Level']['dimension']['CL_0000667']
                ['category']['index']['CI_0018938'])
    actual = (json_data['dataset1']['dimension']['Age']['category']
              ['index']['16-17'])
    # assertEqual prints both positions if the ordering was not preserved.
    self.assertEqual(expected, actual)
def ndeaths(regvalues, causevalues, agevalues = allages(),
            sexvalues = ['1', '2'], yearvalues = yearrange()):
    """
    Send a JSON request to return number of deaths.

    The request body is built by mortreqjson() from the arguments and posted
    to ``morturl``; an HTTP error status raises. Returns a dict holding the
    response's 'dimension' section and, under 'frame', the first table
    decoded by pyjstat with id naming.
    """
    request_body = mortreqjson(regvalues, causevalues, agevalues,
                               sexvalues, yearvalues)
    response = requests.post(morturl, json = request_body)
    response.raise_for_status()

    text = response.content.decode('utf-8')
    parsed = json.loads(text, object_pairs_hook = OrderedDict)

    dimension = parsed['dataset']['dimension']
    frame = pyjstat.from_json_stat(parsed, naming = 'id')[0]
    return {'dimension': dimension, 'frame': frame}
def test_from_json_stat_with_label(self):
    """Test pyjstat from_json_stat() using label as parameter."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    line_thirty = ['unemployment rate', 'Belgium', '2009', 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets['oecd'],
                                        'label')
    # Use assertEqual so a failure shows the differing values.
    self.assertEqual(len(results), 2)
    self.assertEqual(set(results[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(results[0].iloc[30].values), set(line_thirty))
def test_to_json_stat(self):
    """Test pyjstat to_json_stat()."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    json_data = json.loads(pyjstat.to_json_stat(results),
                           object_pairs_hook=OrderedDict)
    self.assertEqual(
        json_data[0]["dataset1"]["dimension"]["indicator"]["label"],
        "indicator")
    self.assertEqual(json_data[0]["dataset1"]["dimension"]["size"][1], 36)
    self.assertEqual(json_data[1]["dataset2"]["dimension"]["id"][2],
                     "age group")
    # The previous assertTrue(x, y) used y only as the failure message, so
    # the two values were never compared. Compare the last serialized value
    # with the last value of the source frame.
    self.assertEqual(json_data[0]["dataset1"]["value"][-1],
                     results[0]["value"].iloc[-1])
    # Duplicate dimension column names must be rejected.
    results[0].columns = ["a", "a", "b", "value"]
    self.assertRaises(ValueError, pyjstat.to_json_stat, results)
def test_from_json_stat_with_id(self):
    """Test pyjstat from_json_stat() using id as parameter."""
    results = pyjstat.from_json_stat(self.oecd_datasets, naming='id')
    line_thirty = ['UNR', 'BE', 2009, 7.891892855]
    dimensions = pyjstat.get_dimensions(self.oecd_datasets['oecd'], 'id')
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(len(results), 2)
    self.assertEqual(set(results[0].columns.values[:-1]),
                     set(dimensions[1]))
    self.assertEqual(set(results[0].iloc[30].values), set(line_thirty))
def test_class_dataset(self):
    """Test pyjstat using class dataset from v1.02."""
    results = pyjstat.from_json_stat(self.galicia_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output="dict"),
                           object_pairs_hook=OrderedDict)
    source_values = self.galicia_dataset["value"]
    converted_values = json_data["dataset1"]["value"]
    # assertEqual reports the actual values on failure.
    self.assertEqual(self.galicia_dataset["class"], "dataset")
    self.assertEqual(len(results[0].columns), 7)
    self.assertEqual(len(results[0].index), 3960)
    self.assertEqual(source_values[0], converted_values[0])
    self.assertEqual(source_values[547], converted_values[547])
    self.assertEqual(source_values[-1], converted_values[-1])
def test_uk_dataset(self):
    """Test pyjstat using a different ONS dataset."""
    results = pyjstat.from_json_stat(self.uk_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    source_values = self.uk_dataset['QS104EW']['value']
    converted_values = json_data['dataset1']['value']
    # Use assertEqual so a failure shows the differing values.
    self.assertEqual(len(results[0].columns), 5)
    self.assertEqual(len(results[0].index), 3)
    # The source keys its values by string position, the output by list index.
    self.assertEqual(source_values['0'], converted_values[0])
    self.assertEqual(source_values['2'], converted_values[2])
def test_us_labor_dataset(self):
    """Test pyjstat using a us labor dataset of class dataset."""
    results = pyjstat.from_json_stat(self.uslabor_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output="dict"),
                           object_pairs_hook=OrderedDict)
    source_values = self.uslabor_dataset["value"]
    converted_values = json_data["dataset1"]["value"]
    # assertEqual reports the actual values on failure.
    self.assertEqual(self.uslabor_dataset["class"], "dataset")
    self.assertEqual(len(results[0].columns), 4)
    self.assertEqual(len(results[0].index), 12880)
    self.assertEqual(source_values[0], converted_values[0])
    self.assertEqual(source_values[547], converted_values[547])
    self.assertEqual(source_values[-1], converted_values[-1])
def read_query(queries):
    """
    Post each query to ``a.url`` and return the decoded tables as one frame.

    Every response is decoded by pyjstat with id naming and its first table
    is kept. With more than one query the tables are concatenated with a
    fresh index; otherwise the single table is returned as is.
    """
    dataframes = [
        pyjstat.from_json_stat(
            requests.post(a.url, json=single_query).json(
                object_pairs_hook=OrderedDict),
            naming="id")[0]
        for single_query in queries
    ]
    if len(queries) <= 1:
        return dataframes[0]
    return pd.concat(dataframes, ignore_index=True)
def test_uk_dataset(self):
    """Test pyjstat using a different ONS dataset."""
    results = pyjstat.from_json_stat(self.uk_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    frame = results[0]
    # assertEqual reports the actual sizes/values on failure.
    self.assertEqual(len(frame.columns), 5)
    self.assertEqual(len(frame.index), 3)
    self.assertEqual(self.uk_dataset['QS104EW']['value']['0'],
                     json_data['dataset1']['value'][0])
    self.assertEqual(self.uk_dataset['QS104EW']['value']['2'],
                     json_data['dataset1']['value'][2])
def npop(regvalues, agevalues = allages('pop'), sexvalues = ['1', '2'],
         yearvalues = yearrange()):
    """
    Send a JSON request to return population size.

    The request body is built by popreqjson() from the arguments and posted
    to ``popurl``; an HTTP error status raises. The first table decoded by
    pyjstat (id naming) is merged with ageintmerge() on the 'Alder' column.
    Returns a dict holding the response's 'dimension' section and the merged
    table under 'frame'.
    """
    request_body = popreqjson(regvalues, agevalues, sexvalues, yearvalues)
    response = requests.post(popurl, json = request_body)
    response.raise_for_status()

    text = response.content.decode('utf-8')
    parsed = json.loads(text, object_pairs_hook = OrderedDict)

    population = pyjstat.from_json_stat(parsed, naming = 'id')[0]
    merged = pd.merge(ageintmerge(), population, on = 'Alder')
    return {'dimension': parsed['dataset']['dimension'], 'frame': merged}
def _get_table(self, url, table_format='json'): if table_format == 'json': response = requests.get(url) df = pyjstat.from_json_stat( response.json(object_pairs_hook=OrderedDict))[0] elif table_format == 'csv': df = pd.read_csv(url) else: print("""table_format param must be either 'json' or 'csv'""") df = None return df
def read_all(table_id = None, language = 'en',
             base_url = 'http://data.ssb.no/api/v0',
             full_url = None, max_rows = 98360):
    """
    Return a pandas dataframe with all values for all options for the table
    specified by table_id.

    Warning: The table may be large.

    Useful if
        - you know exactly what you are looking for and
        - you do not want to use the notebook/widgets/box to specify the
          json query

    The complete query is first sent as a single request. If anything in
    that attempt fails, the query is split and fetched with batch_read().

    Parameters
    ----------
    table_id : str
        Id of the table; only used when ``full_url`` is not given.
    language : str
        Language segment of the generated URL.
    base_url : str
        Prefix of the generated URL.
    full_url : str
        Complete table URL; built from the other parameters when None.
    max_rows : int
        Passed on to batch_read() if the query has to be split.

    Returns
    -------
    pandas.DataFrame
        Label-named columns followed by the id-named columns prefixed
        with "_".

    Example
    -------
    df = read_all(table_id = '10714')
    """
    if full_url is None:
        full_url = '{base_url}/{language}/table/{table_id}'.format(
            base_url = base_url,
            language = language,
            table_id = table_id)
    print("Requesting: ", full_url)
    query = full_json(full_url = full_url)
    try:
        # Query limit is currently of 800,000 rows - if this fails then
        # split the query.
        data = requests.post(full_url, json = query)
        dataj = data.json(object_pairs_hook=OrderedDict)
        r = pyjstat.from_json_stat(dataj)[0]
        r_ = pyjstat.from_json_stat(dataj, naming='id')[0]
        r_.columns = ["_" + c for c in r_.columns]
        results = pd.concat([r, r_], axis=1)
    except Exception:
        # `except Exception` (not a bare `except:`) so that Ctrl-C and
        # interpreter shutdown are not mistaken for an oversized query.
        print("Simple query failed: Trying to split the query...")
        results = batch_read(query, full_url, max_rows=max_rows)
    return results
def npop(regvalues, agevalues=allages('pop'), sexvalues=['1', '2'],
         yearvalues=yearrange()):
    """
    Send a JSON request to return population size.

    Posts the body built by popreqjson() to ``popurl`` and raises on an HTTP
    error status. The decoded table (id naming) is merged with ageintmerge()
    on 'Alder'. The result is a dict with the response's 'dimension' section
    and the merged table under 'frame'.
    """
    response = requests.post(
        popurl,
        json=popreqjson(regvalues, agevalues, sexvalues, yearvalues))
    response.raise_for_status()

    parsed = json.loads(response.content.decode('utf-8'),
                        object_pairs_hook=OrderedDict)
    frames = pyjstat.from_json_stat(parsed, naming='id')
    merged = pd.merge(ageintmerge(), frames[0], on='Alder')

    outcome = {}
    outcome['dimension'] = parsed['dataset']['dimension']
    outcome['frame'] = merged
    return outcome
def test_class_dataset(self):
    """Test pyjstat using class dataset from v1.02."""
    results = pyjstat.from_json_stat(self.galicia_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    # Use assertEqual so a failure shows the differing values.
    self.assertEqual(self.galicia_dataset['class'], 'dataset')
    self.assertEqual(len(results[0].columns), 7)
    self.assertEqual(len(results[0].index), 3960)
    # Spot-check the first, an interior and the last value.
    for position in (0, 547, -1):
        self.assertEqual(self.galicia_dataset['value'][position],
                         json_data['dataset1']['value'][position])
def test_us_labor_dataset(self):
    """Test pyjstat using a us labor dataset of class dataset."""
    results = pyjstat.from_json_stat(self.uslabor_dataset)
    json_data = json.loads(pyjstat.to_json_stat(results, output='dict'),
                           object_pairs_hook=OrderedDict)
    # Use assertEqual so a failure shows the differing values.
    self.assertEqual(self.uslabor_dataset['class'], 'dataset')
    self.assertEqual(len(results[0].columns), 4)
    self.assertEqual(len(results[0].index), 12880)
    # Spot-check the first, an interior and the last value.
    for position in (0, 547, -1):
        self.assertEqual(self.uslabor_dataset['value'][position],
                         json_data['dataset1']['value'][position])
def test_to_json_stat(self):
    """Test pyjstat to_json_stat()."""
    results = pyjstat.from_json_stat(self.oecd_datasets)
    json_data = json.loads(pyjstat.to_json_stat(results),
                           object_pairs_hook=OrderedDict)
    first = json_data[0]["dataset1"]
    second = json_data[1]["dataset2"]
    self.assertEqual(first["dimension"]["indicator"]["label"], "indicator")
    self.assertEqual(first["dimension"]["size"][1], 36)
    self.assertEqual(second["dimension"]["id"][2], "age group")
    # assertTrue(x, y) only checked that x was truthy (y is the message
    # argument), so the last values are now actually compared.
    self.assertEqual(first["value"][-1], results[0]['value'].iloc[-1])
    # Duplicate dimension column names must be rejected.
    results[0].columns = ['a', 'a', 'b', 'value']
    self.assertRaises(ValueError, pyjstat.to_json_stat, results)
def get_pandas_df(self, url, params=None, table_format='json'):
    """
    Post a query to ``url`` and return the first decoded table.

    Parameters
    ----------
    url : str
        URL the query is posted to; it is also logged via ``self.log``.
    params : optional
        Request body, passed to ``requests.post`` as its second positional
        argument. When None, the JSON-encoded result of
        ``self._full_json(url)`` is used.
    table_format : str
        Not used by this method.

    Returns
    -------
    pandas.DataFrame
        First table produced by pyjstat from the UTF-8 decoded response.
    """
    self.log(str(url))
    # Identity test: `== None` would invoke a custom __eq__ on params.
    if params is None:
        params = json.dumps(self._full_json(url))
    response = requests.post(url, params).content
    response = response.decode('utf-8')
    df = pyjstat.from_json_stat(json.loads(response))[0]
    return df
def sverige_folkemengde():
    """
    Fetch the BefolkningNy table from the SCB API as a data frame.

    The request selects contents code "BE0101N1", region "00" and all values
    of "Tid". In the returned frame a numeric 'år' column is added from
    'year' and a numeric 'folkemengde_sverige' column from 'value'.
    """
    contents = {"code": "ContentsCode",
                "selection": {"filter": "item", "values": ["BE0101N1"]}}
    region = {"code": "Region",
              "selection": {"filter": "item", "values": ["00"]}}
    period = {"code": "Tid",
              "selection": {"filter": "all", "values": ["*"]}}
    payload = {"query": [contents, region, period],
               "response": {"format": "json-stat"}}

    response = requests.post(
        'http://api.scb.se/OV0104/v1/doris/en/ssd/BE/BE0101/BE0101A/BefolkningNy',
        json = payload)
    decoded = response.json(object_pairs_hook=OrderedDict)
    frame = pyjstat.from_json_stat(decoded)[0]

    frame[u'år'] = pd.to_numeric(frame[u'year'])
    frame[u'folkemengde_sverige'] = pd.to_numeric(frame[u'value'])
    return frame
def ndeaths(regvalues, causevalues, agevalues=allages(),
            sexvalues=['1', '2'], yearvalues=yearrange()):
    """
    Send a JSON request to return number of deaths.

    Posts the body built by mortreqjson() to ``morturl`` and raises on an
    HTTP error status. The result is a dict with the response's 'dimension'
    section and, under 'frame', the first table decoded with id naming.
    """
    response = requests.post(
        morturl,
        json=mortreqjson(regvalues, causevalues, agevalues, sexvalues,
                         yearvalues))
    response.raise_for_status()

    parsed = json.loads(response.content.decode('utf-8'),
                        object_pairs_hook=OrderedDict)

    outcome = {}
    outcome['dimension'] = parsed['dataset']['dimension']
    outcome['frame'] = pyjstat.from_json_stat(parsed, naming='id')[0]
    return outcome
def read_box(from_box):
    """
    Fetch the table selected in a widget container.

    The query is built by get_json() from the container (where the user has
    selected variables) and posted to the URL held in the container's fourth
    child. The first table decoded from the response is returned as a pandas
    dataframe.

    Example
    -------
    df = read_box(box)
    """
    selection_query = get_json(from_box)
    target_url = from_box.children[3].value
    response = requests.post(target_url, json = selection_query)
    decoded = response.json(object_pairs_hook=OrderedDict)
    return pyjstat.from_json_stat(decoded)[0]
def read_with_json(table_id = None, query = None, language = 'en',
                   base_url = 'http://data.ssb.no/api/v0', full_url = None):
    """
    Return a pandas dataframe for a table and an explicit json query.

    The query (in json-stat format) is posted to ``full_url``, or to a URL
    built from ``base_url``, ``language`` and ``table_id`` when ``full_url``
    is not given. The first table decoded from the response is returned.

    Useful if
        - you know exactly what you are looking for and
        - can specify the json yourself (as a dictionary)
        - you do not want to use the notebook/widgets/box to specify the
          json query

    Hints
    -----
        - use full_json(table_id = '10714', out = 'string') to get a query
          string and edit it
        - use to_dict(str) to get a dict from an edited json string

    Example
    -------
    json_query = {'response': {'format': 'json-stat'},
        'query': [
        {'selection': {'values': ['0'], 'filter': 'item'}, 'code': 'Region'},
        {'selection': {'values': ['KufjolsIAlt'], 'filter': 'item'},
         'code': 'ContentsCode'},
        {'selection': {'values': ['1999', '2013'], 'filter': 'item'},
         'code': 'Tid'}]}

    df = read_with_json(table_id = '10714', query = json_query)
    """
    target = full_url
    if target is None:
        target = '{base_url}/{language}/table/{table_id}'.format(
            base_url = base_url,
            language = language,
            table_id = table_id)

    response = requests.post(target, json = query)
    decoded = response.json(object_pairs_hook=OrderedDict)
    frames = pyjstat.from_json_stat(decoded)
    return frames[0]
def read_url(full_url = None, table_format = 'json'):
    """
    Return a pandas dataframe of the table found at full_url.

    Parameters
    ----------
    full_url : str
        Complete URL of the table to fetch.
    table_format : str
        'json' (fetched with a GET request and decoded with pyjstat, keeping
        the first table) or 'csv' (read directly with pandas).

    Returns
    -------
    pandas.DataFrame or None
        The table, or None (after printing a message) when ``table_format``
        is neither 'json' nor 'csv'.
    """
    if table_format == 'json':
        data = requests.get(full_url)
        df = pyjstat.from_json_stat(data.json(object_pairs_hook=OrderedDict))
        df = df[0]
    elif table_format == 'csv':
        df = pd.read_csv(full_url)
    else:
        # The accepted value is 'json'; the message used to say 'json-stat'.
        print("Table_format is incorrectly specified. "
              "It must be 'json' or 'csv'")
        df = None
    return df
def read_all(table_id = None, language = 'en',
             base_url = 'http://data.ssb.no/api/v0', full_url = None):
    """
    Return a pandas dataframe with all values for all options for the table
    specified by table_id.

    Warning: The table may be large.

    Useful if
        - you know exactly what you are looking for and
        - you do not want to use the notebook/widgets/box to specify the
          json query

    The query is obtained from full_json() and posted to ``full_url``, which
    is built from ``base_url``, ``language`` and ``table_id`` when not given.

    Example
    -------
    df = read_all(table_id = '10714')
    """
    target = full_url
    if target is None:
        target = '{base_url}/{language}/table/{table_id}'.format(
            base_url = base_url,
            language = language,
            table_id = table_id)

    complete_query = full_json(full_url = target)
    response = requests.post(target, json = complete_query)
    frames = pyjstat.from_json_stat(
        response.json(object_pairs_hook=OrderedDict))

    # NOTE: maybe this need not be its own function, but an option in
    # read_json (json = 'all')? Other options could include read_recent to
    # get only the most recent values (defined as x), json = 'recent'.
    return frames[0]
# -*- coding: utf-8 -*-
""" pyjstat example with 0.3.5-like syntax for JSON-stat 1.3."""

from pyjstat import pyjstat
import requests
from collections import OrderedDict
import json

EXAMPLE_URL = 'http://json-stat.org/samples/us-labor-ds.json'

# Download the sample and decode it into a list of data frames.
data = requests.get(EXAMPLE_URL)
decoded = data.json(object_pairs_hook=OrderedDict)
results = pyjstat.from_json_stat(decoded)
print(results)

# Convert the frames back to JSON-stat and print the normalized JSON text.
round_trip = json.loads(pyjstat.to_json_stat(results))
print(json.dumps(round_trip))
def test_from_json_stat_no_coertion(self):
    """Test pyjstat from_json_stat with id naming without coercion."""
    results = pyjstat.from_json_stat(self.sweden_dataset, naming="id")
    # assertEqual shows the actual cell value if it was altered.
    self.assertEqual(results[0]["Alder"][500], "35-39")
def run_pyjstat(result_list):
    """
    Decode a JSON-stat response into a data frame.

    ``result_list`` must offer a ``json()`` method accepting
    ``object_pairs_hook``. The decoded content is passed to pyjstat with id
    naming and the first resulting table is returned.
    """
    frames = pyjstat.from_json_stat(
        result_list.json(object_pairs_hook=OrderedDict),
        naming="id",
    )
    first_frame = frames[0]
    return first_frame
import requests
import json
from pyjstat import pyjstat
from urllib.request import urlopen
import matplotlib.pyplot as plt
import pandas as pd

pd.set_option('display.max_columns', 10)
pd.set_option('display.width',2000)
pd.set_option('display.max_rows', 200)

# Reference notes on the decoder used below:
#
# pyjstat.from_json_stat(datasets, naming='label', value='value')
#     Decode JSON-stat formatted data into pandas.DataFrame object.
#
# Parameters:
#     datasets (OrderedDict, list) – data in JSON-stat format, previously
#         deserialized to a python object by json.load() or json.loads(),
#         for example. Both List and OrderedDict are accepted as inputs.
#     naming (string, optional) – dimension naming. Possible values:
#         ‘label’ or ‘id’. Defaults to ‘label’.
#     value (string, optional) – name of the value column.
#         Defaults to ‘value’.
#
# Returns: results – list of pandas.DataFrame with imported data.
# Return type: list

# mainstream class numbers, mainstream pupils, average class size.
# https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/ED114/JSON-stat/1.0/
url = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/ED112/JSON-stat/1.0/"

# Close the HTTP response as soon as the JSON has been read.
with urlopen(url) as response:
    results = pyjstat.from_json_stat(json.load(response))
# print(results) - List with 1 element

data = results[0]  # a pandas DataFrame
print(data.head())

# Sum of the value column for each "Statistic" group.
summary = data.groupby("Statistic")["value"].sum()
print(summary)
# -*- coding: utf-8 -*-
"""Load the OECD/Canada JSON-stat sample and print the decoded tables."""

from pyjstat import pyjstat
try:
    import urllib2
except ImportError:
    # Python 3: urllib2 was merged into urllib.request, which also provides
    # urlopen, so it can stand in under the old name.
    import urllib.request as urllib2
import json
from collections import OrderedDict
from contextlib import closing

# closing() releases the HTTP response on both Python 2 and Python 3.
with closing(urllib2.urlopen(
        'http://json-stat.org/samples/oecd-canada.json')) as response:
    data = json.load(response, object_pairs_hook=OrderedDict)

results = pyjstat.from_json_stat(data)
# Function-call form prints the same text on Python 2 and is valid on 3.
print(results)
from pyjstat import pyjstat
from collections import OrderedDict
from contextlib import closing
try:
    import urllib2
except ImportError:
    # Python 3: urllib2 was merged into urllib.request, which also provides
    # urlopen, so it can stand in under the old name.
    import urllib.request as urllib2
import json

dataset_url_1 = 'http://www.cso.ie/StatbankServices/StatbankServices.svc/jsonservice/responseinstance/CDD01'

# closing() releases the HTTP response on both Python 2 and Python 3.
with closing(urllib2.urlopen(dataset_url_1)) as response:
    population_json_data = json.load(response,
                                     object_pairs_hook=OrderedDict)

population_results = pyjstat.from_json_stat(population_json_data,
                                            naming="id")
population_dataset = population_results[0]
# Keep only the rows whose ContentsCode is 'Folketallet11'.
population_data = population_dataset[
    population_dataset['ContentsCode'] == 'Folketallet11']
# Only displayed when run interactively (e.g. as the last line of a
# notebook cell); as a script this expression has no visible effect.
population_data.head()
def test_from_json_stat_no_coertion(self):
    """Test pyjstat from_json_stat with id naming without coercion."""
    results = pyjstat.from_json_stat(self.sweden_dataset, naming='id')
    age_column = results[0]['Alder']
    # Use assertEqual so a failure shows the value that was found.
    self.assertEqual(age_column[500], '35-39')