def get_data_co2_temp():
    data_url = 'https://datahub.io/core/global-temp/datapackage.json'
    # to load Data Package into storage
    package = datapackage.Package(data_url)
    # to load only tabular data
    resources = package.resources
    data_temperature = []
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            data_temperature.append(data)
    data_temperature_annual = data_temperature[0]
    data_temperature_annual = data_temperature_annual[
        data_temperature_annual['Year'] >= 1980]
    data_temperature_annual = data_temperature_annual.sort_values(by=['Year'])
    data_temperature_annual = data_temperature_annual[
        data_temperature_annual['Source'] == 'GISTEMP']

    # get CO2 data
    data_url = 'https://datahub.io/core/co2-ppm/datapackage.json'
    # to load Data Package into storage
    package = datapackage.Package(data_url)
    # to load only tabular data
    resources = package.resources
    data_co2 = []
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            data_co2.append(data)
    data_co2_annual = data_co2[-2]
    data_co2_annual = data_co2_annual[data_co2_annual['Year'] < 2017]

    data_annual_gistemp = data_co2_annual.rename(columns={'Mean': 'CO2'})
    data_annual_gistemp = pd.merge(
        data_annual_gistemp, data_temperature_annual,
        on='Year').rename(columns={'Mean': 'Temperature'})

    data_annual_gistemp['Group'] = 0
    for year in range(1980, 2016, 6):
        if year != 2010:
            data_annual_gistemp.loc[
                (data_annual_gistemp['Year'] >= year) &
                (data_annual_gistemp['Year'] <= (year + 5)),
                'Group'] = str(year) + '-' + str(year + 5)
        else:
            data_annual_gistemp.loc[
                (data_annual_gistemp['Year'] >= year) &
                (data_annual_gistemp['Year'] <= (year + 6)),
                'Group'] = str(year) + '-' + str(year + 6)
    return data_annual_gistemp
def test_uk_steel_delivered_to_uk_is_consistent():
    """Two tables provide this: (A) the difference between UK production and
    exports in Tables 16 and 17; and (B) Table 18 "supply to UK market". This
    test checks they match.
    """
    # Load data
    p = datapackage.Package(DATAPACKAGE)
    dfs = {r.name: pd.DataFrame(r.read(keyed=True)) for r in p.resources}

    # Version A: production - exports
    prod1 = dfs['production_ecsc'].set_index(['year', 'product'])['mass']
    prod2 = dfs['production_derived'].set_index(['year', 'product'])['mass']
    exports = dfs['exports'].set_index(['year', 'product'])['mass']
    prod = pd.concat([prod1, prod2])
    A = (prod - exports).loc[2016]

    # Version B: supply
    supply = dfs['supply'].groupby(['product']).sum()
    B = supply[['uk_production_to_stockholders', 'uk_production_to_industry']] \
        .sum(axis='columns')

    # Compare
    df = pd.concat({'A': A, 'B': B}, axis=1)
    df['diff'] = df['A'].astype(float) - df['B']
    problems = (abs(df['diff']) > 0.5) | (pd.isnull(df['diff']))
    assert not any(problems), \
        'Differences found:\n\n%s\n' % df[problems]
def __init__(self, descriptor_file):
    self._datapackage = datapackage.Package(descriptor_file)
    self.__descriptor_file = descriptor_file
    self.__base_path = os.path.dirname(
        os.path.abspath(self.__descriptor_file))

    # Index resources by name
    self.__resources = {r.descriptor['name']: r
                        for r in self._datapackage.resources}
    self.__tabular_resources = {k: self._sanitize_resource(r)
                                for (k, r) in self.__resources.items()
                                if r.tabular and
                                r.descriptor['path'].startswith('data')}
    self.__invalid_schemas = []  # Resource names with invalid schemas

    # All formats
    self.raw_data = LazyLoadedDict.from_keys(
        self.__resources.keys(),
        self._load_raw_data,
        'bytes')

    # Tabular formats
    self.tables = LazyLoadedDict.from_keys(
        self.__tabular_resources.keys(),
        self._load_table,
        type_hint='list of rows')
    self.dataframes = LazyLoadedDict.from_keys(
        self.__tabular_resources.keys(),
        self._load_dataframe,
        type_hint='pandas.DataFrame')
def load_dataframe(filename, resource):
    """Load one table from a datapackage."""
    package = datapackage.Package(filename)
    r = package.get_resource(resource)
    if r is None:
        raise KeyError('No resource found: %s' % resource)
    return pd.DataFrame(r.read(), columns=r.headers)
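
# A minimal usage sketch for load_dataframe above. The descriptor path
# 'datapackage.json' and the resource name 'my-resource' are placeholders,
# not values from the original project.
df = load_dataframe('datapackage.json', 'my-resource')
print(df.head())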
def test_it(client, doi):
    jsr = client.remote_datapackage_json(doi)
    dp = datapackage.Package(jsr)
    if not dp.valid:
        raise AssertionError(
            f"Invalid datapackage.json found for {doi} "
            f"({jsr['name']}).")
def getdata_old(data_url='https://datahub.io/core/covid-19/datapackage.json',
                resourcename='countries-aggregated_csv'):
    """
    Get data from the web. datahub is outdated; the code was rewritten to
    read directly from CSSE.

    Parameters:
    ===========
    data_url : string with the URL of the data from datahub
    resourcename : resource to use

    Output:
    =======
    pd.DataFrame with the epidemic statistics data
    """
    # to load Data Package into storage
    package = datapackage.Package(data_url)
    resources = package.resources
    for resource in resources:
        if resource.name == resourcename:
            url = resource.descriptor['path']
            print('Importing', url)
            s = requests.get(url).text
            data = pd.read_csv(StringIO(s))
    return data
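
# Usage sketch for getdata_old above, relying on its default URL and resource
# name. As the docstring notes, the datahub source is outdated, so this call
# may no longer resolve; it is shown only to illustrate the interface.
covid_df = getdata_old()
print(covid_df.tail())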
def extract_airports_data(source, target):
    """
    Description: Extract the airport codes data file in CSV format and load
        it into the S3 data lake in parquet format using pandas.

    Arguments:
        source: location of the source json file
        target: location of the output parquet file

    Returns:
        None
    """
    print("INFO: Extracting and loading airports data")

    # to load Data Package into storage
    package = datapackage.Package(source)

    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.descriptor['datahub']['type'] == 'derived/csv':
            parquet_file_name = resource.name.split('_')[0] + ".parquet"
            parquet_file_path = os.path.join(target, parquet_file_name)
            fs = s3fs.S3FileSystem(anon=False)
            if fs.exists(parquet_file_path):
                print("INFO: {} already processed".format(parquet_file_name))
            else:
                df = pd.read_csv(resource.descriptor['path'])
                df.to_parquet(parquet_file_path)
def validate_save_pkg(pkg_descriptor, pkg_dir):
    """
    Validate a data package descriptor and save it to a json file.

    Args:
        pkg_descriptor (dict): the data package descriptor.
        pkg_dir (path-like): directory in which to save datapackage.json.

    Returns:
        report
    """
    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before we go any further
    if not data_pkg.valid:
        logger.error(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    data_pkg.save(pkg_json)

    logger.info('Validating the data package...')
    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.error("Data package validation failed.")
    else:
        logger.info('Congrats! You made a valid data package!')
    return report
def prepare(self, stream, schema, extra):

    # Prepare package
    if 'datapackage' not in extra or 'resource-name' not in extra:
        return False
    descriptor = extra['datapackage']
    if descriptor.strip().startswith('{'):
        descriptor = json.loads(descriptor)
    self.__package = datapackage.Package(descriptor)

    # Prepare schema
    if not schema:
        return False
    if not schema.foreign_keys:
        return False
    self.__schema = schema

    # Prepare foreign keys values
    try:
        self.__relations = _get_relations(
            self.__package, self.__schema,
            current_resource_name=extra['resource-name'])
        self.__foreign_keys_values = _get_foreign_keys_values(
            self.__schema, self.__relations)
        self.__relations_exception = None
    except _ReferenceTableError as exception:
        self.__relations_exception = exception

    return True
def generate_metadata(pkg_settings, tables, pkg_dir, uuid_pkgs=uuid.uuid4()):
    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")

    # Create a tabular data resource for each of the tables.
    resources = []
    for t in tables:
        resources.append(get_tabular_data_resource_2(t, pkg_dir=pkg_dir))

    data_sources = pudl.helpers.data_sources_from_tables_pkg(tables)
    sources = []
    for src in data_sources:
        if src in pudl.constants.data_sources:
            sources.append({"title": src, "path": pc.base_data_urls[src]})

    contributors = set()
    for src in data_sources:
        for c in pudl.constants.contributors_by_source[src]:
            contributors.add(c)

    pkg_descriptor = {
        "name": pkg_settings["name"],
        "profile": "tabular-data-package",
        "title": pkg_settings["title"],
        "id": uuid_pkgs,
        "description": pkg_settings["description"],
        # "keywords": pkg_settings["keywords"],
        "homepage": "https://catalyst.coop/pudl/",
        "created": (datetime.datetime.utcnow().replace(
            microsecond=0).isoformat() + 'Z'),
        "contributors": [pudl.constants.contributors[c] for c in contributors],
        "sources": sources,
        "licenses": [pudl.constants.licenses["cc-by-4.0"]],
        "resources": resources,
    }

    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before we go any further
    if not data_pkg.valid:
        logger.warning(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    data_pkg.save(pkg_json)

    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.warning("Data package data validation failed.")

    return data_pkg, report
def _validate_datapackage(self, datapackage_json: dict):
    """Checks the correctness of datapackage.json metadata.

    Throws ValueError if invalid.
    """
    dp = datapackage.Package(datapackage_json)
    if not dp.valid:
        msg = f"Found {len(dp.errors)} datapackage validation errors:\n"
        for e in dp.errors:
            msg = msg + f"  * {e}\n"
        raise ValueError(msg)
def __new__(cls):
    data_url = 'https://datahub.io/core/s-and-p-500-companies-financials/datapackage.json'
    package = datapackage.Package(data_url)
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
    return data
def load_datapackage_tables(filename):
    """Load all the tables from a datapackage."""
    package = datapackage.Package(filename)
    tables = {
        r.name: pd.DataFrame(r.read(), columns=r.headers)
        for r in package.resources
    }
    return {k: df.set_index(['year', 'product']) for k, df in tables.items()}
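
# Usage sketch for load_datapackage_tables above. The descriptor path is a
# placeholder, and every resource is assumed to carry 'year' and 'product'
# columns, as the set_index call requires.
tables = load_datapackage_tables('datapackage.json')
for name, table in tables.items():
    print(name, table.shape)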
def get_dataset():
    data_url = 'https://datahub.io/machine-learning/iris/datapackage.json'
    package = datapackage.Package(data_url)
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
    return data
def downloadData(url: str, descriptor: str):
    """
    Returns the first occurrence of the provided descriptor from the provided
    URL as a file handler.
    """
    assets = datapackage.Package(url).resources
    for data in filter(
            lambda x: x.tabular and x.descriptor['name'] == descriptor,
            assets):
        response = requests.get(data.descriptor['path'])
        return io.StringIO(response.content.decode('utf-8'))
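
# Usage sketch for downloadData above. The URL and the resource name
# 'countries-aggregated_csv' are borrowed from getdata_old earlier; they are
# assumptions about what the remote package currently exposes.
fh = downloadData('https://datahub.io/core/covid-19/datapackage.json',
                  'countries-aggregated_csv')
if fh is not None:
    print(pd.read_csv(fh).head())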
def test_cli_init():
    resource_path = 'data/valid.csv'
    result = CliRunner().invoke(init, [resource_path])
    assert result.exit_code == 0
    dp = datapackage.Package(json.loads(result.output), strict=True)
    resource = dp.resources[0]
    assert resource.descriptor['path'] == resource_path
    assert 'schema' in resource.descriptor
def get_s_and_p_symbols(self):
    package = dp.Package(self.url)
    # print list of all resources:
    print(package.resource_names)
    storage_dict = self.convert_to_dfs(package)
    df = storage_dict['constituents']
    return df['Symbol'].drop_duplicates()
def get(market):
    data_url = 'https://datahub.io/core/' + market + '/datapackage.json'
    # to load Data Package into storage
    package = datapackage.Package(data_url)
    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
    return data
def __load_data(self, csv_source):
    start = time.time()
    data = None
    if 'http' in csv_source:
        import datapackage
        package = datapackage.Package(csv_source)
        for resource in package.resources:
            if resource.tabular:
                data = pd.read_csv(resource.descriptor['path'])
                break
    else:
        data = pd.read_csv(csv_source)
    end = time.time()
    # Report timing before returning, so the message is actually printed.
    print("Loaded data in {} seconds".format(end - start))
    return data
def meta_omniglot(data_root_folder=None, std_num_classes=None,
                  std_num_examples=None, one_hot_enc=True, rand=0,
                  n_splits=None):
    """
    Loads, and downloads if necessary, the Omniglot meta-dataset.
    """
    data_folder_name = 'omniglot_resized'
    if em is None:
        return experiment_manager_not_available('meta_omniglot NOT AVAILABLE!')
    if data_root_folder is None:
        data_root_folder = os.path.join(os.getcwd(), 'DATA')
        if not os.path.exists(data_root_folder):
            os.mkdir(data_root_folder)
    data_folder = os.path.join(data_root_folder, data_folder_name)
    if os.path.exists(data_folder):
        print('DATA FOLDER IS:', data_folder)
        print('LOADING META-DATASET')
        return em.load.meta_omniglot(data_folder,
                                     std_num_classes=std_num_classes,
                                     std_num_examples=std_num_examples,
                                     one_hot_enc=one_hot_enc, _rand=rand,
                                     n_splits=n_splits)
    else:
        print('DOWNLOADING DATA')
        package = datapackage.Package(
            'https://datahub.io/lucfra/omniglot_resized/datapackage.json')
        with open('tmp_omniglot_resized.zip', 'wb') as f:
            f.write(package.get_resource('omniglot_resized').raw_read())
        import zipfile
        zip_ref = zipfile.ZipFile('tmp_omniglot_resized.zip', 'r')
        print('EXTRACTING DATA')
        zip_ref.extractall(data_root_folder)
        zip_ref.close()
        os.remove('tmp_omniglot_resized.zip')
        print('DONE')
        # os.tmpfile()
        return meta_omniglot(data_root_folder, std_num_classes,
                             std_num_examples, one_hot_enc, rand, n_splits)
def extract_tickers():
    data_url = 'https://datahub.io/core/nasdaq-listings/datapackage.json'
    # to load Data Package into storage
    package = datapackage.Package(data_url)
    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            NASDAQ = pd.read_csv(resource.descriptor['path'])
            NASDAQ.to_csv('../data/NASDAQ_Update.csv', index=False)
def validate_save_pkg(pkg_descriptor, pkg_dir):
    """
    Validate a data package descriptor and save it to a json file.

    Args:
        pkg_descriptor (dict): the data package descriptor.
        pkg_dir (path-like): directory in which to save datapackage.json.

    Returns:
        report
    """
    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before we go any further
    logger.info(
        f"Validating JSON descriptor for {data_pkg.descriptor['name']} "
        f"tabular data package...")
    if not data_pkg.valid:
        raise ValueError(
            f"Invalid tabular data package: {data_pkg.descriptor['name']} "
            f"Errors: {data_pkg.errors}")
    logger.info('JSON descriptor appears valid!')

    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    data_pkg.save(pkg_json)

    logger.info(
        f"Validating a sample of data from {data_pkg.descriptor['name']} "
        f"tabular data package using goodtables...")
    # Validate the data within the package using goodtables:
    report = goodtables.validate(
        pkg_json,
        # TODO: check which checks are applied... and uncomment the line
        # below when the checks are integrated
        # checks=['structure', 'schema', 'foreign-key'],
        row_limit=1000)
    if not report["valid"]:
        goodtables_errors = ""
        for table in report["tables"]:
            if not table["valid"]:
                goodtables_errors += str(table["source"])
                goodtables_errors += str(table["errors"])
        raise ValueError(
            f"Data package data validation failed with goodtables. "
            f"Errors: {goodtables_errors}")
    logger.info('Congrats! You made a valid data package!')
    return report
def read_gold_prices():
    data_url = 'https://datahub.io/core/gold-prices/datapackage.json'
    package = datapackage.Package(data_url)
    resources = package.resources
    data = None
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            # print(data.head())

    date_field = pd.to_datetime(data['Date'].astype(str), format='%Y-%m')
    data['Date'] = date_field
    # print(data)
    for index, row in data.iterrows():
        GoldPrice.objects.create(date=row['Date'], price=row['Price'])
def __init__(self, descriptor=None, title=None, **kwargs):
    """
    You should not use the constructor directly; use the infer_from_file
    static method instead.

    :param descriptor: An initial json data package descriptor. If None, a
        blank one will be generated.
    :param title: If a title is given it will be set in the descriptor along
        with the required name.
    :param kwargs: kwargs that can be passed to the underlying frictionless
        datapackage.Package
    """
    descriptor = descriptor or {}
    if title:
        descriptor['title'] = title
        descriptor['name'] = slugify(title)
    self.package = datapackage.Package(descriptor, **kwargs)
    self.biosys_errors = []
    # set the dataset type to be generic.
    self.dataset_type = Dataset.TYPE_GENERIC
def __load_data(self, csv_source):
    if 'http' in csv_source:
        import datapackage
        while 1:
            try:
                package = datapackage.Package(csv_source)
                resources = package.resources
                for resource in resources:
                    if resource.tabular:
                        return pd.read_csv(resource.descriptor['path'])
            except Exception:
                print(
                    "Failed to load data from {}. Will reload. Traceback: {}"
                    .format(csv_source, traceback.format_exc()))
    else:
        return pd.read_csv(csv_source)
def create_custom(base_fp, agreement_type="both"):
    # Set "both" agreement type
    if agreement_type == "both":
        agreement_type = ["plain", "explicit"]
    else:
        agreement_type = [agreement_type]

    # TODO: Add a plain/explicit question to the front of GUI and to this...
    # ... this will choose the questions to ask and format to print
    base_data = json.load(base_fp)

    # TODO: The code exists to parse the correct datapackage data from the
    # decisions list. But, the current Jinja templates just take directly
    # from the datapackage. Eventually we want to just use the decisions
    # template.

    # Append Datapackages to base_data
    try:
        pkg = datapackage.Package(get_datapackage_path())
        for name in pkg.resource_names:
            log.debug("loading datapackage {0}".format(name))
            resource = pkg.get_resource(name)
            base_data[name] = resource.table.read(keyed=True)
    except datapackage.exceptions.CastError as _e:
        log.debug("Error while attempting to read datapackage resource")
        log.error(_e.errors)
        raise _e

    if "plain" in agreement_type:
        # Create plain
        res = render("templates/plain.j2", base_data)
        md_path = "outputs/plain_custom.md"
        compose_agreement(md_path, res)
        pdf_path = "outputs/plain_custom.pdf"
        pandoc_command = [
            "pandoc", "-V", "geometry:margin=1in", "-f", "markdown_github",
            "-t", "latex", "-o", pdf_path, md_path
        ]
        subprocess.check_call(pandoc_command)

    if "explicit" in agreement_type:
        # Create explicit
        res = render("templates/explicit.j2", base_data)
        md_path = "outputs/explicit_custom.md"
        compose_agreement(md_path, res)
        pdf_path = "outputs/explicit_custom.pdf"
        pandoc_command = [
            "pandoc", "-V", "geometry:margin=1in", "-f", "markdown_github",
            "-t", "latex", "-o", pdf_path, md_path
        ]
        subprocess.check_call(pandoc_command)

    print("DONE")
def cases_per_capita(df, df3):
    data_url = "https://datahub.io/JohnSnowLabs/population-figures-by-country/datapackage.json"
    package = datapackage.Package(data_url)
    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            df2 = pd.read_csv(resource.descriptor['path'])

    df3 = new_cases_modified(df)
    dct2 = {}
    list_of_countries_sorted = df3['Country/Region'].tolist()
    new_cases_per_sorted_country = df3.iloc[:, -1].tolist()
    dct = {}
    result_dict = {}
    for i in list_of_countries_sorted:
        value = df2.loc[df2['Country'] == str(i)]['Year_2016'].tolist()
        for j in value:
            dct[str(i)] = j
    for key, value in dct.items():
        if key not in list_of_countries_sorted:
            print(key)
    for (country, value) in zip(list_of_countries_sorted,
                                new_cases_per_sorted_country):
        dct2[str(country)] = value
    country_list = []
    cases_Capita = []
    for keys in dct.keys():
        country_list.append(keys)
    for (key1, value1), (key2, value2) in zip(dct.items(), dct2.items()):
        per_capita = value2 / value1
        cases_Capita.append(per_capita)
    for (i, j) in zip(country_list, cases_Capita):
        result_dict[i] = str(j)
    return result_dict
def objeto_del_gasto(config):
    CT = COLUMN_MAPPING
    CN = dict((k, v.replace(':', '-')) for k, v in CT.items())

    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__),
                     'objeto_del_gasto.datapackage.zip'))
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])

        # Skip the LAST year of the dataset (currently 2016); it already has
        # split columns
        if year < 2018:
            objeto = row[CN['ID_CONCEPTO']]
            if objeto:
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(
                    row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(
                    row[CN['ID_CONCEPTO']])

            nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3
            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]
            row[CN['DESC_PARTIDA_GENERICA']] = lookup['partida_generica'].get(
                row.get(CN['ID_PARTIDA_GENERICA']))

            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                    lookup['partida_específica'].get(
                        row.get(CN['ID_PARTIDA_ESPECIFICA']))

    return process
def get_corr_assets(dataset):
    data_url = 'https://datahub.io/core/finance-vix/datapackage.json'
    # to load Data Package into storage
    package = datapackage.Package(data_url)
    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            break

    data['Date'] = pd.to_datetime(data.Date, format='%Y-%m-%d')
    start = data[data['Date'] == dataset['Date'][0]].index[0]

    # Create VIX Close and Open columns
    dataset['vixClose'] = data['VIX Close'][start:].reset_index().drop(
        'index', axis=1)
    dataset['vixOpen'] = data['VIX Open'][start:].reset_index().drop(
        'index', axis=1)
    return dataset
def init_datapackage(resource_paths):
    """Create tabular data package with resources.

    It will also infer the tabular resources' schemas.

    Args:
        resource_paths (List[str]): Paths to the data package resources.

    Returns:
        datapackage.Package: The data package.
    """
    dp = datapackage.Package({
        'name': 'change-me',
        'schema': 'tabular-data-package',
    })

    for path in resource_paths:
        dp.infer(path)

    return dp
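
# Usage sketch for init_datapackage above. The path 'data/valid.csv' mirrors
# the fixture used in test_cli_init earlier and stands in for any local CSV;
# after infer(), the descriptor carries one resource per path.
dp = init_datapackage(['data/valid.csv'])
print(dp.descriptor['resources'][0]['name'])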