def test_create_datapackage_1():
    import os
    from ddf_utils.model.package import DDFcsv
    from ddf_utils.package import get_datapackage

    dataset_path = os.path.join(os.path.dirname(__file__),
                                'chef/datasets/ddf--gapminder--dummy_companies')

    # reusing the existing datapackage.json should keep custom fields
    dp1 = get_datapackage(dataset_path, use_existing=True)
    assert dp1['license'] == 'foo'

    # creating a fresh datapackage should populate all standard fields
    dp2 = get_datapackage(dataset_path, use_existing=False)
    for k in ['name', 'title', 'author', 'description', 'language', 'license']:
        assert k in dp2.keys()

    # generating the ddf schema should pick up the project entity set
    dp_ = DDFcsv.from_path(dataset_path)
    dp_.get_ddf_schema(update=True)
    datapackage = dp_.to_dict()
    assert 'ddfSchema' in datapackage.keys()
    d = {
        "primaryKey": ["project"],
        "value": None,
        "resources": ["ddf--entities--project"]
    }
    assert d in datapackage['ddfSchema']['entities']
def main():
    print('running etl...')
    data = pd.read_excel(open_google_spreadsheet(DOCID), sheet_name=SHEET)

    # datapoints: one file per measure column
    measures = list()
    for c, df in gen_datapoints(data):
        c_id = COLUMN_TO_CONCEPT[c]
        df.columns = [c_id]
        serve_datapoint(df, OUT_DIR, c_id)
        measures.append((c_id, c))

    # concepts
    measures_df = pd.DataFrame(measures, columns=['concept', 'name'])
    measures_df['concept_type'] = 'measure'
    dimensions_df = pd.DataFrame.from_dict(
        dict(concept=DIMENSIONS,
             name=list(map(str.title, DIMENSIONS)),
             concept_type=['entity_domain', 'time']))
    others_df = pd.DataFrame.from_dict(
        dict(concept=['name'], name=['name'], concept_type=['string']))
    (pd.concat([measures_df, dimensions_df, others_df], ignore_index=True)
       .to_csv(osp.join(OUT_DIR, 'ddf--concepts.csv'), index=False))

    # geo entity domain
    geo_df = create_geo_domain(data)
    geo_df.to_csv(osp.join(OUT_DIR, 'ddf--entities--geo.csv'), index=False)

    # datapackage
    dump_json(osp.join(OUT_DIR, 'datapackage.json'),
              get_datapackage(OUT_DIR, update=True))
def create_datapackage(path, update, overwrite):
    """create datapackage.json"""
    from ddf_utils.package import get_datapackage
    import json

    if not update and not overwrite:
        if os.path.exists(os.path.join(path, 'datapackage.json')):
            click.echo('datapackage.json already exists. '
                       'Use --update to update it or --overwrite to create a new one.')
            return
        res = get_datapackage(path, use_existing=False)
    else:
        if os.path.exists(os.path.join(path, 'datapackage.json')):
            click.echo('backing up previous datapackage.json...')
            # make a backup
            shutil.copy(os.path.join(path, 'datapackage.json'),
                        os.path.join(path, 'datapackage.json.bak'))
        if overwrite:
            res = get_datapackage(path, use_existing=False)
        else:
            res = get_datapackage(path, use_existing=True, update=True)

    with open(os.path.join(path, 'datapackage.json'), 'w', encoding='utf8') as f:
        json.dump(res, f, indent=4, ensure_ascii=False)
    click.echo('Done.')
def _gen_dp(d):
    # regenerate datapackage.json for the dataset directory `d`
    dp = get_datapackage(d, update=True)
    dump_json(osp.join(d, 'datapackage.json'), dp)
def guess_concept_type(series):  # function name assumed; the snippet starts mid-function
    """Guess a concept type from a pandas dtype."""
    if series.dtype in numeric_dtypes:
        return 'measure'
    if series.dtype == 'bool':
        return 'boolean'
    return 'string'


if __name__ == '__main__':
    df = pd.read_csv(source_path)

    # country entity
    country = df[['CountryCode', 'CountryName']]
    country = ddf_table(country, key=['country'],
                        renames=renames, id_concepts=id_concepts)

    # datapoints
    indicator_cols = [col for col in df.columns if col != 'CountryName']
    data = df[indicator_cols]
    data = ddf_table(data, key=['country', 'day'],
                     renames=renames, id_concepts=id_concepts)

    # concepts
    concepts = get_concepts([country, data])
    ddf_table(concepts, key=['concept'])

    # datapackage, with the source sha recorded for provenance
    dp = get_datapackage(output_dir, update=True)
    dp['source'] = {}
    with open(sha_path, 'r') as f:
        dp['source']['sha'] = f.readline()
    dp_path = osp.join(output_dir, 'datapackage.json')
    dump_json(dp_path, dp)
imported.append('Geo')
imported.append('Geo Name')
imported.append('Name')
imported.append('Year')

imported_dict = {k: concept_dict[k] for k in imported}
concepts_df['concept'] = list(imported_dict.values())
concepts_df['name'] = [x.strip() for x in imported_dict.keys()]

# all concepts default to measure; override the non-measure ones
concepts_df['concept_type'] = 'measure'
concepts_df = concepts_df.set_index('concept')
concepts_df.loc['geo', 'concept_type'] = 'entity_domain'
concepts_df.loc['name', 'concept_type'] = 'string'
concepts_df.loc['year', 'concept_type'] = 'time'
concepts_df.loc['geo_name', 'concept_type'] = 'string'

fn_concept = os.path.join(out_dir, 'ddf--concepts.csv')
concepts_df.sort_values(by=['concept_type', 'name']).to_csv(fn_concept)

# datapackage
dp = get_datapackage(out_dir, use_existing=True, update=True)
dump_json(os.path.join(out_dir, 'datapackage.json'), dp)

print('tabs not imported:')
for i in not_imported:
    print(i)
print('If any of these tabs should be imported, please modify the script.')
print('Done.')
def csvs_to_ddf(files, out_path):
    """convert raw files to ddfcsv

    Args
    ----
    files : list
        a list of file paths to build the ddf csv
    out_path : `str`
        the directory to put the ddf dataset
    """
    import os
    import re
    from os.path import join
    from ddf_utils.str import to_concept_id

    concepts_df = pd.DataFrame([['name', 'Name', 'string']],
                               columns=['concept', 'name', 'concept_type'])
    concepts_df = concepts_df.set_index('concept')

    all_entities = dict()

    pattern = r'indicators--by--([ 0-9a-zA-Z_-]*).csv'

    for f in files:
        data = pd.read_csv(f)
        basename = os.path.basename(f)
        keys = re.match(pattern, basename).groups()[0].split('--')
        keys_alphanum = list(map(to_concept_id, keys))

        # check if there is a time column. Assume the last column is time.
        try:
            pd.to_datetime(data[keys[-1]], format='%Y')
        except (ValueError, pd.errors.OutOfBoundsDatetime):
            has_time = False
        else:
            has_time = True

        if has_time:
            ent_keys = keys[:-1]
        else:
            ent_keys = keys

        # set concept types
        for col in data.columns:
            concept = to_concept_id(col)
            if col in keys:
                if col in ent_keys:
                    t = 'entity_domain'
                else:
                    t = 'time'
            else:
                t = 'measure'
            concepts_df.loc[concept] = [col, t]

        # collect entity values from each entity key column
        for ent in ent_keys:
            ent_df = data[[ent]].drop_duplicates().copy()
            ent_concept = to_concept_id(ent)
            ent_df.columns = ['name']
            ent_df[ent_concept] = ent_df.name.map(to_concept_id)
            if ent_concept not in all_entities.keys():
                all_entities[ent_concept] = ent_df
            else:
                all_entities[ent_concept] = pd.concat([all_entities[ent_concept], ent_df],
                                                      ignore_index=True)

        data = data.set_index(keys)

        # output datapoints
        for c in data:
            df = data[c].copy()
            df = df.reset_index()
            for k in ent_keys:  # map entity columns (not the time column) to concept ids
                df[k] = df[k].map(to_concept_id)
            df.columns = df.columns.map(to_concept_id)
            (df.dropna()
               .to_csv(join(out_path,
                            'ddf--datapoints--{}--by--{}.csv'.format(
                                to_concept_id(c), '--'.join(keys_alphanum))),
                       index=False))

    # output concepts
    concepts_df.to_csv(join(out_path, 'ddf--concepts.csv'))

    # output entities
    for c, df in all_entities.items():
        df.to_csv(join(out_path, 'ddf--entities--{}.csv'.format(c)), index=False)

    dp = get_datapackage(out_path, use_existing=False)
    dump_json(os.path.join(out_path, 'datapackage.json'), dp)

    return
cleanup(outputFolder)

concepts = pd.DataFrame()

# species entity domain
inputFile = os.path.join(inputFolder, 'bulk', 'taxonomy.csv')
outputFile = os.path.join(outputFolder, 'ddf--entities--species.csv')
df = processEntityDomain(inputFile, conceptDict={'internalTaxonId': 'species'})
df.to_csv(outputFile, index=False, encoding='utf8')
concepts = extractConcepts(df, concepts)

# assessment entity domain
inputFile = os.path.join(inputFolder, 'bulk', 'assessments.csv')
outputFile = os.path.join(outputFolder, 'ddf--entities--assessment.csv')
df = processEntityDomain(inputFile,
                         conceptDict={'internalTaxonId': 'species',
                                      'assessmentId': 'assessment'},
                         dropCols=['scientificName'])
# df = filterSpecies(df)
df.to_csv(outputFile, index=False, encoding='utf8')
concepts = extractConcepts(df, concepts)

# concepts
outputFile = os.path.join(outputFolder, 'ddf--concepts.csv')
concepts = concepts.rename_axis('concept')
concepts.to_csv(outputFile, index=True, encoding='utf8')

# datapackage
dump_json(os.path.join(outputFolder, 'datapackage.json'),
          get_datapackage(outputFolder, update=True))