Example #1
def test_create_datapackage_1():
    import os
    from ddf_utils.model.package import DDFcsv
    from ddf_utils.package import get_datapackage

    dataset_path = os.path.join(os.path.dirname(__file__),
                                'chef/datasets/ddf--gapminder--dummy_companies')

    dp1 = get_datapackage(dataset_path, use_existing=True)
    assert dp1['license'] == 'foo'

    dp2 = get_datapackage(dataset_path, use_existing=False)
    for k in ['name', 'title', 'author', 'description', 'language', 'license']:
        assert k in dp2.keys()

    dp_ = DDFcsv.from_path(dataset_path)
    dp_.get_ddf_schema(update=True)
    datapackage = dp_.to_dict()
    assert 'ddfSchema' in datapackage.keys()

    d = {
        "primaryKey": [
            "project"
        ],
        "value": None,
        "resources": [
            "ddf--entities--project"
        ]
    }
    assert d in datapackage['ddfSchema']['entities']
Example #2
def main():
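    # DOCID, SHEET, OUT_DIR, DIMENSIONS, COLUMN_TO_CONCEPT and the helpers
    # used below (open_google_spreadsheet, gen_datapoints, serve_datapoint,
    # create_geo_domain, dump_json, get_datapackage) are defined at module
    # level in the source script.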
    print('running etl...')
    data = pd.read_excel(open_google_spreadsheet(DOCID), sheet_name=SHEET)

    measures = list()

    for c, df in gen_datapoints(data):
        c_id = COLUMN_TO_CONCEPT[c]
        df.columns = [c_id]
        serve_datapoint(df, OUT_DIR, c_id)

        measures.append((c_id, c))

    measures_df = pd.DataFrame(measures, columns=['concept', 'name'])
    measures_df['concept_type'] = 'measure'

    dimensions_df = pd.DataFrame.from_dict(
        dict(concept=DIMENSIONS,
             name=list(map(str.title, DIMENSIONS)),
             concept_type=['entity_domain', 'time']))
    others_df = pd.DataFrame.from_dict(
        dict(concept=['name'], name=['name'], concept_type=['string']))
    concepts = pd.concat([measures_df, dimensions_df, others_df],
                         ignore_index=True)
    concepts.to_csv(osp.join(OUT_DIR, 'ddf--concepts.csv'), index=False)

    geo_df = create_geo_domain(data)
    geo_df.to_csv(osp.join(OUT_DIR, 'ddf--entities--geo.csv'), index=False)

    # datapackage
    dump_json(osp.join(OUT_DIR, 'datapackage.json'),
              get_datapackage(OUT_DIR, update=True))
Example #3
File: cli.py  Project: semio/ddf_utils
def create_datapackage(path, update, overwrite):
    """create datapackage.json"""
    from ddf_utils.package import get_datapackage
    import json
    if not update and not overwrite:
        if os.path.exists(os.path.join(path, 'datapackage.json')):
            click.echo('datapackage.json already exists. Use --update to update it or --overwrite to create a new one.')
            return
        res = get_datapackage(path, use_existing=False)
    else:
        if os.path.exists(os.path.join(path, 'datapackage.json')):
            click.echo('backing up previous datapackage.json...')
            # make a backup
            shutil.copy(os.path.join(path, 'datapackage.json'),
                        os.path.join(path, 'datapackage.json.bak'))
        if overwrite:
            res = get_datapackage(path, use_existing=False)
        else:
            res = get_datapackage(path, use_existing=True, update=True)

    with open(os.path.join(path, 'datapackage.json'), 'w', encoding='utf8') as f:
        json.dump(res, f, indent=4, ensure_ascii=False)
    click.echo('Done.')
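The snippet above is only the command body; a plausible click decorator stack, inferred from the function parameters and the flag names in the echo message (the exact declarations in cli.py may differ):

import click

@click.command()
@click.argument('path')
@click.option('--update', is_flag=True, help='update the existing datapackage.json')
@click.option('--overwrite', is_flag=True, help='back up the old file and generate a new one')
def create_datapackage(path, update, overwrite):
    ...  # body as in the example above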
Example #4
def _gen_dp(d):
    # Generate and save datapackage.json for the dataset directory d.
    dp = get_datapackage(d, update=True)
    dump_json(osp.join(d, 'datapackage.json'), dp)  # was `dataset`, which is undefined here


def _guess_concept_type(series):
    # Hypothetical name/signature: the def line for this helper is missing
    # from the snippet. It maps a pandas dtype to a DDF concept_type;
    # numeric_dtypes is defined elsewhere in the source script.
    if series.dtype in numeric_dtypes:
        return 'measure'
    if series.dtype == 'bool':
        return 'boolean'
    return 'string'


if __name__ == '__main__':
    # source_path, renames, id_concepts, output_dir, sha_path and the
    # ddf_table / get_concepts helpers are defined earlier in the source script.
    df = pd.read_csv(source_path)

    # country entity
    country = df[['CountryCode', 'CountryName']]
    country = ddf_table(country, key=['country'], renames=renames, id_concepts=id_concepts)

    # datapoints
    # (a list, not a lazy filter object: pandas column indexing needs a list-like)
    indicator_cols = [col for col in df.columns if col != 'CountryName']
    data = df[indicator_cols]
    data = ddf_table(data, key=['country', 'day'], renames=renames, id_concepts=id_concepts)

    # concepts
    concepts = get_concepts([country, data])
    ddf_table(concepts, key=['concept'])

    # datapackage
    dp = get_datapackage(output_dir, update=True)
    dp['source'] = {}
    with open(sha_path, 'r') as f:
        dp['source']['sha'] = f.readline()
    dp_path = osp.join(output_dir, 'datapackage.json')
    dump_json(dp_path, dp)
Example #6
    # imported, not_imported, concept_dict, concepts_df and out_dir are
    # built earlier in the source script.
    imported.extend(['Geo', 'Geo Name', 'Name', 'Year'])

    imported_dict = {k: concept_dict[k] for k in imported}

    # wrap in list(): a dict view is not safely coerced to a column
    concepts_df['concept'] = list(imported_dict.values())
    concepts_df['name'] = [x.strip() for x in imported_dict.keys()]
    concepts_df['concept_type'] = 'measure'

    concepts_df = concepts_df.set_index('concept')
    concepts_df.loc['geo', 'concept_type'] = 'entity_domain'
    concepts_df.loc['name', 'concept_type'] = 'string'
    concepts_df.loc['year', 'concept_type'] = 'time'
    concepts_df.loc['geo_name', 'concept_type'] = 'string'

    fn_concept = os.path.join(out_dir, 'ddf--concepts.csv')
    concepts_df.sort_values(by=['concept_type', 'name']).to_csv(fn_concept)

    # datapackage
    dp = get_datapackage(out_dir, use_existing=True, update=True)
    dump_json(os.path.join(out_dir, 'datapackage.json'), dp)

    print('tabs not imported:')
    for i in not_imported:
        print(i)
    print('If any of these tabs should be imported, please modify the script.')
    print('Done.')
Example #7
File: io.py  Project: semio/ddf_utils
def csvs_to_ddf(files, out_path):
    """convert raw files to ddfcsv

    Args
    ----
    files : list
        a list of file paths to build ddf csv
    out_path : str
        the directory to put the ddf dataset

    """
    import re
    from os.path import join
    from ddf_utils.str import to_concept_id

    concepts_df = pd.DataFrame([['name', 'Name', 'string']],
                               columns=['concept', 'name', 'concept_type'])
    concepts_df = concepts_df.set_index('concept')

    all_entities = dict()

    pattern = r'indicators--by--([ 0-9a-zA-Z_-]*).csv'

    for f in files:
        data = pd.read_csv(f)
        basename = os.path.basename(f)
        keys = re.match(pattern, basename).groups()[0].split('--')
        keys_alphanum = list(map(to_concept_id, keys))

        # check if there is a time column. Assume last column is time.
        try:
            pd.to_datetime(data[keys[-1]], format='%Y')
        except (ValueError, pd.errors.OutOfBoundsDatetime):
            # pd.tslib was removed from pandas; OutOfBoundsDatetime now
            # lives in pandas.errors.
            has_time = False
        else:
            has_time = True

        if has_time:
            ent_keys = keys[:-1]
        else:
            ent_keys = keys

        # set concept type
        for col in data.columns:
            concept = to_concept_id(col)

            if col in keys:
                if col in ent_keys:
                    t = 'entity_domain'
                else:
                    t = 'time'
            else:
                t = 'measure'

            concepts_df.loc[concept] = [col, t]

        for ent in ent_keys:
            ent_df = data[[ent]].drop_duplicates().copy()
            ent_concept = to_concept_id(ent)
            ent_df.columns = ['name']
            ent_df[ent_concept] = ent_df.name.map(to_concept_id)

            if ent_concept not in all_entities.keys():
                all_entities[ent_concept] = ent_df
            else:
                all_entities[ent_concept] = pd.concat([all_entities[ent_concept], ent_df],
                                                      ignore_index=True)

        data = data.set_index(keys)
        for c in data:
            # output datapoints
            df = data[c].copy()
            df = df.reset_index()
            for k in keys[:-1]:
                df[k] = df[k].map(to_concept_id)
            df.columns = df.columns.map(to_concept_id)
            (df.dropna()
               .to_csv(join(out_path,
                            'ddf--datapoints--{}--by--{}.csv'.format(
                                to_concept_id(c), '--'.join(keys_alphanum))),
                       index=False))

    # output concepts
    concepts_df.to_csv(join(out_path, 'ddf--concepts.csv'))

    # output entities
    for c, df in all_entities.items():
        df.to_csv(join(out_path, 'ddf--entities--{}.csv'.format(c)), index=False)

    dp = get_datapackage(out_path, use_existing=False)
    dump_json(os.path.join(out_path, 'datapackage.json'), dp)

    return
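A usage sketch for `csvs_to_ddf`, assuming source files named to match the `indicators--by--...` pattern the regex expects (the paths are hypothetical):

import glob
from ddf_utils.io import csvs_to_ddf

# filenames must match indicators--by--<key>[--<key>...].csv,
# e.g. raw/indicators--by--country--year.csv
files = glob.glob('raw/indicators--by--*.csv')
csvs_to_ddf(files, 'out/ddf--my--dataset')  # the output directory must already exist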
Example #8
# cleanup, processEntityDomain, extractConcepts, inputFolder and
# outputFolder are defined earlier in the source script.
cleanup(outputFolder)

concepts = pd.DataFrame()

# species entity domain
inputFile = os.path.join(inputFolder, 'bulk', 'taxonomy.csv')
outputFile = os.path.join(outputFolder, 'ddf--entities--species.csv')
df = processEntityDomain(inputFile, conceptDict={'internalTaxonId': 'species'})
df.to_csv(outputFile, index=False, encoding='utf8')
concepts = extractConcepts(df, concepts)

# assessment entity domain
inputFile = os.path.join(inputFolder, 'bulk', 'assessments.csv')
outputFile = os.path.join(outputFolder, 'ddf--entities--assessment.csv')
df = processEntityDomain(inputFile,
                         conceptDict={'internalTaxonId': 'species', 'assessmentId': 'assessment'},
                         dropCols=['scientificName'])
# df = filterSpecies(df)
df.to_csv(outputFile, index=False, encoding='utf8')
concepts = extractConcepts(df, concepts)

# concepts
outputFile = os.path.join(outputFolder, 'ddf--concepts.csv')
concepts = concepts.rename_axis('concept')
concepts.to_csv(outputFile, index=True, encoding='utf8')

# datapackage
dump_json(os.path.join(outputFolder, 'datapackage.json'), get_datapackage(outputFolder, update=True))
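Taken together, the examples exercise three calling patterns of `get_datapackage`; a condensed recap (the keyword defaults are inferred from the calls above, not from the library docs):

from ddf_utils.package import get_datapackage

path = 'path/to/ddf--dataset'  # hypothetical dataset directory

dp = get_datapackage(path, use_existing=False)              # build fresh metadata (Examples #1, #3, #7)
dp = get_datapackage(path, use_existing=True)               # reuse the existing datapackage.json (Example #1)
dp = get_datapackage(path, use_existing=True, update=True)  # refresh an existing file in place (Examples #3, #6)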