예제 #1
0
        # NOTE(review): this is the interior of a larger routine; its header and
        # the definitions of datasets_list, down_dir, api_down and write_file
        # are outside the visible chunk.
        # Collect the dataverse alias of every dataset, then deduplicate so
        # each dataverse is handled only once.
        dv_list = [ds['dataverse'] for ds in datasets_list]
        dv_list = set(dv_list)

        # Create one directory per dataverse and store its metadata as JSON
        for dv in dv_list:
            down_dataverse_dir = down_dir + '/dv_{0}'.format(dv)
            # os.mkdir creates only the leaf directory, so down_dir itself
            # must already exist at this point.
            if not os.path.isdir(down_dataverse_dir):
                os.mkdir(down_dataverse_dir)
            resp_dv = api_down.get_dataverse(dv)
            # Only the 'data' entry of the response is persisted.
            write_file(down_dataverse_dir + '/dv_' + dv + '_metadata.json',
                       json.dumps(resp_dv['data']))

        # Loop over all datasets
        for ds in datasets_list:
            # Get metadata of dataset
            resp_ds = api_down.get_dataset(ds['doi'])
            # Second '/'-separated component of the DOI string.
            # NOTE(review): for a DOI with more than one '/', this is the
            # middle part rather than the trailing id - confirm the format
            # of ds['doi'].
            identifier = ds['doi'].split('/')[1]

            # Create directory for dataset inside its dataverse directory
            # (which the loop above has already created).
            down_dataset_dir = down_dir + '/dv_' + ds[
                'dataverse'] + '/ds_' + identifier
            if not os.path.isdir(down_dataset_dir):
                os.mkdir(down_dataset_dir)

            # Save dataset metadata as json file
            filename_metadata = down_dataset_dir + '/ds_' + identifier + '_metadata.json'
            write_file(filename_metadata, json.dumps(resp_ds['data']))

            # Loop over all datafiles listed in the dataset's latest version
            for df in resp_ds['data']['latestVersion']['files']:
                # File id converted to a string; how it is used is not
                # visible here because the loop body continues past this chunk.
                file_id = str(df['dataFile']['id'])
예제 #2
0
import requests
from functools import reduce
import matplotlib.pyplot as plt
import math

## Acquiring data from APIs
# Establish the connection.
# NOTE(review): `Api` is not imported in the visible part of the file -
# presumably the pyDataverse client; verify against the full import list.
base_url = 'https://dataverse.harvard.edu/'
api = Api(base_url)
# Print the status attribute reported by the client.
print(api.status)

# Digital object identifier (DOI) of the Harvard Dataverse dataset to fetch
DOI = "doi:10.7910/DVN/HIDLTK"

# Retrieve the dataset record; its JSON payload is decoded just below
covid = api.get_dataset(DOI)

# File entries of the dataset's latest version
covid_files_list = covid.json()['data']['latestVersion']['files']

# Print the name and id of every file in the dataset
for fileObject in covid_files_list:
    print("File name is {}; id is {}".format(
        fileObject["dataFile"]["filename"], fileObject["dataFile"]["id"]))

# Download a single data file by a hard-coded id.
# NOTE(review): presumably this id was picked from the listing printed
# above - confirm it still matches the intended file.
US_states_cases_file = api.get_datafile("4201597")

# Keep the response body and the local file name it is destined for.
# NOTE(review): `.content` is presumably raw bytes, given the "wb" mode below.
in_text = US_states_cases_file.content
tmp = "US_states_cases.tab"

# Open the target file for binary writing.
# NOTE(review): the matching write/close is not visible in this chunk; a
# `with open(...)` block would guarantee the handle is released.
f = open(tmp, "wb")
def _read_tab_geo_frame(text, geo_columns, dtype_col):
    '''
    Parses tab-separated text into a typed dataframe of the geo columns.

    Input
        text (str): file content; the first line holds the column names
        geo_columns (list): names of the columns to keep
        dtype_col (dict): column name -> dtype used for the final cast

    Output
        (Pandas dataframe): rows that have a latitude, restricted to
        geo_columns and cast according to dtype_col
    '''
    rows = text.split('\n')
    headers = rows[0].split('\t')
    # Look the latitude column up once instead of once per row.
    lat_idx = headers.index('GPSLatitude')

    data_rows = []
    for row in rows[1:]:
        if row == '':
            continue
        fields = row.replace('"', '').split('\t')
        # Test the cleaned field: a latitude that is empty after quote
        # removal cannot be cast to float32, so the row is dropped.
        if fields[lat_idx] != '':
            data_rows.append(fields)

    frame = pd.DataFrame(data_rows, columns=headers)
    return frame[geo_columns].astype(dtype_col)


def _clean_xlsx_cell(value):
    '''
    Normalises one worksheet cell: the blank markers '' and '--' become
    the string 'nan', other strings lose their double quotes, and
    non-string values are returned unchanged.
    '''
    if value in ('', '--'):
        return 'nan'
    if isinstance(value, str):
        return value.replace('"', '')
    return value


def _read_xlsx_geo_frame(content, geo_columns, dtype_col):
    '''
    Reads the first sheet of an Excel workbook into a typed dataframe
    of the geo columns.

    Input
        content (bytes): raw workbook content
        geo_columns (list): names of the columns to keep
        dtype_col (dict): column name -> dtype of the resulting column

    Output
        (Pandas dataframe): one column per entry of geo_columns
    '''
    workbook = xlrd.open_workbook(file_contents=content)
    worksheet = workbook.sheet_by_index(0)
    # Header cells are matched with spaces removed.
    col_names = [name.replace(' ', '') for name in worksheet.row_values(0)]

    columns = {}
    for col in geo_columns:
        cells = worksheet.col_values(col_names.index(col), start_rowx=1)
        columns[col] = pd.Series([_clean_xlsx_cell(cell) for cell in cells],
                                 dtype=dtype_col[col])
    return pd.DataFrame(columns)


def get_fsp_data_through_api(base_url, identifier):
    '''
    Takes base URL and identifier of the FSP data,
    and returns the Pandas dataframe of the file

    Every '.tab' and '.xlsx' file listed in the latest version of the
    dataset is downloaded and reduced to the columns FormName, County,
    GPSLatitude and GPSLongitude. Files with any other extension are
    skipped. Rows whose County equals the string 'nan' are dropped, and a
    'geometry' column holding Point(longitude, latitude) is added.

    Input
        base_url (str): URL of the website
        identifier (str): identifier of the desired data file

    Output
        df (Pandas dataframe): dataframe of the FSP data
    '''

    dtype_col = {
        'FormName': 'str',
        'County': 'str',
        'GPSLatitude': 'float32',
        'GPSLongitude': 'float32'
    }
    geo_columns = list(dtype_col)

    api = Api(base_url)
    resp_dataset = api.get_dataset(identifier)
    files = json.loads(resp_dataset.text)['data']['latestVersion']['files']

    frames = []
    for file_info in files:
        data_file = file_info['dataFile']
        file_extension = data_file['filename'].split('.')[-1]
        # Decide before downloading: an unsupported file must neither cost
        # a request nor reuse the frame parsed for the previous file.
        if file_extension not in ('tab', 'xlsx'):
            continue

        resp_datafile = api.get_datafile(data_file['id'])
        if file_extension == 'tab':
            df_file = _read_tab_geo_frame(resp_datafile.text,
                                          geo_columns, dtype_col)
        else:
            df_file = _read_xlsx_geo_frame(resp_datafile.content,
                                           geo_columns, dtype_col)

        frames.append(df_file[df_file['County'] != 'nan'])

    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame({col: [] for col in geo_columns})

    # Built from the two columns directly so that an empty frame simply
    # yields an empty 'geometry' column.
    df['geometry'] = [
        Point(float(lon), float(lat))
        for lon, lat in zip(df['GPSLongitude'], df['GPSLatitude'])
    ]

    return df