예제 #1
0
            # NOTE(review): this fragment is the tail of a per-dataset loop
            # defined above this chunk — `identifier`, `resp_ds`,
            # `down_dataset_dir`, `api_down` and `write_file` come from that
            # outer scope and are not visible here.

            # Save dataset metadata as json file
            filename_metadata = down_dataset_dir + '/ds_' + identifier + '_metadata.json'
            write_file(filename_metadata, json.dumps(resp_ds['data']))

            # Loop over all datafiles of a dataset
            for df in resp_ds['data']['latestVersion']['files']:
                file_id = str(df['dataFile']['id'])

                # Create directory for datafile
                datafile_dir = down_dataset_dir + '/df_' + file_id
                if not os.path.isdir(datafile_dir):
                    os.mkdir(datafile_dir)

                # Download and save datafile file
                # 'content' selects the raw file payload; written in binary
                # mode ('wb') since the datafile may not be text.
                resp = api_down.get_datafile(file_id, 'content')
                filename_datafile = datafile_dir + '/df_' + str(
                    df['dataFile']['filename'])
                write_file(filename_datafile, resp.content, 'wb')

    if UPLOAD_DATA:
        # Feature flags for the individual upload steps; all set to False
        # here, so no create/delete call runs unless one is flipped on.
        CREATE_DV = False
        DELETE_DV = False
        CREATE_DS = False
        ADD_FILE = False
        DELETE_DS = False
        CREATE_DF = False

        # Credentials/host for the upload target come from the environment;
        # KeyError here means the variables are not exported.
        api_token_up = os.environ["API_TOKEN_UP"]
        api_host_up = os.environ["API_HOST_UP"]
        # NOTE(review): use_https=False sends the API token over plain
        # HTTP — confirm this is only used against a local/test instance.
        api_up = Api(api_host_up, api_token=api_token_up, use_https=False)
예제 #2
0
# Show the API connection status (`api` is constructed earlier in the file).
print(api.status)

# get the digital object identifier for the Harvard Dataverse dataset
DOI = "doi:10.7910/DVN/HIDLTK"

# retrieve the contents of the dataset
covid = api.get_dataset(DOI)

# List of file descriptors for the dataset's latest published version.
covid_files_list = covid.json()['data']['latestVersion']['files']

for fileObject in covid_files_list:
    print("File name is {}; id is {}".format(
        fileObject["dataFile"]["filename"], fileObject["dataFile"]["id"]))

# get data file (id taken from the listing printed above)
US_states_cases_file = api.get_datafile("4201597")

# convert: persist the raw bytes to disk, then parse as a TSV table.
in_text = US_states_cases_file.content
tmp = "US_states_cases.tab"

# FIX: use a context manager so the file handle is closed even if the
# write raises (the original open/write/close leaked on error).
with open(tmp, "wb") as f:
    f.write(in_text)

US_states_cases = pd.read_csv(tmp, sep='\t')

print(US_states_cases.head(10))

## Cleaning data
# select columns of interest
def get_fsp_data_through_api(base_url, identifier):
    '''
    Takes base URL and identifier of the FSP data,
    and returns the Pandas dataframe of the file

    Input
        base_url (str): URL of the website
        identifier (str): identifier of the desired data file

    Output
        df (Pandas dataframe): dataframe of the FSP data with the four
        geo columns plus a shapely 'geometry' Point per row
    '''

    # Column -> dtype mapping; its keys double as the column selection.
    dtype_col = {
        'FormName': 'str',
        'County': 'str',
        'GPSLatitude': 'float32',
        'GPSLongitude': 'float32'
    }
    geo_columns = list(dtype_col.keys())

    api = Api(base_url)
    resp_dataset = api.get_dataset(identifier)

    files = json.loads(resp_dataset.text)['data']['latestVersion']['files']
    # Seed with an empty frame so the result keeps the expected columns
    # even when the dataset contains no parsable files.
    frames = [pd.DataFrame({col: [] for col in geo_columns})]

    for file in files:
        file_id = file['dataFile']['id']
        resp_datafile = api.get_datafile(file_id)
        file_extension = file['dataFile']['filename'].split('.')[-1]

        if file_extension == 'tab':
            # Tab-separated text: first row is the header, rows with an
            # empty GPSLatitude cell are dropped.
            rows = resp_datafile.text.split('\n')
            headers = rows[0].split('\t')
            lat_idx = headers.index('GPSLatitude')  # hoisted out of the loop
            data_rows = \
            [row.replace('"', '').split('\t')
             for row in rows[1:] if row != ''
             and row.split('\t')[lat_idx] != '']
            df_file = \
            pd.DataFrame(data_rows,
                         columns=headers)[geo_columns].astype(dtype_col)
        elif file_extension == 'xlsx':
            # NOTE(review): xlrd >= 2.0 dropped xlsx support; this branch
            # requires xlrd < 2.0 — confirm the pinned version.
            workbook = xlrd.open_workbook(file_contents=resp_datafile.content)
            worksheet = workbook.sheet_by_index(0)
            # Header row may contain spaces ("GPS Latitude") — strip them
            # so names line up with geo_columns.
            col_names = [
                col_name.replace(" ", "")
                for col_name in worksheet.row_values(0)
            ]
            df_file = pd.DataFrame({col: [] for col in geo_columns})
            for col in geo_columns:
                data_col = worksheet.col_values(col_names.index(col),
                                                start_rowx=1)
                for idx_data, data in enumerate(data_col):
                    if isinstance(data, str):
                        data_col[idx_data] = data.replace('"', '')
                    # '' and '--' are the sheet's missing-value markers
                    if data in ['', '--']:
                        data_col[idx_data] = 'nan'
                df_file[col] = pd.Series(data_col, dtype=dtype_col[col])
        else:
            # BUG FIX: the original fell through here and re-appended the
            # previous file's frame (or raised NameError on the first file)
            # for any other extension; skip unknown file types instead.
            continue

        # Keep only rows with a real County value.
        frames.append(df_file[df_file['County'] != 'nan'])

    # DataFrame.append was removed in pandas 2.0 — and appending in a loop
    # is quadratic; build the result once with concat.
    df = pd.concat(frames, ignore_index=True)

    # Attach a shapely Point per row (lon, lat order, as Point expects).
    df['geometry'] = \
    df.apply(lambda x: Point(float(x['GPSLongitude']),
                             float(x['GPSLatitude'])), axis=1)

    return df