# Download metadata for every dataverse and dataset collected earlier.
# Assumes `datasets_list`, `down_dir`, `api_down` (a Dataverse API client) and
# `write_file` (as in pyDataverse.utils) are defined in the preceding steps.
import json
import os

dv_list = [ds['dataverse'] for ds in datasets_list]
dv_list = set(dv_list)

# Create directories for all dataverses and download the metadata
for dv in dv_list:
    down_dataverse_dir = down_dir + '/dv_{0}'.format(dv)
    if not os.path.isdir(down_dataverse_dir):
        os.mkdir(down_dataverse_dir)
    resp_dv = api_down.get_dataverse(dv)
    write_file(down_dataverse_dir + '/dv_' + dv + '_metadata.json',
               json.dumps(resp_dv['data']))

# Loop over all datasets
for ds in datasets_list:
    # Get metadata of dataset
    resp_ds = api_down.get_dataset(ds['doi'])
    identifier = ds['doi'].split('/')[1]

    # Create directory for dataset
    down_dataset_dir = down_dir + '/dv_' + ds['dataverse'] + '/ds_' + identifier
    if not os.path.isdir(down_dataset_dir):
        os.mkdir(down_dataset_dir)

    # Save dataset metadata as json file
    filename_metadata = down_dataset_dir + '/ds_' + identifier + '_metadata.json'
    write_file(filename_metadata, json.dumps(resp_ds['data']))

    # Loop over all datafiles of a dataset
    for df in resp_ds['data']['latestVersion']['files']:
        file_id = str(df['dataFile']['id'])
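        # Sketch (an assumption, not the original continuation): download each
        # datafile by id and write its raw bytes next to the dataset metadata.
        # Assumes api_down.get_datafile() returns a response whose .content holds
        # the file bytes (as in pyDataverse); the filename scheme is illustrative.
        resp_df = api_down.get_datafile(file_id)
        filename_datafile = down_dataset_dir + '/df_' + file_id
        with open(filename_datafile, 'wb') as f_out:
            f_out.write(resp_df.content)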
import requests
from functools import reduce
import matplotlib.pyplot as plt
import math
# Assumed: the pyDataverse client providing get_dataset/get_datafile
# (this import was missing from the original snippet)
from pyDataverse.api import Api

## Acquiring data from APIs

# establish connection
base_url = 'https://dataverse.harvard.edu/'
api = Api(base_url)
print(api.status)

# get the digital object identifier for the Harvard Dataverse dataset
DOI = "doi:10.7910/DVN/HIDLTK"

# retrieve the contents of the dataset
covid = api.get_dataset(DOI)
covid_files_list = covid.json()['data']['latestVersion']['files']

for fileObject in covid_files_list:
    print("File name is {}; id is {}".format(
        fileObject["dataFile"]["filename"], fileObject["dataFile"]["id"]))

# get data file
US_states_cases_file = api.get_datafile("4201597")

# convert
in_text = US_states_cases_file.content
tmp = "US_states_cases.tab"
f = open(tmp, "wb")
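# Sketch of the likely next step (an assumption, not the original code): write the
# downloaded bytes to the temporary .tab file and load it into pandas. Dataverse
# serves .tab exports as tab-separated text, hence sep='\t' below.
import pandas as pd

f.write(in_text)
f.close()
US_states_cases = pd.read_csv(tmp, sep='\t')
print(US_states_cases.head())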
# Imports assumed from usage in the function below: pandas, xlrd for the .xlsx
# branch, shapely for the Point geometry, and the pyDataverse Api client.
import json

import pandas as pd
import xlrd
from pyDataverse.api import Api
from shapely.geometry import Point


def get_fsp_data_through_api(base_url, identifier):
    '''
    Takes the base URL and identifier of the FSP data, and returns a Pandas
    dataframe of the file.

    Input
        base_url (str): URL of the website
        identifier (str): identifier of the desired data file

    Output
        df (Pandas dataframe): dataframe of the FSP data
    '''
    dtype_col = {
        'FormName': 'str',
        'County': 'str',
        'GPSLatitude': 'float32',
        'GPSLongitude': 'float32'
    }
    geo_columns = list(dtype_col.keys())

    api = Api(base_url)
    resp_dataset = api.get_dataset(identifier)
    files = json.loads(resp_dataset.text)['data']['latestVersion']['files']

    df = pd.DataFrame({col: [] for col in geo_columns})
    for file in files:
        file_id = file['dataFile']['id']
        resp_datafile = api.get_datafile(file_id)
        file_extension = file['dataFile']['filename'].split('.')[-1]

        if file_extension == 'tab':
            # Parse the tab-separated export, dropping rows without a latitude
            rows = resp_datafile.text.split('\n')
            headers = rows[0].split('\t')
            data_rows = [
                row.replace('"', '').split('\t') for row in rows[1:]
                if row != ''
                and row.split('\t')[headers.index('GPSLatitude')] != ''
            ]
            df_file = pd.DataFrame(
                data_rows, columns=headers)[geo_columns].astype(dtype_col)
        elif file_extension == 'xlsx':
            # Read the spreadsheet column by column, normalising blanks to 'nan'
            workbook = xlrd.open_workbook(file_contents=resp_datafile.content)
            worksheet = workbook.sheet_by_index(0)
            col_names = [
                col_name.replace(" ", "")
                for col_name in worksheet.row_values(0)
            ]
            df_file = pd.DataFrame({col: [] for col in geo_columns})
            for col in geo_columns:
                data_col = worksheet.col_values(col_names.index(col),
                                                start_rowx=1)
                for idx_data, data in enumerate(data_col):
                    if type(data) == str:
                        data_col[idx_data] = data.replace('"', '')
                    if data in ['', '--']:
                        data_col[idx_data] = 'nan'
                df_file[col] = pd.Series(data_col, dtype=dtype_col[col])

        # pd.concat replaces the removed DataFrame.append, same result
        df = pd.concat([df, df_file[df_file['County'] != 'nan']],
                       ignore_index=True)

    df['geometry'] = df.apply(
        lambda x: Point(float(x['GPSLongitude']), float(x['GPSLatitude'])),
        axis=1)
    return df
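# Example usage (a sketch: the base URL matches the earlier snippet, but the
# identifier below is a placeholder DOI, not the actual FSP dataset identifier).
if __name__ == '__main__':
    fsp_df = get_fsp_data_through_api(
        base_url='https://dataverse.harvard.edu/',
        identifier='doi:10.7910/DVN/XXXXXX')  # placeholder identifier
    print(fsp_df.head())
    print('Number of FSP locations:', len(fsp_df))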