def get_catalog_urls(json_catalog, agency_lookup=None):
    """Return one URL-summary dict per dataset in *json_catalog*.

    Args:
        json_catalog: iterable of data.json dataset entries (dicts).
        agency_lookup: bureau-code -> agency-name mapping; when omitted or
            empty it is loaded via tools.load_agency_lookup().

    Returns:
        list of dicts as produced by get_dataset_url_dict().
    """
    # Default was a mutable `{}` (shared across calls); use a None sentinel.
    # `if not agency_lookup` also reloads when an empty dict is passed
    # explicitly, matching the original `agency_lookup == {}` check.
    if not agency_lookup:
        agency_lookup = tools.load_agency_lookup()
    return [
        get_dataset_url_dict(dataset, agency_lookup, index)
        for index, dataset in enumerate(json_catalog)
    ]
def get_catalog_urls(json_catalog, agency_lookup=None):
    """Collect the URL-summary dict for every dataset in *json_catalog*.

    Args:
        json_catalog: iterable of data.json dataset entries (dicts).
        agency_lookup: bureau-code -> agency-name mapping; loaded via
            tools.load_agency_lookup() when omitted or empty.

    Returns:
        list of dicts from get_dataset_url_dict(), in catalog order.
    """
    # Avoid the mutable-default pitfall (`agency_lookup={}`): a None sentinel
    # plus a falsy check preserves the original "empty dict means load it"
    # behavior without sharing one dict object across calls.
    if not agency_lookup:
        agency_lookup = tools.load_agency_lookup()
    catalog_urls = []
    for index, dataset in enumerate(json_catalog):
        catalog_urls.append(get_dataset_url_dict(dataset, agency_lookup, index))
    return catalog_urls
def get_dataset_url_dict(dataset, agency_lookup=None, index=0):
    """Build a summary dict (id, title, agency, url) for one catalog dataset.

    Args:
        dataset: one data.json dataset entry (dict).
        agency_lookup: bureau-code -> agency-name mapping; loaded via
            tools.load_agency_lookup() when omitted or empty.
        index: position of the dataset in the catalog, used only to
            synthesize placeholder id/title values for missing keys.

    Returns:
        dict with keys 'id', 'title', 'agency', 'url'.
    """
    # Mutable default `{}` replaced with a None sentinel; falsy check keeps
    # the original "empty dict means load it" behavior.
    if not agency_lookup:
        agency_lookup = tools.load_agency_lookup()
    dataset_id = dataset.get('identifier', '(Missing_identifier_' + str(index) + ')')
    dataset_title = dataset.get('title', '(Missing_title_' + str(index) + ')')
    dataset_urls = get_dataset_urls(dataset)
    # Take only the first element of the bureauCode list. The original
    # fallback was the *string* '[Other]' whose [0] is '[' — almost certainly
    # meant to be the list ['Other']. Either value misses agency_lookup and
    # falls through to 'Other' below, so this fix is behavior-equivalent.
    dataset_bureau_code = dataset.get('bureauCode', ['Other'])[0]
    # FIXME: use tools.get_key_list(), because don't have to deal with hierarchy fluctuations
    dataset_agency = agency_lookup.get(dataset_bureau_code, 'Other')
    return {
        'id': dataset_id,
        'title': dataset_title,
        'agency': dataset_agency,
        'url': dataset_urls,
    }
def get_dataset_url_dict(dataset, agency_lookup=None, index=0):
    """Summarize one catalog dataset as {'id', 'title', 'agency', 'url'}.

    Args:
        dataset: one data.json dataset entry (dict).
        agency_lookup: bureau-code -> agency-name mapping; when omitted or
            empty it is loaded via tools.load_agency_lookup().
        index: catalog position, used to build placeholder id/title strings
            when 'identifier'/'title' are absent.

    Returns:
        dict with keys 'id', 'title', 'agency', 'url'.
    """
    # None sentinel instead of a shared mutable `{}` default; the falsy test
    # still reloads when an explicit empty dict is supplied.
    if not agency_lookup:
        agency_lookup = tools.load_agency_lookup()
    dataset_id = dataset.get(
        'identifier', '(Missing_identifier_' + str(index) + ')')
    dataset_title = dataset.get('title', '(Missing_title_' + str(index) + ')')
    dataset_urls = get_dataset_urls(dataset)
    # Take only 1st element of bureau_code list. Original default was the
    # string '[Other]' (so [0] gave '['); ['Other'] is the intended list and
    # produces the same final agency because both miss the lookup below.
    dataset_bureau_code = dataset.get('bureauCode', ['Other'])[0]
    # FIXME: use tools.get_key_list(), because don't have to deal with hierarchy fluctuations
    dataset_agency = agency_lookup.get(dataset_bureau_code, 'Other')
    dataset_url_dict = {
        'id': dataset_id,
        'title': dataset_title,
        'agency': dataset_agency,
        'url': dataset_urls,
    }
    return dataset_url_dict
def build_catalog_urls_list(file_list):
    """Populate the module-level url_df and catalog_date_urls from catalogs.

    Args:
        file_list: iterable of catalog file names; each name must embed a
            '%Y-%m-%d' date extractable by tools.parse_date().

    Side effects:
        url_df (global): pandas DataFrame, one row per dataset URL, columns
            ['date', 'id', 'agency', 'url', 'url_index', 'url_status'];
            'url_status' starts empty (filled elsewhere).
        catalog_date_urls (global): maps each file's date string to that
            catalog's list of dataset URL dicts.
    """
    agency_lookup = tools.load_agency_lookup()
    global url_df
    global catalog_date_urls
    catalog_date_urls = {}
    rows = []
    for file_name in file_list:
        json_catalog = tools.load_file_json(file_name)
        catalog_urls = get_catalog_urls(json_catalog, agency_lookup)
        file_date_str = tools.parse_date(file_name)
        file_date = datetime.strptime(file_date_str, '%Y-%m-%d')
        catalog_date_urls[file_date_str] = catalog_urls  # Append file just processed
        for dataset_urls in catalog_urls:
            for url_index, dataset_url in enumerate(dataset_urls['url']):
                rows.append([
                    file_date,
                    dataset_urls['id'],
                    dataset_urls['agency'],
                    dataset_url,
                    url_index,
                    '',  # 'url_status'
                ])
    # Build the frame once from the accumulated rows: growing a DataFrame
    # via `.loc[row_index] = ...` is quadratic in pandas. NOTE(review):
    # pandas may now infer real dtypes (datetime64 for 'date', int64 for
    # 'url_index') instead of object columns — confirm downstream code
    # doesn't rely on object dtype.
    url_df = pd.DataFrame(
        rows, columns=['date', 'id', 'agency', 'url', 'url_index', 'url_status'])
def build_catalog_urls_list(file_list):
    """Rebuild the module-level url_df and catalog_date_urls globals.

    Args:
        file_list: iterable of catalog file names, each containing a
            '%Y-%m-%d' date that tools.parse_date() can extract.

    Side effects:
        url_df (global): pandas DataFrame with columns
            ['date', 'id', 'agency', 'url', 'url_index', 'url_status'],
            one row per (dataset, url) pair; 'url_status' left blank.
        catalog_date_urls (global): date string -> list of dataset URL dicts.
    """
    agency_lookup = tools.load_agency_lookup()
    global url_df
    global catalog_date_urls
    catalog_date_urls = {}
    records = []
    for file_name in file_list:
        json_catalog = tools.load_file_json(file_name)
        catalog_urls = get_catalog_urls(json_catalog, agency_lookup)
        file_date_str = tools.parse_date(file_name)
        file_date = datetime.strptime(file_date_str, '%Y-%m-%d')
        catalog_date_urls[file_date_str] = catalog_urls  # Append file just processed
        for dataset_urls in catalog_urls:
            for url_index, dataset_url in enumerate(dataset_urls['url']):
                records.append([
                    file_date,
                    dataset_urls['id'],
                    dataset_urls['agency'],
                    dataset_url,
                    url_index,
                    '',  # 'url_status'
                ])
    # Single DataFrame construction replaces per-row `.loc` enlargement,
    # which is O(n^2). NOTE(review): column dtypes may be inferred
    # (datetime64/int64) rather than object — verify consumers of url_df.
    url_df = pd.DataFrame(
        records,
        columns=['date', 'id', 'agency', 'url', 'url_index', 'url_status'])