def get_series(name=None):
    """Build one per-publisher score frame for every dated RAG CSV.

    Looks one directory level below the 'rag' output path for CSV files
    whose name ends in ``_<name>.csv`` (``_all.csv`` when `name` is not
    given).  Each file is reduced to the mean 'weighted score ratio' per
    publisher, rounded to two decimals and then scaled by 100.

    PARAMETERS
    - name: optional scraper name used to select the CSV files.

    RETURNS
    A list with one DataFrame per CSV file (columns 'publisher',
    'weighted score ratio' and 'date'), or None when no file matches.
    The 'date' column is parsed from the name of the folder holding the
    CSV, which must follow the '%Y-%m-%d' format.
    """
    # local import: only needed here for separator-agnostic path handling
    import os

    out_dir = h.get_output_path('rag')
    csv_paths = glob(out_dir + f'/*/*_{name or "all"}.csv')
    if not csv_paths:
        return None

    frames = []
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)
        # presumably normalises publisher names; see dashboard_helpers
        frame = dashboard_helpers.mapped_publisher_name(frame)
        frame = frame.groupby(
            'publisher', as_index=False)['weighted score ratio'].mean()
        frame = frame.round({'weighted score ratio': 2})
        # keep the original order (round first, then scale) so the
        # produced values stay exactly what consumers already receive
        frame['weighted score ratio'] = frame['weighted score ratio'] * 100

        # glob may return OS-native separators, so do not split on '/'
        dated_folder = os.path.basename(os.path.dirname(csv_path))
        frame['date'] = datetime.datetime.strptime(dated_folder, '%Y-%m-%d')
        frames.append(frame)
    return frames
def get_df_series(name=None):
    """Load every dated RAG CSV as-is and tag it with its folder name.

    Looks one directory level below the 'rag' output path for CSV files
    whose name ends in ``_<name>.csv`` (``_all.csv`` when `name` is not
    given).

    PARAMETERS
    - name: optional scraper name used to select the CSV files.

    RETURNS
    A list with one DataFrame per CSV file; each gets a 'date' column
    holding the (unparsed, string) name of the folder the file lives in.
    The list is empty when no file matches.
    """
    # local import: only needed here for separator-agnostic path handling
    import os

    out_dir = h.get_output_path('rag')
    frames = []
    for csv_path in glob(out_dir + f'/*/*_{name or "all"}.csv'):
        frame = pd.read_csv(csv_path)
        # glob may return OS-native separators, so do not split on '/'
        frame['date'] = os.path.basename(os.path.dirname(csv_path))
        frames.append(frame)
    return frames
def transform(name, input_file=None):
    """Transform scraped dataset files into a single data.json catalog.

    PARAMETERS
    - name: scraper name; used to locate the scraped files, to build the
      catalog id and to name the output file.  A falsy name is labelled
      'all' in the catalog id and the output file name.
    - input_file: optional path to a text file listing, one per line,
      the dataset files to transform.  When it cannot be read, all
      files returned by `traverse_output(name)` are used instead.

    The catalog is written to '<label>.data.json' in the 'datajson'
    output path and handed to `h.upload_to_s3_if_configured`.
    """
    label = name or 'all'

    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            # best-effort fallback, but never swallow KeyboardInterrupt
            # or SystemExit the way a bare `except:` would
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + label

    datasets_number = 0
    resources_number = 0

    for file_path in file_list:
        data = read_file(file_path)
        if not data:
            # nothing could be read from this file; skip it
            continue

        dataset = _transform_scraped_dataset(data, name)
        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{label}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
        logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, f'{label}.data.json')
import json
from pathlib import Path
import re
from collections import Counter
import hashlib

from edscrapers.cli import logger
from edscrapers.transformers.base import helpers as h
from edscrapers.scrapers.base.graph import GraphWrapper
import igraph

# NOTE(review): `os` and `sys` are used below but are not imported in the
# visible lines - confirm they are imported above this point.
OUTPUT_DIR = os.getenv('ED_OUTPUT_PATH') # get the output directory

# get this transformer's output directory
CURRENT_TRANSFORMER_OUTPUT_DIR = h.get_output_path('collections')


def transform(name=None, input_file=None):
    """ function is responsible for transforming raw datasets into Collections

    PARAMETERS
    - name: the scraper/office name. When it is missing or empty an error
      is logged and the process exits with status 1.
    - input_file: not referenced in the visible part of this function.

    The graph is loaded from '<ED_OUTPUT_PATH>/graphs/<name>' using `name`
    as the file stem.
    """

    if not name: # user has not provided a scraper name to get collections with
        logger.error(
            'Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)

    # load the Graph representing the scraped datasets
    # NOTE(review): OUTPUT_DIR is None when ED_OUTPUT_PATH is unset, which
    # would make Path(...) raise - confirm the variable is always set.
    GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                            file_stem_name=name)
import os
import sys
import json
import hashlib
from pathlib import Path
from collections import Counter
import re

from edscrapers.cli import logger
from edscrapers.transformers.base import helpers as h
from edscrapers.scrapers.base.graph import GraphWrapper

OUTPUT_DIR = os.getenv('ED_OUTPUT_PATH') # get the output directory

# get this transformer's output directory
CURRENT_TRANSFORMER_OUTPUT_DIR = h.get_output_path('sources')


def transform(name=None, input_file=None):
    """ function is responsible for transforming raw datasets into Sources

    PARAMETERS
    - name: the scraper/office name. When it is missing or empty an error
      is logged and the process exits with status 1.
    - input_file: not referenced in the visible part of this function.

    The graph is loaded from '<ED_OUTPUT_PATH>/graphs/<name>' using
    '<name>.collections' as the file stem.
    """

    if not name: # user has not provided a scraper name to get sources with
        # NOTE(review): the message below says "collections" although this
        # is the Sources transformer - looks like a copy/paste leftover;
        # confirm before changing the runtime string.
        logger.error(
            'Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)

    # load the Graph representing the scraped datasets
    # NOTE(review): OUTPUT_DIR is None when ED_OUTPUT_PATH is unset, which
    # would make Path(...) raise - confirm the variable is always set.
    GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                            file_stem_name=f'{name}.collections')
def _rag_read_file_listing(input_file, fallback):
    """ return the paths listed one-per-line in `input_file`, or the result
    of calling `fallback()` (after logging a warning) when it cannot be read
    """
    try:
        # open() is inside the try so a missing/unreadable list file
        # triggers the fallback instead of aborting the transform
        with open(input_file, 'r') as fp:
            return [line.rstrip() for line in fp]
    except Exception:
        logger.warning(
            f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
        )
        return fallback()


def _rag_domain(source_url):
    """ return the hostname of `source_url` with 'www2.'/'www.' removed,
    or None when the url is missing or has no hostname
    """
    if not source_url:
        return None
    hostname = urllib.parse.urlparse(source_url).hostname
    if hostname is None:
        return None
    return hostname.replace('www2.', 'www.').replace('www.', '')


def transform(name=None, input_file=None,
              use_raw_datasets=False) -> pd.DataFrame:
    """ function transforms the datajson/datasets into
    a dataframe/csv containing data to be used for RAG analyses on
    the efficacy of the scraping toolkit to get viable/usable structured data
    from the unstructured data source.

    The function by default operates on/utilises datajson i.e.
    the json that is ready to be ingested by the ckan harvester;
    However, setting 'use_raw_datasets' to True means the function will
    operate on the raw, parsed data which was scraped from the data source.

    PARAMETERS
    - name: if provided must correspond to the name of a scraper.
    if 'use_raw_datasets' is False, file with the format '<name>.data.json'
    will be located in the datajson subdirectory of
    'ED_OUTPUT_PATH/transformers' and read.
    if 'use_raw_datasets' is True, dataset files contained in the 'name'
    scrapers subdirectory of the 'ED_OUTPUT_PATH/scrapers' will be read

    - input_file: if provided must be a file with a list of datajson or
    dataset files to read (one path per line). When the list cannot be
    read, a warning is logged and all collected files are used instead.

    - use_raw_datasets: when truthy, operate on the raw scraped datasets
    instead of the processed datajson.

    If no parameters are provided, which is the default behaviour;
    then all datajson files contained in datajson subdirectory of
    'ED_OUTPUT_PATH/transformers' will be read.

    function returns the DataFrame containing the transformed
    datajson/dataset files. The same data is written as csv to the 'rag'
    output directory and to a dated subdirectory of it, and the csv is
    handed to the S3 upload helper.
    """

    def all_datajson_files():
        # every datajson file produced by the datajson transformer
        return list(Path(h.get_output_path('datajson')).glob('*.json'))

    # 1. work out which files hold the datajson/datasets
    if use_raw_datasets:  # work on raw datasets
        if input_file:
            file_list = _rag_read_file_listing(
                input_file, lambda: h.traverse_output(None))
        else:
            # a falsy name means "every scraper"
            file_list = h.traverse_output(name or None)
    else:  # work with processed/transformed datajson
        if input_file:
            file_list = _rag_read_file_listing(input_file, all_datajson_files)
        elif name:
            file_list = [
                Path(h.get_output_path('datajson'), f'{name}.data.json')
            ]
        else:
            file_list = all_datajson_files()

    # 2. read the files and compute the weighted score of each dataset
    datasets_list = []  # holds the scored datasets
    for file_path in file_list:
        data = h.read_file(file_path)
        if not data:
            # nothing could be read from this file; skip it
            continue
        if use_raw_datasets:
            compute_score(data, append_score=True, use_raw_datasets=True)
            datasets_list.append(data)
        else:
            for dataset_dict in data.get('dataset', []):
                compute_score(dataset_dict,
                              append_score=True,
                              use_raw_datasets=False)
                datasets_list.append(dataset_dict)

    # 3. map the datasets to rows (domain is derived from the source url)
    rows = []
    for dataset in datasets_list:
        if use_raw_datasets:
            publisher = dataset.get('publisher')
            source_url = dataset.get('source_url')
        else:
            # tolerate a dataset without publisher info
            publisher = (dataset.get('publisher') or {}).get('name')
            source_url = dataset.get('scraped_from')
        rows.append([
            _rag_domain(source_url), publisher, source_url,
            dataset.get('_weighted_score'),
            dataset.get('_weighted_score_ratio')
        ])

    weighted_datasets_scores_df = pd.DataFrame(rows,
                                               columns=[
                                                   'domain', 'publisher',
                                                   'source url',
                                                   'weighted score',
                                                   'weighted score ratio'
                                               ])

    # 4. write the csv files
    # take a single timestamp so the folder name cannot straddle midnight;
    # the (unpadded) folder name format is kept as-is for existing readers
    today = dt.now()
    output_dated_dir = os.path.join(
        OUTPUT_DIR, f'{today.year}-{today.month}-{today.day}')
    Path(output_dated_dir).mkdir(parents=True, exist_ok=True)

    suffix = '_raw' if use_raw_datasets else ''
    output_filename = f'datasets_weighted_scores_{name or "all"}{suffix}.csv'

    # create the fullpath where the file will be written
    fullpath = os.path.join(OUTPUT_DIR, output_filename)

    # write the dataframe to csv (latest copy + dated copy)
    weighted_datasets_scores_df.to_csv(fullpath, index=False)
    weighted_datasets_scores_df.to_csv(os.path.join(output_dated_dir,
                                                    output_filename),
                                       index=False)
    # write the csv to S3 bucket
    h.upload_to_s3_if_configured(fullpath, f'{output_filename}')

    return weighted_datasets_scores_df
""" module computes the weighted scores of data.json files and transforms the resources into a csv file with name 'datasets_weighted_scores_{yyyy_mm_dd}.csv' """ import urllib.parse import os import pandas as pd from datetime import datetime as dt from pathlib import Path import edscrapers.transformers.base.helpers as h from edscrapers.cli import logger from edscrapers.transformers.rag import DATASET_WEIGHTING_SYS, TOTAL_WEIGHT # import weighting system & total weight # get the output directory OUTPUT_DIR = h.get_output_path('rag') def transform(name=None, input_file=None, use_raw_datasets=False) -> pd.DataFrame: """ function transforms the datajson/datasets into a dataframe/csv containig data to be used for RAG analyses on the efficacy of the scraping toolkit to get viable/usable structured data from the unstructured data source. The function by default operates on/utilises datajson i.e. the json that is ready to be ingested by the ckan harvester; However, setting 'use_raw_datasets' to True means the function will operate on the raw, parsed data which was scraped from the data source.
def transform(name=None, input_file=None):
    """ function dumps the datasets of the processed datajson files into
    an excel workbook ('datasets.xlsx' in this transformer's output
    directory), one sheet per datajson file.

    PARAMETERS
    - name: if provided, only '<name>.data.json' from the datajson output
    directory is read and the sheet is named `name`. Otherwise every
    '*.json' file in that directory is read and each sheet is named after
    the upper-cased part of the file name before the first dot.

    - input_file: accepted for interface parity with the other
    transformers; it is not used by this function.
    """
    logger.debug(f"Excel transform requested for: {name}")

    # the sheet columns, in output order
    columns = [
        'title', 'description', 'tags', 'modified', 'publisher',
        'source_url', 'data_steward_email', 'name', 'access_level',
        'bureauCode', 'programCode', 'license', 'spatial', 'categories',
        'level_of_data'
    ]

    datajson_dir = Path(h.get_output_path('datajson'))
    if name:  # name of processed datajson is provided so get the file path
        file_list = [datajson_dir / f'{name}.data.json']
    else:  # name of processed datajson not provided
        file_list = list(datajson_dir.glob('*.json'))

    # read the contents in the file_list
    for file_path in file_list:
        sheet_name = name if name else file_path.name.split('.')[0].upper()

        # read json from file using helper function
        data = h.read_file(file_path)
        if not data:
            logger.warning(f"No data could be read from {file_path}")
            continue

        # collect plain row dicts and build the frame once, instead of
        # growing a DataFrame row by row
        rows = []
        for dd in data.get('dataset', []):
            rows.append({
                'name': dd.get('identifier', None),
                'title': dd.get('title', None),
                'description': dd.get('description', None),
                'tags': ', '.join(dd['keyword']),
                'modified': dd.get('modified', None),
                'publisher': dd['publisher']['name'],
                'source_url': dd['scraped_from'],
                'data_steward_email': dd['contactPoint']['hasEmail'],
                'access_level': dd.get('accessLevel', None),
                'bureauCode': ', '.join(dd.get('bureauCode', [])),
                'programCode': ', '.join(dd.get('programCode', [])),
                'license': dd.get('license', None),
                'spatial': dd.get('spatial', None),
                'categories': ', '.join(dd.get('theme', [])),
                'level_of_data': ', '.join(dd.get('levelOfData', [])),
            })
            logger.debug(
                f"Dumping data for [{sheet_name}] {dd.get('identifier')}")

        df = pd.DataFrame(rows, columns=columns)

        logger.debug(f"Dumping data for {file_path}")
        _add_to_spreadsheet(os.path.join(OUTPUT_DIR, 'datasets.xlsx'),
                            sheet_name, df)
import urllib.parse
import os
import pandas as pd
from datetime import datetime as dt
from pathlib import Path

import edscrapers.transformers.base.helpers as h
from edscrapers.cli import logger

# get the output directory
OUTPUT_DIR = h.get_output_path('excel')


def _add_to_spreadsheet(file_path, sheet_name, result):
    """ write `result` (a dataframe) to the sheet `sheet_name` of the excel
    workbook at `file_path`; the workbook is opened in append mode when the
    file already exists and in write mode otherwise
    """
    # append to an existing workbook, otherwise start a new one
    writer_mode = 'a' if os.path.exists(file_path) else 'w'

    with pd.ExcelWriter(file_path, engine="openpyxl",
                        mode=writer_mode) as writer:
        result.to_excel(writer,
                        sheet_name=sheet_name,
                        index=False,
                        engine='openpyxl')


def transform(name=None, input_file=None):
    print(name)
    file_list = [] # holds the list of files which contain datajson/dataset
def _load_preprocessed_items(kind, label, converter, field):
    """Read '<label>.<kind>.json' from the `kind` transformer output and
    convert it with `converter`.

    Returns the converted list, or an empty list (after logging a
    warning that the catalog will have no `field` field) when the file
    cannot be read or converted.
    """
    file_name = f'{label}.{kind}.json'
    try:
        # read the list of preprocessed (but still 'raw') items from file
        raw_items = read_file(f"{h.get_output_path(kind)}/{file_name}")
        # convert them to objects acceptable for the catalog object
        return converter(raw_items) or []
    except Exception:
        # returning [] here guarantees that half-processed raw items are
        # never attached to the catalog when the conversion step fails
        logger.warning(
            f'"{kind} transformer" output file ({file_name}) not found. This datajson output will have no "{field}" field'
        )
        return []


def transform(name, input_file=None):
    """Transform scraped dataset files into a single data.json catalog.

    PARAMETERS
    - name: scraper name; used to locate the scraped files and the
      preprocessed Sources/Collections, to build the catalog id and to
      name the output file.  A falsy name is labelled 'all'.
    - input_file: optional path to a text file listing, one per line,
      the dataset files to transform.  When it cannot be read, all
      files returned by `traverse_output(name)` are used instead.

    The Sources and Collections produced by their own transformers are
    attached to the catalog when available.  If catalog validation
    fails, an error is logged and nothing is written.  Otherwise the
    catalog is written to '<label>.data.json' in the 'datajson' output
    path and handed to `h.upload_to_s3_if_configured`.
    """
    label = name or 'all'

    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            # best-effort fallback, but never swallow KeyboardInterrupt
            # or SystemExit the way a bare `except:` would
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + label

    # keep track/stats for items transformed
    datasets_number = 0
    resources_number = 0

    # loop through the list of filepaths to be transformed
    for file_path in file_list:
        data = read_file(file_path)
        if not data:
            continue

        dataset = _transform_scraped_dataset(data, name)
        if not dataset:
            # no dataset was returned (i.e. dataset probably marked for removal)
            continue

        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    # get the list of Sources for this catalog
    catalog_sources = _load_preprocessed_items(
        'sources', label, _transform_preprocessed_sources, 'source')
    catalog.sources = catalog_sources
    sources_number = len(catalog_sources)

    # get the list of Collections for this catalog
    catalog_collections = _load_preprocessed_items(
        'collections', label, _transform_preprocessed_collections,
        'collection')
    catalog.collections = catalog_collections
    collections_number = len(catalog_collections)

    # validate the catalog object
    if not catalog.validate_catalog(pls_fix=True):
        logger.error("catalog validation Failed! Ending transform process")
        return

    logger.debug('{} Sources transformed.'.format(sources_number))
    logger.debug('{} Collections transformed.'.format(collections_number))
    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{label}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
        logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, f'{label}.data.json')