def get_compare_dict(self):
    json_url = 'https://storage.googleapis.com/storage/v1/b/us-ed-scraping/o/compare-statistics.json?alt=media'
    json_s3_file = os.path.join(os.getenv('ED_OUTPUT_PATH'), 'tools', 'stats', 's3_compare-statistics.json')
    json_local_file = os.path.join(os.getenv('ED_OUTPUT_PATH'), 'statistics.json')

    try:
        # '?alt=media' makes the GCS JSON API return the object content itself
        req = requests.get(json_url)
        req.raise_for_status()
        with open(json_s3_file, 'wb') as json_file:
            json_file.write(req.content)
    except Exception:
        logger.warning('Could not refresh the comparison statistics JSON from the bucket.')

    try:
        # read the downloaded comparison statistics from disk
        with open(json_s3_file) as json_file:
            result = json.load(json_file)
    except (FileNotFoundError, JSONDecodeError):
        try:
            # fall back to the locally generated statistics file
            with open(json_local_file) as json_file:
                result = json.load(json_file)
        except FileNotFoundError:
            logger.error('Comparison statistics JSON not found!')
            raise
    return result
def __init__(self):
    logger.debug("Creating statistics...")

    # remove the Excel sheet if it already exists
    if os.path.exists(self.METRICS_OUTPUT_XLSX):
        os.remove(self.METRICS_OUTPUT_XLSX)

    # read the Datopian csv into a dataframe
    try:
        self.datopian_out_df = pd.read_csv(
            os.path.join(os.getenv('ED_OUTPUT_PATH'), 'out_df.csv'), header=0)
    except Exception:
        logger.error('Could not load the Datopian CSV, please generate it first.')

    # read the AIR csv into a dataframe
    try:
        air_csv_url = 'https://storage.googleapis.com/storage/v1/b/us-ed-scraping/o/AIR.csv?alt=media'
        req = requests.get(air_csv_url)
        air_df_path = pathlib.Path(os.getenv('ED_OUTPUT_PATH'),
                                   'tools', 'stats', 'data', 'air_df.csv')
        # make the required path/directories
        air_df_path.resolve().parent.mkdir(parents=True, exist_ok=True)
        # write the downloaded file to disk
        with open(air_df_path, 'wb') as air_df_file:
            air_df_file.write(req.content)
        self.air_out_df = pd.read_csv(air_df_path, header=0)
    except Exception:
        logger.error('Could not load the AIR CSV.')
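# Usage sketch (illustrative, not from the original module): how the two methods
# above might be exercised. The class name `Statistics` is an assumption made
# only for this example; substitute the real class defined in the stats module.
if __name__ == '__main__':
    stats = Statistics()                     # loads the Datopian and AIR dataframes
    compare_dict = stats.get_compare_dict()  # S3 copy if reachable, local statistics.json otherwise
    print(sorted(compare_dict.keys()))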
def transform(name=None, input_file=None):
    """ function is responsible for transforming raw datasets into Collections """

    if not name:  # user has not provided a scraper name to get collections with
        logger.error('Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)

    try:
        # load the Graph representing the deduplicated scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=f'{name}.deduplicate')
    except Exception:
        # fall back to the Graph representing the (non-deduplicated) scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=name)

    # get the loaded graph
    graph = GraphWrapper.get_graph()

    # identify collections within the graph
    identify_collections_within_graph(graph)
    # link dataset vertices to their appropriate collection(s) within the graph
    link_datasets_to_collections_in_graph(graph)
    # write the identified collections to the raw dataset files
    add_collections_to_raw_datasets(graph=graph, output_dir=OUTPUT_DIR)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.collections')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                             "graphs", f"{name}"),
                                          file_stem_name=f'{name}.collections')

    # create the collections.json file
    collections_list = []  # holds the list of collections acquired from the graph
    with graph.graph_lock:
        for collection in graph.vs.select(is_collection_eq=True, name_ne='base_vertex'):
            collections_list.append({'collection_id': collection['collection_id'],
                                     'collection_title': collection['title'],
                                     'collection_url': collection['name']})

    # get a list of non-duplicate collections
    collections_list = get_distinct_collections_from(collections_list,
                                                     min_occurence_counter=1)
    # get the path where the gotten Collections will be saved to on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.collections.json'
    # write the collections gotten from the 'name' scraper output to file
    h.write_file(file_output_path, collections_list)
    # write the collections gotten from the 'name' scraper output to the S3 bucket
    h.upload_to_s3_if_configured(file_output_path, f'{(name or "all")}.collections.json')
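# Usage sketch (illustrative): running the collections transformer for a single
# scraper/office. The name 'ocr' is an example value only; any scraper whose
# graph has already been written under $ED_OUTPUT_PATH/graphs/<name> will do.
if __name__ == '__main__':
    transform(name='ocr')
    # the result is written to <CURRENT_TRANSFORMER_OUTPUT_DIR>/ocr.collections.json
    # and, if configured, uploaded to S3 under the same file name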
def read_json_file():
    try:
        with open(file_path) as json_file:
            data = json.load(json_file)
    except (OSError, json.JSONDecodeError):
        logger.error('Cannot read statistics.json file!')
        return None
    return data
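# Caller-side sketch (illustrative): because read_json_file() returns None when
# statistics.json is missing or malformed, callers can guard before indexing
# into the result. `stats` below is a local example variable.
stats = read_json_file()
if stats is None:
    raise SystemExit('statistics.json is unavailable')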
def parse(res):
    """ function parses content to create a dataset model or return None if no resource in content """

    # ensure that the response text gotten is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    # check if the content contains links to any of the resource extensions
    if soup_parser.body.find(name='a', href=base_parser.resource_checker,
                             recursive=True) is None:
        # no resource on this page, so return None
        return None

    # if code gets here, at least one resource was found

    # check if the parser is working on an OSERS web page
    if soup_parser.body.find(name='div', id='maincontent', recursive=True) is not None:
        # parse the page with the matching parser and return the result
        return parsers.parser1.parse(res)

    # check if the parser is working on an OCTAE web page (variant 2)
    if soup_parser.body.select_one('.headersLevel1') is not None:
        # parse the page with the matching parser and return the result
        return parsers.parser2.parse(res)
    else:
        logger.error('Page does not fit any known structure:')
        logger.error(res)
        return None
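# Usage sketch (illustrative): feeding a fetched page into parse(). The URL is
# an example only; in the actual pipeline `res` is the response object handed
# to the crawler callback rather than one fetched ad hoc like this.
if __name__ == '__main__':
    import requests
    res = requests.get('https://www2.ed.gov/about/offices/list/osers/index.html')
    dataset = parse(res)
    print(dataset if dataset is not None else 'no resources found on this page')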
def transform(name, input_file=None):
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, '
                'falling back to all collected data!')
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + (name or 'all')

    # keep track/stats of the items transformed
    datasets_number = 0
    resources_number = 0
    sources_number = 0
    collections_number = 0

    # loop through the list of filepaths to be transformed
    for file_path in file_list:
        data = read_file(file_path)
        if not data:
            continue
        dataset = _transform_scraped_dataset(data, name)
        if not dataset:  # no dataset was returned (i.e. dataset probably marked for removal)
            continue
        catalog.datasets.append(dataset)
        datasets_number += 1
        resources_number += len(dataset.distribution)

    # TODO WORK FROM BELOW HERE

    # get the list of Sources for this catalog
    catalog_sources = list()
    try:
        # read the list of preprocessed (but still 'raw') Sources from file
        catalog_sources = read_file(
            f"{h.get_output_path('sources')}/{(name or 'all')}.sources.json")
        # transform the preprocessed Sources into Source objects acceptable for the catalog object
        catalog_sources = _transform_preprocessed_sources(catalog_sources)
    except Exception:
        logger.warning(
            f'"sources transformer" output file ({(name or "all")}.sources.json) not found. '
            'This datajson output will have no "source" field')

    # add the list of Source objects to the catalog
    catalog.sources = catalog_sources or []
    # update the number of transformed Sources
    sources_number = len(catalog_sources or [])

    # get the list of Collections for this catalog
    catalog_collections = list()
    try:
        # read the list of preprocessed (but still 'raw') Collections from file
        catalog_collections = read_file(
            f"{h.get_output_path('collections')}/{(name or 'all')}.collections.json")
        # transform the preprocessed Collections into Collection objects acceptable for the catalog object
        catalog_collections = _transform_preprocessed_collections(catalog_collections)
    except Exception:
        logger.warning(
            f'"collections transformer" output file ({(name or "all")}.collections.json) not found. '
            'This datajson output will have no "collection" field')

    # add the list of Collection objects to the catalog
    catalog.collections = catalog_collections or []
    # update the number of transformed Collections
    collections_number = len(catalog_collections or [])

    # validate the catalog object
    if not catalog.validate_catalog(pls_fix=True):
        logger.error('Catalog validation failed! Ending transform process.')
        return

    logger.debug('{} Sources transformed.'.format(sources_number))
    logger.debug('{} Collections transformed.'.format(collections_number))
    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{(name or "all")}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
    logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, f'{(name or "all")}.data.json')
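# Usage sketch (illustrative): building the data.json catalog for one scraper.
# 'ocr' and 'ocr_files.txt' are example values only.
if __name__ == '__main__':
    transform(name='ocr')                                # use all collected 'ocr' output
    # transform(name='ocr', input_file='ocr_files.txt')  # or only the files listed in a text file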
import os
import sys
import json

from edscrapers.cli import logger

OUTPUT_PATH = os.getenv('ED_OUTPUT_PATH')

try:
    file_path = os.path.join(OUTPUT_PATH, 'statistics.json')
except TypeError:
    logger.error('ED_OUTPUT_PATH env var not set!')
    sys.exit(1)


def read_json_file():
    try:
        with open(file_path) as json_file:
            data = json.load(json_file)
    except (OSError, json.JSONDecodeError):
        logger.error('Cannot read statistics.json file!')
        return None
    return data


def get_stats():
    return read_json_file()


def get_total_datasets_number():
    data = read_json_file()
    return data['total']['datopian']['datasets']


def get_total_datasets_data():