def _generate_scraper_outputs_df(self, use_dump=False):
    def abs_url(url, source_url):
        # resolve relative links against the page they were scraped from
        if url.startswith(('../', './', '/')) or not urllib.parse.urlparse(url).scheme:
            full_url = urllib.parse.urljoin(source_url, url)
            return full_url
        else:
            return url

    if self.deduplicated_list_path is None:
        files = traverse_output()
    else:
        try:
            with open(self.deduplicated_list_path, 'r') as fp:
                files = [pathlib.Path(line.rstrip()) for line in fp]
        except Exception:
            files = traverse_output()

    df_dump = str(pathlib.Path(os.path.join(os.getenv('ED_OUTPUT_PATH'), 'out_df.csv')))

    if use_dump:
        df = pd.read_csv(df_dump)
    else:
        dfs = []
        for fp in files:
            with open(fp, 'r') as json_file:
                try:
                    j = json.load(json_file)
                    # if it's marked for removal by the sanitizer, skip it
                    if j.get('_clean_data', dict()).get('_remove_dataset'):
                        logger.debug(f"Ignoring {j.get('source_url')}")
                        continue
                    j = [{
                        'url': abs_url(r['url'], r['source_url']),
                        'source_url': r['source_url'],
                        'publisher': str(j['publisher']),
                        'size': r.get('headers', dict()).get('content-length', 0),
                        'scraper': fp.parent.name
                    } for r in j['resources'] if r['source_url'].find('/print/') == -1]
                    dfs.append(pd.read_json(json.dumps(j)))
                except Exception as e:
                    logger.warning(f'Could not parse file {fp} as JSON! {e}')
        df = pd.concat(dfs, ignore_index=True)
        df.to_csv(df_dump, index=False)

    return df
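# The snippet below is a minimal, standalone sketch of the relative-URL resolution that
# abs_url above performs, using only the standard library; the example URLs are
# hypothetical and only illustrate urllib.parse.urljoin's behaviour.
import urllib.parse

def resolve_resource_url(url, source_url):
    # join relative links against the page they were scraped from; leave absolute URLs untouched
    if url.startswith(('../', './', '/')) or not urllib.parse.urlparse(url).scheme:
        return urllib.parse.urljoin(source_url, url)
    return url

assert resolve_resource_url(
    '../files/report.pdf',
    'https://www2.ed.gov/about/reports/index.html') == 'https://www2.ed.gov/about/files/report.pdf'
assert resolve_resource_url(
    'https://example.gov/data.csv',
    'https://www2.ed.gov/index.html') == 'https://example.gov/data.csv'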
def _make_list(self, key):
    for f in self.file_list:
        with open(f, 'r') as fp:
            try:
                j = json.loads(fp.read())
            except Exception as e:
                logger.warning(f'Failed to parse file {f} as JSON! {e}')
                continue
            if '/print/' in j.get(key):
                continue
            # In order to deduplicate with dicts, we need to normalize all keys
            self.urls_dict[self._normalize_url(j.get(key)) + '_' + j.get('name')] = str(f)
def transform(name=None, input_file=None):
    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = h.traverse_output(name)

    # loop through every file path in the file list
    for file_path in file_list:
        # read the json data from each file path
        data = h.read_file(file_path)
        if not data:  # skip empty/unreadable files
            continue

        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data,
                             search_words=['conference', 'awards', 'user guide', 'applications'])

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(data, search_words=['photo', 'foto', 'photos', 'fotos'])

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from the beginning of the dataset title
        data = _strip_unwanted_string(data, r'^table [0-9a-z]+(-?[a-z])?\.', dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)

        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)

        # write the modified dataset back to file
        h.write_file(file_path, data)
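# Illustrative sketch (not part of the transformer) of what the title-prefix regex passed to
# _strip_unwanted_string above matches; the example title is hypothetical and the real helper
# lives elsewhere, so this only demonstrates the pattern itself.
import re

TABLE_PREFIX_PATTERN = r'^table [0-9a-z]+(-?[a-z])?\.'
assert re.sub(TABLE_PREFIX_PATTERN, '', 'table 204-a. enrollment by state').strip() == 'enrollment by state'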
def _generate_scraper_outputs_df(self, use_dump=False):
    def abs_url(url, source_url):
        # resolve relative links against the page they were scraped from
        if url.startswith(('../', './', '/')) or not urllib.parse.urlparse(url).scheme:
            full_url = urllib.parse.urljoin(source_url, url)
            return full_url
        else:
            return url

    if self.deduplicated_list_path is None:
        files = traverse_output()
    else:
        try:
            with open(self.deduplicated_list_path, 'r') as fp:
                files = [pathlib.Path(line.rstrip()) for line in fp]
        except Exception:
            files = traverse_output()

    df_dump = str(pathlib.Path(os.path.join(os.getenv('ED_OUTPUT_PATH'), 'out_df.csv')))

    if use_dump:
        df = pd.read_csv(df_dump)
    else:
        dfs = []
        for fp in files:
            # TODO refactor these rules or the files structure
            if 'data.json' in str(fp):
                continue
            with open(fp, 'r') as json_file:
                try:
                    j = json.load(json_file)
                    j = [{
                        'url': abs_url(r['url'], r['source_url']),
                        'source_url': r['source_url'],
                        'scraper': fp.parent.name
                    } for r in j['resources'] if r['source_url'].find('/print/') == -1]
                    dfs.append(pd.read_json(json.dumps(j)))
                except Exception:
                    logger.warning(f'Could not parse file {fp} as JSON!')
        df = pd.concat(dfs, ignore_index=True)
        df.to_csv(df_dump, index=False)

    return df
def transform(name=None, input_file=None):
    """Transform raw datasets into Sources."""

    if input_file is None:  # no input file specified
        # run through all the files in the 'name' scraper directory
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = h.traverse_output(name)

    # holds the list of Sources acquired from the 'name' scraper directory
    sources_list = []

    # loop through every file path in the file list
    for file_path in file_list:
        # read the json data from each file path
        data = h.read_file(file_path)
        if not data:  # skip empty/unreadable files
            continue

        # retrieve the Source from the dataset
        source = extract_source_from(dataset=data, use_key='collection')
        if not source:  # source could not be retrieved
            continue
        # add the Source to the list
        sources_list.append(source)

    # get a list of non-duplicate Sources
    sources_list = get_distinct_sources_from(sources_list, min_occurence_counter=2)
    # get the path where the collected Sources will be saved on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.sources.json'
    # write the Sources gathered from the 'name' scraper output to file
    h.write_file(file_output_path, sources_list)
    # upload the Sources gathered from the 'name' scraper output to the S3 bucket
    h.upload_to_s3_if_configured(file_output_path, f'{(name or "all")}.sources.json')
def list_datasets_per_scraper(self, ordered=True):
    """Generate the dataset count per scraper.

    PARAMETERS
    - ordered: whether the resulting DataFrame or Excel sheet should be
      sorted/ordered. If True, order by 'dataset count'.
    """
    filenames = []
    try:
        with open(self.deduplicated_list_path, 'r') as fp:
            filenames = fp.readlines()
    except Exception:
        logger.warning(
            'Warning! Cannot read deduplication results. Please run the deduplicate transformer first')
        filenames = traverse_output()

    # count datasets per scraper; the scraper name is the parent directory of each output file
    scraper_counts = {}
    for filename in filenames:
        scraper_name = str(filename).rstrip().split('/')[-2]
        scraper_counts[scraper_name] = scraper_counts.get(scraper_name, 0) + 1

    df = pd.DataFrame(columns=['scraper', 'dataset count'])
    df['scraper'] = list(scraper_counts.keys())
    df['dataset count'] = list(scraper_counts.values())

    if ordered:
        df.sort_values(by='dataset count',
                       axis='index',
                       ascending=False,
                       inplace=True,
                       ignore_index=True)

    self._add_to_spreadsheet(sheet_name='DATASET COUNT PER SCRAPER', result=df)
    return df
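# Quick sketch of how list_datasets_per_scraper derives the scraper name from an output path:
# it is the immediate parent directory of the dataset JSON file. The path below is hypothetical.
line = '/data/ed_output/scrapers/nces/abcd1234.json\n'
assert str(line).rstrip().split('/')[-2] == 'nces'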
def transform(name=None, input_file=None, use_raw_datasets=False) -> pd.DataFrame:
    """
    Transform the datajson/datasets into a DataFrame/csv containing data to be used for
    RAG analyses on the efficacy of the scraping toolkit, i.e. how well it extracts
    viable/usable structured data from the unstructured data source.

    By default the function operates on the datajson, i.e. the json that is ready to be
    ingested by the ckan harvester. Setting 'use_raw_datasets' to True means the function
    will operate on the raw, parsed data that was scraped from the data source.

    PARAMETERS
    - name: if provided, must correspond to the name of a scraper.
      If 'use_raw_datasets' is False, a file with the format '<name>.data.json' will be
      located in the datajson subdirectory of 'ED_OUTPUT_PATH/transformers' and read.
      If 'use_raw_datasets' is True, dataset files contained in the 'name' scraper's
      subdirectory of 'ED_OUTPUT_PATH/scrapers' will be read.
    - input_file: if provided, must be a file with a list of datajson or dataset files to read.

    If no parameters are provided (the default behaviour), all datajson files contained in
    the datajson subdirectory of 'ED_OUTPUT_PATH/transformers' will be read.

    Returns the DataFrame containing the transformed datajson/dataset files.
    """

    file_list = []  # holds the list of files which contain datajson/dataset
    datasets_list = []  # holds the data jsons gotten from files

    if use_raw_datasets:  # work on raw datasets
        if not input_file:  # no input file provided
            # loop over the directory structure
            if name:  # loop over the <name> scraper output e.g. nces
                file_list = h.traverse_output(name)  # list of all <name> files
            else:  # loop over everything
                file_list = h.traverse_output(None)  # list of all JSON files
        else:  # input file provided
            # read input_file, which is a list of files
            with open(input_file, 'r') as fp:
                try:
                    file_list = [line.rstrip() for line in fp]
                except Exception:
                    logger.warning(
                        f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
                    file_list = h.traverse_output(None)
    else:  # work with processed/transformed datajson
        if not input_file:  # no input file provided
            if name:  # name of processed datajson is provided, so get the file path
                file_list.append(Path(h.get_output_path('datajson'), f'{name}.data.json'))
            else:  # name of processed datajson not provided
                file_list.extend(Path(h.get_output_path('datajson')).glob('*.json'))
        else:  # input file provided
            # read input_file, which is a list of files
            with open(input_file, 'r') as fp:
                try:
                    file_list = [line.rstrip() for line in fp]
                except Exception:
                    logger.warning(
                        f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
                    file_list.extend(Path(h.get_output_path('datajson')).glob('*.json'))

    if use_raw_datasets:  # work on raw datasets
        # read the contents in file_list
        for file_path in file_list:
            # read json from the file using the helper
            data = h.read_file(file_path)
            # compute the weighted score of the dataset
            compute_score(data, append_score=True, use_raw_datasets=True)
            datasets_list.append(data)
    else:  # work with processed json data
        # read the contents in the file_list
        for file_path in file_list:
            # read json from file using the helper function
            data = h.read_file(file_path)
            # loop through the datasets contained in data
            for dataset_dict in data.get('dataset', []):
                # compute the weighted score of the dataset
                compute_score(dataset_dict, append_score=True, use_raw_datasets=False)
                datasets_list.append(dataset_dict)

    if use_raw_datasets:  # work on raw datasets
        # map the datasets to pandas rows
        dataset_rows_list = map(lambda dataset: [dataset.get('publisher'),
                                                 dataset.get('source_url'),
                                                 dataset.get('_weighted_score'),
                                                 dataset.get('_weighted_score_ratio')],
                                datasets_list)
    else:  # work on processed datajson
        # map the datasets to pandas rows
        dataset_rows_list = map(lambda dataset: [dataset.get('publisher')['name'],
                                                 dataset.get('scraped_from'),
                                                 dataset.get('_weighted_score'),
                                                 dataset.get('_weighted_score_ratio')],
                                datasets_list)

    # create the pandas df
    weighted_datasets_scores_df = pd.DataFrame(dataset_rows_list,
                                               columns=['publisher',
                                                        'source url',
                                                        'weighted score',
                                                        'weighted score ratio'])

    # create a df that incorporates domain info
    weighted_datasets_scores_df2 = pd.DataFrame(columns=['domain'])
    weighted_datasets_scores_df2['domain'] = weighted_datasets_scores_df.apply(
        lambda row: urllib.parse.urlparse(row['source url']).hostname.
        replace('www2.', 'www.').replace('www.', ''), axis=1)
    weighted_datasets_scores_df2['publisher'] = weighted_datasets_scores_df['publisher']
    weighted_datasets_scores_df2['source url'] = weighted_datasets_scores_df['source url']
    weighted_datasets_scores_df2['weighted score'] = weighted_datasets_scores_df['weighted score']
    weighted_datasets_scores_df2['weighted score ratio'] = weighted_datasets_scores_df['weighted score ratio']

    # create the dated output directory
    output_dated_dir = os.path.join(
        OUTPUT_DIR, f'{dt.now().year}-{dt.now().month}-{dt.now().day}')
    Path(output_dated_dir).mkdir(parents=True, exist_ok=True)

    # create the output csv file name
    if use_raw_datasets:  # use raw datasets
        output_filename = "datasets_weighted_scores_{}_raw.csv".format(name or "all")
    else:  # use processed datajson
        output_filename = "datasets_weighted_scores_{}.csv".format(name or "all")

    # create the full path where the file will be written
    fullpath = os.path.join(OUTPUT_DIR, output_filename)
    # write the dataframe to csv
    weighted_datasets_scores_df2.to_csv(fullpath, index=False)
    weighted_datasets_scores_df2.to_csv(os.path.join(output_dated_dir, output_filename),
                                        index=False)
    # upload the csv to the S3 bucket
    h.upload_to_s3_if_configured(fullpath, f'{output_filename}')

    return weighted_datasets_scores_df2
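# Standalone sketch of the row-wise domain normalisation applied above: take the hostname of
# the source URL and strip a leading 'www2.'/'www.' prefix. The example URL is hypothetical.
import urllib.parse

def normalize_domain(source_url):
    hostname = urllib.parse.urlparse(source_url).hostname
    return hostname.replace('www2.', 'www.').replace('www.', '')

assert normalize_domain('https://www2.ed.gov/rschstat/catalog/index.html') == 'ed.gov'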
def transform(name=None, input_file=None):
    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = h.traverse_output(name)

    # loop through every file path in the file list
    for file_path in file_list:
        # read the json data from each file path
        data = h.read_file(file_path)
        if not data:  # skip empty/unreadable files
            continue

        # skip datasets that only have txt resources
        if _dataset_only_has_txt_resources(data):
            clean_data = {}
            clean_data['_remove_dataset'] = True  # mark dataset for removal
            data['_clean_data'] = clean_data  # update dataset

        # Remove datasets with no resources or no relevant resources
        if not len(_filter_resources_list(data['resources'])) or not len(data['resources']):
            clean_data = {}
            clean_data['_remove_dataset'] = True  # mark dataset for removal
            data['_clean_data'] = clean_data  # update dataset

        # Special hacks for ed.gov data
        if name == 'edgov':
            clean_data = {}
            clean_data['_remove_dataset'] = False  # unmark dataset for removal

            # # Get the publisher name
            # try:
            #     publisher_name = data['publisher'].get('name')
            # except:
            #     publisher_name = data['publisher']

            # Check for "bad" URLs and remove them
            bad_subdomains = ['dashboard', 'rems']
            if any([f'{bs}.ed.gov' in data['source_url'] for bs in bad_subdomains]):
                clean_data['_remove_dataset'] = True  # mark dataset for removal

            data['_clean_data'] = clean_data  # update dataset

        # OESE hack. Remove datasets outside the oese.ed.gov domain
        publisher = data.get('publisher')
        publisher_name = ""
        if type(publisher) == dict:
            publisher_name = publisher.get('name')
        elif type(publisher) == str:
            publisher_name = publisher

        if publisher_name in ['oese',
                              'Office of Elementary and Secondary Education',
                              'Office of Elementary and Secondary Education (OESE)']:
            if _dataset_outside_oese_domain(data):
                clean_data = {}
                clean_data['_remove_dataset'] = True  # mark dataset for removal
                data['_clean_data'] = clean_data  # update dataset

        # Remove duplicate identifiers generated by duplicate URLs in IES/NCES
        if publisher_name in ['ies',
                              'Institute of Education Sciences (IES)',
                              'National Center for Education Statistics (NCES)',
                              'nces']:
            if data.get('source_url').endswith('current=yes'):
                clean_data = data.get('_clean_data', {})
                clean_data['_remove_dataset'] = True  # mark dataset for removal
                data['_clean_data'] = clean_data  # update dataset

        # Filter resources
        data = _filter_dataset_resources(data)

        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data,
                             search_words=['conference', 'awards', 'user guide', 'applications'])

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(data, search_words=['photo', 'foto', 'photos', 'fotos'])

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from the beginning of the dataset title
        data = _strip_unwanted_string(data, r'^table [0-9a-z]+(-?[a-z])?\.', dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)

        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)

        # write modified dataset back to file
        h.write_file(file_path, data)
def transform(name, input_file=None):
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + (name or 'all')

    # keep track/stats of the items transformed
    datasets_number = 0
    resources_number = 0
    sources_number = 0
    collections_number = 0

    # loop through the list of file paths to be transformed
    for file_path in file_list:
        data = read_file(file_path)
        if not data:
            continue

        dataset = _transform_scraped_dataset(data, name)
        if not dataset:  # no dataset was returned (i.e. dataset probably marked for removal)
            continue

        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    # TODO WORK FROM BELOW HERE

    # get the list of Sources for this catalog
    catalog_sources = list()
    try:
        # read the list of preprocessed (but still 'raw') Sources from file
        catalog_sources = read_file(
            f"{h.get_output_path('sources')}/{(name or 'all')}.sources.json")
        # transform the list of preprocessed Sources into a list of Source objects acceptable for the catalog object
        catalog_sources = _transform_preprocessed_sources(catalog_sources)
    except Exception:
        logger.warning(
            f'"sources transformer" output file ({(name or "all")}.sources.json) not found. This datajson output will have no "source" field')

    # add the list of Source objects to the catalog
    catalog.sources = catalog_sources or []
    # update the number of transformed Sources
    sources_number = len(catalog_sources or [])

    # get the list of Collections for this catalog
    catalog_collections = list()
    try:
        # read the list of preprocessed (but still 'raw') Collections from file
        catalog_collections = read_file(
            f"{h.get_output_path('collections')}/{(name or 'all')}.collections.json")
        # transform the list of preprocessed Collections into a list of Collection objects acceptable for the catalog object
        catalog_collections = _transform_preprocessed_collections(catalog_collections)
    except Exception:
        logger.warning(
            f'"collections transformer" output file ({(name or "all")}.collections.json) not found. This datajson output will have no "collection" field')

    # add the list of Collection objects to the catalog
    catalog.collections = catalog_collections or []
    # update the number of transformed Collections
    collections_number = len(catalog_collections or [])

    # validate the catalog object
    if not catalog.validate_catalog(pls_fix=True):
        logger.error("Catalog validation failed! Ending transform process")
        return

    logger.debug('{} Sources transformed.'.format(sources_number))
    logger.debug('{} Collections transformed.'.format(collections_number))
    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{(name or "all")}.data.json')

    with open(file_path, 'w') as output:
        output.write(catalog.dump())

    logger.debug(f'Output file: {file_path}')
    h.upload_to_s3_if_configured(file_path, f'{(name or "all")}.data.json')
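# Hypothetical invocation sketch (assumes the module-level helpers above are importable and
# ED_OUTPUT_PATH is configured); shown only to illustrate how the datajson transform is called:
# transform(name='nces')   # writes <ED_OUTPUT_PATH>/transformers/datajson/nces.data.json
# transform(name=None)     # aggregates every scraper's output into all.data.json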