def __init__(self):
    logger.debug("Creating statistics...")

    if os.path.exists(self.METRICS_OUTPUT_XLSX):  # check if the Excel sheet exists
        os.remove(self.METRICS_OUTPUT_XLSX)  # remove the Excel sheet

    # read the Datopian csv into a dataframe
    try:
        self.datopian_out_df = pd.read_csv(
            os.path.join(os.getenv('ED_OUTPUT_PATH'), 'out_df.csv'), header=0)
    except Exception:
        logger.error('Could not load the Datopian CSV, please generate it first.')

    # read the AIR csv into a dataframe
    try:
        air_csv_url = 'https://storage.googleapis.com/storage/v1/b/us-ed-scraping/o/AIR.csv?alt=media'
        req = requests.get(air_csv_url)
        air_df_path = pathlib.Path(os.getenv('ED_OUTPUT_PATH'),
                                   'tools', 'stats', 'data', 'air_df.csv')
        # create the required directories
        air_df_path.resolve().parent.mkdir(parents=True, exist_ok=True)
        # write the downloaded file to disk
        with open(air_df_path, 'wb') as air_df_file:
            air_df_file.write(req.content)
        self.air_out_df = pd.read_csv(air_df_path, header=0)
    except Exception:
        logger.error('Could not load the AIR CSV.')
def parse(res):
    """Parse the content to create a dataset model, or return None if the content has no resources."""
    if '/print/' in res.url:
        return None
    logger.debug(f'{res.url}')

    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    publisher = Publisher()
    publisher['name'] = 'edgov'
    publisher['subOrganizationOf'] = None

    # check if the content contains any of the resource extensions
    if soup_parser.body.find(name='a', href=base_parser.resource_checker,
                             recursive=True) is None:
        # no resource on this page, so return None
        return None

    # if code gets here, at least one resource was found
    # check if the parser is working on an EDGOV web page
    if soup_parser.body.find(name='div', recursive=True) is not None:
        # parse the page with the parser and return the result
        return parsers.parser1.parse(res, publisher)
    else:
        return None
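# The callable passed as `href=` above (base_parser.resource_checker) is not shown in
# this excerpt. As an illustration only, and not the project's actual implementation:
# BeautifulSoup accepts any callable as an attribute filter, calling it with the href
# value and keeping anchors for which it returns a truthy result. A minimal sketch,
# assuming resources are identified by a list of file extensions:

_RESOURCE_EXTENSIONS = ('.csv', '.xls', '.xlsx', '.zip', '.pdf', '.doc', '.docx')  # assumed list

def _resource_checker_sketch(href):
    """Return True if the link appears to point at a downloadable resource file."""
    return bool(href) and href.lower().endswith(_RESOURCE_EXTENSIONS)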
def _generate_scraper_outputs_df(self, use_dump=False):
    def abs_url(url, source_url):
        # resolve relative URLs against the page they were scraped from
        if url.startswith(('../', './', '/')) or not urllib.parse.urlparse(url).scheme:
            return urllib.parse.urljoin(source_url, url)
        else:
            return url

    if self.deduplicated_list_path is None:
        files = traverse_output()
    else:
        try:
            with open(self.deduplicated_list_path, 'r') as fp:
                files = [pathlib.Path(line.rstrip()) for line in fp]
        except Exception:
            files = traverse_output()

    df_dump = str(pathlib.Path(os.getenv('ED_OUTPUT_PATH'), 'out_df.csv'))

    if use_dump:
        df = pd.read_csv(df_dump)
    else:
        dfs = []
        for fp in files:
            with open(fp, 'r') as json_file:
                try:
                    j = json.load(json_file)
                    # if it's marked for removal by the sanitizer, skip it
                    if j.get('_clean_data', dict()).get('_remove_dataset'):
                        logger.debug(f"Ignoring {j.get('source_url')}")
                        continue
                    j = [{
                        'url': abs_url(r['url'], r['source_url']),
                        'source_url': r['source_url'],
                        'publisher': str(j['publisher']),
                        'size': r.get('headers', dict()).get('content-length', 0),
                        'scraper': fp.parent.name
                    } for r in j['resources'] if r['source_url'].find('/print/') == -1]
                    dfs.append(pd.read_json(json.dumps(j)))
                except Exception as e:
                    logger.warning(f'Could not parse file {fp} as JSON! {e}')
        df = pd.concat(dfs, ignore_index=True)
        df.to_csv(df_dump, index=False)

    return df
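# The list_* aggregation helpers referenced elsewhere in this class are not shown in
# this excerpt. A minimal sketch of a per-domain resource count over the dataframe
# produced above (hypothetical helper; assumes only the 'source_url' column built above
# and the 'resource count' column name used by generate_statistics):

def _resource_count_per_domain_sketch(df):
    """Group scraped resources by the domain of the page they were found on."""
    domains = df['source_url'].apply(lambda u: urllib.parse.urlparse(u).netloc)
    counts = df.groupby(domains).size().reset_index(name='resource count')
    return counts.rename(columns={'source_url': 'domain'})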
def process_item(self, dataset, spider):
    # restrict the slug to 100 characters
    slug = slugify('-'.join(dataset['source_url'].split('/')[3:]))[:100]
    hashed_url = hashlib.md5(dataset['source_url'].encode('utf-8')).hexdigest()
    hashed_name = hashlib.md5(dataset['name'].encode('utf-8')).hexdigest()

    file_name = f"{slug}-{hashed_url}-{hashed_name}.json"
    file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{file_name}"

    self._log(dataset)
    logger.debug(f"Dumping to {file_path}")
    with open(file_path, 'w') as output:
        output.write(dataset.toJSON())
def __init__(self, delete_all_stats=False):
    logger.debug("Creating statistics...")

    if delete_all_stats:
        if os.path.exists(self.METRICS_OUTPUT_XLSX):  # check if the Excel sheet exists
            os.remove(self.METRICS_OUTPUT_XLSX)  # remove the Excel sheet

    deduplicated_list_path = os.getenv('ED_OUTPUT_PATH') + \
        '/transformers/deduplicate/deduplicated_all.lst'
    if os.path.exists(deduplicated_list_path):
        self.deduplicated_list_path = deduplicated_list_path
    else:
        self.deduplicated_list_path = None

    self.datopian_out_df = self._generate_datopian_df(use_dump=False)

    # self.resource_count_per_page = self.list_resource_count_per_page()
    self.resource_count_per_domain = self.list_resource_count_per_domain()
    self.page_count_per_domain = self.list_page_count_per_domain()
def generate_statistics(self):
    logger.debug("Creating statistics...")

    scraper_outputs_df = self._generate_scraper_outputs_df(use_dump=False)
    self.resource_count_per_page = self.list_resource_count_per_page(scraper_outputs_df)
    self.resource_count_per_domain = self.list_resource_count_per_domain(scraper_outputs_df)
    self.page_count_per_domain = self.list_page_count_per_domain(scraper_outputs_df)
    self.datasets_per_scraper = self.list_datasets_per_scraper()

    print(
        f"Total number of raw datasets: \n {self.datasets_per_scraper}\n",
        "\n---\n\n",
        f"Total number of pages: {self.page_count_per_domain['page count'].sum()}\n",
        "\n---\n\n",
        f"Total number of resources: {self.resource_count_per_domain['resource count'].sum()}\n",
        "\n---\n\n",
        f"Total number of pages by domain: \n{self.page_count_per_domain}\n",
        "\n---\n\n",
        f"Total number of resources by domain: \n{self.resource_count_per_domain}\n",
        "\n---\n\n",
    )
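# list_datasets_per_scraper() is not shown in this excerpt. As a rough sketch of one
# way such a count could be derived, assuming one JSON file per raw dataset under
# $ED_OUTPUT_PATH/scrapers/<scraper name>/ (the layout written by process_item), and
# not the project's actual implementation:

def _datasets_per_scraper_sketch():
    """Count raw dataset JSON files per scraper output directory."""
    scrapers_dir = pathlib.Path(os.getenv('ED_OUTPUT_PATH'), 'scrapers')
    rows = [{'scraper': d.name, 'dataset count': sum(1 for _ in d.rglob('*.json'))}
            for d in scrapers_dir.iterdir() if d.is_dir()]
    return pd.DataFrame(rows)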
def _log(self, d):
    logger.info("==================================================================================================")
    logger.success(f"{d['source_url']}")
    logger.info(f"Title: {d['title']}")
    logger.debug(f"Description: {d['notes']}")
    logger.debug(f"Name: {d['name']}")
    logger.info(f"Resources: {len(d['resources'])}")
    for r in d['resources']:
        logger.debug(f"\t{r['url']} > {r['name']}")
def process_item(self, dataset, spider):
    # restrict the slug to 100 characters
    slug = slugify('-'.join(dataset['source_url'].split('/')[3:]))[:100]
    hashed_url = hashlib.md5(dataset['source_url'].encode('utf-8')).hexdigest()
    hashed_name = hashlib.md5(dataset['name'].encode('utf-8')).hexdigest()
    file_name = f"{slug}-{hashed_url}-{hashed_name}.json"

    if dataset.get('publisher') and spider.name in ('edgov', 'sites'):
        try:
            name = dataset['publisher'].get('name', '')
        except AttributeError:
            # publisher is a plain string rather than a mapping
            name = dataset['publisher']
        Path(f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{name}").mkdir(
            parents=True, exist_ok=True)
        file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{name}/{file_name}"
    elif spider.name in ['oese', 'osers', 'oela', 'octae', 'ope', 'opepd']:
        Path(f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/edgov/{spider.name}").mkdir(
            parents=True, exist_ok=True)
        file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/edgov/{spider.name}/{file_name}"
    else:
        file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{file_name}"

    self._log(dataset)
    logger.debug(f"Dumping to {file_path}")
    with open(file_path, 'w') as output:
        output.write(dataset.toJSON())

    # add this attribute so that the saved (relative) location of datasets can be tracked
    dataset['saved_as_file'] = file_path[file_path.find("/scrapers/") + 1:]

    return dataset
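# Illustration with hypothetical values: for an item scraped by the 'ope' spider from
# https://www2.ed.gov/about/offices/list/ope/index.html, the slug becomes
# 'about-offices-list-ope-index-html', the file is written to
# $ED_OUTPUT_PATH/scrapers/edgov/ope/<slug>-<md5(source_url)>-<md5(name)>.json,
# and dataset['saved_as_file'] stores the path relative to $ED_OUTPUT_PATH
# (i.e. starting at 'scrapers/...').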
def transform(name, input_file=None):
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, '
                'falling back to all collected data!')
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + name

    datasets_number = 0
    resources_number = 0

    for file_path in file_list:
        data = read_file(file_path)
        if not data:
            continue
        dataset = _transform_scraped_dataset(data, name)
        catalog.datasets.append(dataset)
        datasets_number += 1
        resources_number += len(dataset.distribution)

    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{name}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())

    logger.debug(f'Output file: {file_path}')
    h.upload_to_s3_if_configured(file_path, f'{name}.data.json')
def transform(name=None, input_file=None):
    print(name)
    file_list = []  # holds the list of files which contain datajson/dataset
    datasets_list = []  # holds the data jsons gotten from files

    if name:
        # the name of the processed datajson is provided, so get that file path
        file_list.append(Path(h.get_output_path('datajson'), f'{name}.data.json'))
    else:
        # the name of the processed datajson is not provided, so get all of them
        file_list.extend(Path(h.get_output_path('datajson')).glob('*.json'))

    # read the contents of the files in file_list
    for file_path in file_list:
        df = pd.DataFrame(columns=[
            'title', 'description', 'tags', 'modified', 'publisher',
            'source_url', 'data_steward_email', 'name', 'access_level',
            'bureauCode', 'programCode', 'license', 'spatial',
            'categories', 'level_of_data'
        ])

        if name:
            sheet_name = name
        else:
            sheet_name = file_path.name.split('.')[0].upper()

        # read json from file using helper function
        data = h.read_file(file_path)

        # loop through the datasets contained in data
        for dd in data.get('dataset', []):
            dfd = {
                'name': dd.get('identifier', None),
                'title': dd.get('title', None),
                'description': dd.get('description', None),
                'tags': ', '.join(dd['keyword']),
                'modified': dd.get('modified', None),
                'publisher': dd['publisher']['name'],
                'source_url': dd['scraped_from'],
                'data_steward_email': dd['contactPoint']['hasEmail'],
                'access_level': dd.get('accessLevel', None),
                'bureauCode': ', '.join(dd.get('bureauCode', [])),
                'programCode': ', '.join(dd.get('programCode', [])),
                'license': dd.get('license', None),
                'spatial': dd.get('spatial', None),
                'categories': ', '.join(dd.get('theme', [])),
                'level_of_data': ', '.join(dd.get('levelOfData', [])),
            }

            # if df is None:
            #     # On first run, initialize the dataframe with the datajson structure
            #     # TODO: Remove this hack, maybe, sometimes
            #     df = pd.DataFrame(columns=dataset_dict.keys())
            # datasets_list.append(dataset_dict)
            # print(dataset_dict['title'])

            df2 = pd.DataFrame([dfd.values()], columns=dfd.keys())
            # print(df2)
            logger.debug(f"Dumping data for [{sheet_name}] {dd['identifier']}")
            df = df.append(df2, ignore_index=True)

        logger.debug(f"Dumping data for {file_path}")
        _add_to_spreadsheet(os.path.join(OUTPUT_DIR, 'datasets.xlsx'), sheet_name, df)
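# _add_to_spreadsheet() is a helper defined elsewhere. A minimal sketch of what such a
# helper could look like, assuming openpyxl is installed and pandas >= 1.3 (the name
# and behavior here are illustrative, not the project's implementation):

def _add_to_spreadsheet_sketch(file_path, sheet_name, df):
    """Write df to the named sheet, appending to the workbook if it already exists."""
    if os.path.exists(file_path):
        writer = pd.ExcelWriter(file_path, engine='openpyxl', mode='a',
                                if_sheet_exists='replace')
    else:
        writer = pd.ExcelWriter(file_path, engine='openpyxl')
    with writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)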
def transform(name, input_file=None):
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, '
                'falling back to all collected data!')
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + (name or 'all')

    # keep track of / stats for the items transformed
    datasets_number = 0
    resources_number = 0
    sources_number = 0
    collections_number = 0

    # loop through the list of file paths to be transformed
    for file_path in file_list:
        data = read_file(file_path)
        if not data:
            continue
        dataset = _transform_scraped_dataset(data, name)
        if not dataset:
            # no dataset was returned (i.e. the dataset was probably marked for removal)
            continue
        catalog.datasets.append(dataset)
        datasets_number += 1
        resources_number += len(dataset.distribution)

    # TODO WORK FROM BELOW HERE

    # get the list of Sources for this catalog
    catalog_sources = list()
    try:
        # read the list of preprocessed (but still 'raw') Sources from file
        catalog_sources = read_file(
            f"{h.get_output_path('sources')}/{(name or 'all')}.sources.json")
        # transform the list of preprocessed Sources into Source objects acceptable to the catalog
        catalog_sources = _transform_preprocessed_sources(catalog_sources)
    except Exception:
        logger.warning(
            f'"sources transformer" output file ({(name or "all")}.sources.json) not found. '
            'This datajson output will have no "source" field')

    # add the list of Source objects to the catalog
    catalog.sources = catalog_sources or []
    # update the number of transformed Sources
    sources_number = len(catalog_sources or [])

    # get the list of Collections for this catalog
    catalog_collections = list()
    try:
        # read the list of preprocessed (but still 'raw') Collections from file
        catalog_collections = read_file(
            f"{h.get_output_path('collections')}/{(name or 'all')}.collections.json")
        # transform the list of preprocessed Collections into Collection objects acceptable to the catalog
        catalog_collections = _transform_preprocessed_collections(catalog_collections)
    except Exception:
        logger.warning(
            f'"collections transformer" output file ({(name or "all")}.collections.json) not found. '
            'This datajson output will have no "collection" field')

    # add the list of Collection objects to the catalog
    catalog.collections = catalog_collections or []
    # update the number of transformed Collections
    collections_number = len(catalog_collections or [])

    # validate the catalog object
    if not catalog.validate_catalog(pls_fix=True):
        logger.error("Catalog validation failed! Ending the transform process")
        return

    logger.debug('{} Sources transformed.'.format(sources_number))
    logger.debug('{} Collections transformed.'.format(collections_number))
    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{(name or "all")}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())

    logger.debug(f'Output file: {file_path}')
    h.upload_to_s3_if_configured(file_path, f'{(name or "all")}.data.json')