예제 #1
0
    def __init__(self):
        """Initialize statistics: remove any stale metrics workbook, then
        load the Datopian scraper output and the AIR reference CSV into
        dataframes for later comparison."""
        logger.debug("Creating statistics...")

        if os.path.exists(
                self.METRICS_OUTPUT_XLSX):  # check if excel sheet exist
            os.remove(self.METRICS_OUTPUT_XLSX)  # remove the excel sheet

        # Pre-set the attributes so a failed load leaves them as None
        # (predictable) instead of entirely undefined (AttributeError).
        self.datopian_out_df = None
        self.air_out_df = None

        try:
            self.datopian_out_df = pd.read_csv(os.path.join(
                os.getenv('ED_OUTPUT_PATH'), 'out_df.csv'),
                                               header=0)
        except Exception as e:
            logger.error(
                'Could not load the Datopian CSV, please generate it first.')
            # read the AIR csv into a dataframe

        try:
            air_csv_url = 'https://storage.googleapis.com/storage/v1/b/us-ed-scraping/o/AIR.csv?alt=media'
            req = requests.get(air_csv_url)
            req.raise_for_status()  # fail fast on HTTP errors, not later on parse
            air_df_path = pathlib.Path(os.getenv('ED_OUTPUT_PATH'), 'tools',
                                       "stats", 'data', 'air_df.csv')
            # make the required path/directories
            pathlib.Path.resolve(air_df_path).parent.mkdir(parents=True,
                                                           exist_ok=True)
            # write the downloaded file to disk
            with open(air_df_path, 'wb') as air_df_file:
                air_df_file.write(req.content)

            self.air_out_df = pd.read_csv(air_df_path, header=0)
        except Exception as e:
            # include the cause so failures are diagnosable from the log
            logger.error(f'Could not load the AIR CSV: {e}')
예제 #2
0
def parse(res):
    """Parse page content into a dataset model, or return None when the
    page contains no resources."""

    # Printer-friendly pages duplicate their canonical versions; skip them.
    if '/print/' in res.url:
        return None

    logger.debug(f'{res.url}')

    soup = bs4.BeautifulSoup(res.text, 'html5lib')

    publisher = Publisher()
    publisher['name'] = 'edgov'
    publisher['subOrganizationOf'] = None

    # Look for at least one anchor whose href matches a known resource
    # extension; bail out early when the page links to none.
    resource_anchor = soup.body.find(name='a',
                                     href=base_parser.resource_checker,
                                     recursive=True)
    if resource_anchor is None:
        return None

    # At least one resource was found. Confirm this looks like an EDGOV
    # web page (contains a div) before handing off to the page parser.
    if soup.body.find(name='div', recursive=True) is None:
        return None
    return parsers.parser1.parse(res, publisher)
예제 #3
0
    def _generate_scraper_outputs_df(self, use_dump=False):
        """Build a dataframe of all scraped resources, one row per resource
        (url, source_url, publisher, size, scraper).

        use_dump: when True, load the previously written CSV dump instead
        of re-parsing every scraper output file. When parsing, the result
        is written back to the dump for later reuse.
        """
        def abs_url(url, source_url):
            # Resolve relative URLs (or scheme-less ones) against the page
            # they were scraped from.
            if url.startswith(
                ('../', './', '/')) or not urllib.parse.urlparse(url).scheme:
                full_url = urllib.parse.urljoin(source_url, url)
                return full_url
            else:
                return url

        if self.deduplicated_list_path is None:
            files = traverse_output()
        else:
            try:
                with open(self.deduplicated_list_path, 'r') as fp:
                    files = [pathlib.Path(line.rstrip()) for line in fp]
            except OSError:
                # list file unreadable; fall back to all collected output
                files = traverse_output()

        df_dump = str(
            pathlib.Path(
                os.path.join(os.getenv('ED_OUTPUT_PATH'), 'out_df.csv')))
        if use_dump:
            df = pd.read_csv(df_dump)
        else:
            dfs = []
            for fp in files:
                with open(fp, 'r') as json_file:
                    try:
                        j = json.load(json_file)

                        # if it's marked for removal by the sanitizer, skip it
                        if j.get('_clean_data', dict()).get('_remove_dataset'):
                            logger.debug(f"Ignoring {j.get('source_url')}")
                            continue

                        j = [{
                            'url':
                            abs_url(r['url'], r['source_url']),
                            'source_url':
                            r['source_url'],
                            'publisher':
                            str(j['publisher']),
                            'size':
                            r.get('headers', dict()).get('content-length', 0),
                            'scraper':
                            fp.parent.name
                        } for r in j['resources']
                             if r['source_url'].find('/print/') == -1]

                        dfs.append(pd.read_json(json.dumps(j)))

                    except Exception as e:
                        # log the path (fp), not the repr of the file object
                        logger.warning(
                            f'Could not parse file {fp} as JSON! {e}')
            if dfs:
                df = pd.concat(dfs, ignore_index=True)
            else:
                # pd.concat raises on an empty list; return an empty frame
                # with the expected columns instead of crashing
                df = pd.DataFrame(columns=[
                    'url', 'source_url', 'publisher', 'size', 'scraper'
                ])
            df.to_csv(df_dump, index=False)

        return df
예제 #4
0
    def process_item(self, dataset, spider):
        """Serialize a scraped dataset to a JSON file whose name combines a
        slug of the source URL with hashes of the URL and dataset name."""
        source_url = dataset['source_url']
        # restrict slug to 100 characters
        slug = slugify('-'.join(source_url.split('/')[3:]))[:100]
        hashed_url = hashlib.md5(source_url.encode('utf-8')).hexdigest()
        hashed_name = hashlib.md5(dataset['name'].encode('utf-8')).hexdigest()
        file_name = f"{slug}-{hashed_url}-{hashed_name}.json"
        file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{file_name}"
        self._log(dataset)
        logger.debug(f"Dumping to {file_path}")
        with open(file_path, 'w') as output:
            output.write(dataset.toJSON())
예제 #5
0
    def __init__(self, delete_all_stats=False):
        """Set up statistics generation.

        delete_all_stats: when True, remove any previously generated
        metrics workbook so statistics are rebuilt from scratch.
        """
        logger.debug("Creating statistics...")
        if delete_all_stats is True:
            if os.path.exists(
                    self.METRICS_OUTPUT_XLSX):  # check if excel sheet exist
                os.remove(self.METRICS_OUTPUT_XLSX)  # remove the excel sheet

        # Compute the deduplicated-list path once instead of duplicating
        # the same concatenation in the test and the assignment.
        dedup_list_path = os.getenv('ED_OUTPUT_PATH') +\
            '/transformers/deduplicate/deduplicated_all.lst'
        if os.path.exists(dedup_list_path):
            self.deduplicated_list_path = dedup_list_path
        else:
            self.deduplicated_list_path = None

        self.datopian_out_df = self._generate_datopian_df(use_dump=False)
        # self.resource_count_per_page = self.list_resource_count_per_page()
        self.resource_count_per_domain = self.list_resource_count_per_domain()
        self.page_count_per_domain = self.list_page_count_per_domain()
예제 #6
0
    def generate_statistics(self):
        """Compute all statistics dataframes from the scraper outputs and
        print a human-readable summary to stdout."""
        logger.debug("Creating statistics...")
        scraper_outputs_df = self._generate_scraper_outputs_df(use_dump=False)
        self.resource_count_per_page = self.list_resource_count_per_page(
            scraper_outputs_df)
        self.resource_count_per_domain = self.list_resource_count_per_domain(
            scraper_outputs_df)
        self.page_count_per_domain = self.list_page_count_per_domain(
            scraper_outputs_df)
        self.datasets_per_scraper = self.list_datasets_per_scraper()

        # Visual break between summary sections (was a placeholder-less
        # f-string repeated five times).
        separator = "\n---\n\n"
        print(
            f"Total number of raw datasets: \n {self.datasets_per_scraper}\n",
            separator,
            f"Total number of pages: {self.page_count_per_domain['page count'].sum()}\n",
            separator,
            f"Total number of resources: {self.resource_count_per_domain['resource count'].sum()}\n",
            separator,
            f"Total number of pages by domain: \n{self.page_count_per_domain}\n",
            separator,
            f"Total number of resources by domain: \n{self.resource_count_per_domain}\n",
            separator,
        )
예제 #7
0
 def _log(self, d):
     """Emit a readable, multi-line summary of a scraped dataset to the log."""
     logger.info("==================================================================================================")
     logger.success(f"{d['source_url']}")
     logger.info(f"Title: {d['title']}")
     logger.debug(f"Description: {d['notes']}")
     logger.debug(f"Name:{d['name']}")
     logger.info(f"Resources: {len(d['resources'])}")
     # one debug line per resource: its URL and display name
     for resource in d['resources']:
         logger.debug(f"\t{resource['url']} > {resource['name']}")
예제 #8
0
    def process_item(self, dataset, spider):
        """Write a scraped dataset to a JSON file under the appropriate
        scraper output directory and return the dataset with its saved
        location recorded in 'saved_as_file'.

        Layout: edgov/sites datasets go under a per-publisher directory;
        office sub-scrapers (oese, osers, ...) nest under edgov; everything
        else lands directly in the scraper's own directory.
        """
        slug = slugify('-'.join(dataset['source_url'].split('/')
                                [3:]))[:100]  # restrict slug to 100 characters
        hashed_url = hashlib.md5(
            dataset['source_url'].encode('utf-8')).hexdigest()
        hashed_name = hashlib.md5(dataset['name'].encode('utf-8')).hexdigest()
        file_name = f"{slug}-{hashed_url}-{hashed_name}.json"
        if dataset.get('publisher') and spider.name in ('edgov', 'sites'):
            try:
                name = dataset['publisher'].get('name', '')
            except AttributeError:
                # publisher may be a plain string instead of a mapping
                name = dataset['publisher']
            Path(f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{name}"
                 ).mkdir(parents=True, exist_ok=True)
            file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{name}/{file_name}"
        else:
            # office scrapers are organizationally part of edgov
            if spider.name in [
                    'oese', 'osers', 'oela', 'octae', 'ope', 'opepd'
            ]:
                Path(
                    f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/edgov/{spider.name}"
                ).mkdir(parents=True, exist_ok=True)
                file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/edgov/{spider.name}/{file_name}"
            else:
                file_path = f"{os.getenv('ED_OUTPUT_PATH')}/scrapers/{spider.name}/{file_name}"
        self._log(dataset)
        logger.debug(f"Dumping to {file_path}")
        with open(file_path, 'w') as output:
            output.write(dataset.toJSON())

        # add this attribute so that the saved (relative) location of datasets can be tracked
        dataset['saved_as_file'] = file_path[file_path.find("/scrapers/") + 1:]

        return dataset  # return the dataset
예제 #9
0
def transform(name, input_file=None):
    """Transform the scraped output files for scraper `name` into a single
    data.json catalog, write it to disk and upload it to S3 if configured.

    input_file: optional path to a file listing output files to transform
    (one path per line); on any read failure, or when None, all collected
    data for `name` is used instead.
    """
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except OSError:
            # logger.warn is deprecated; use warning (matches the rest of
            # the codebase)
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + name

    # counters for transformed items
    datasets_number = 0
    resources_number = 0

    for file_path in file_list:

        data = read_file(file_path)
        if not data:
            continue

        dataset = _transform_scraped_dataset(data, name)
        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{name}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
        logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, f'{name}.data.json')
예제 #10
0
def transform(name=None, input_file=None):
    """Flatten processed datajson catalog(s) into per-catalog sheets of the
    datasets.xlsx workbook.

    name: name of a processed datajson; when None, every *.json file in
    the datajson output directory is processed (one sheet per file).
    input_file: unused; kept for signature compatibility with the other
    transformers.
    """
    print(name)
    file_list = []  # holds the list of files which contain datajson/dataset

    if name:  # name of processed datajson is provided so get the file path
        file_list.append(Path(h.get_output_path('datajson'), f'{name}.data.json'))
    else:  # name of processed datajson not provided
        file_list.extend(Path(h.get_output_path('datajson')).glob('*.json'))

    # read the contents in the file_list
    for file_path in file_list:

        # BUG FIX: the original list was missing a comma after 'modified',
        # which fused it with 'publisher' into one bogus column name
        # 'modifiedpublisher' (and dropped the real 'modified'/'publisher'
        # columns from the initial frame).
        df = pd.DataFrame(columns=[
            'title',
            'description',
            'tags',
            'modified',
            'publisher',
            'source_url',
            'data_steward_email',
            'name',
            'access_level',
            'bureauCode',
            'programCode',
            'license',
            'spatial',
            'categories',
            'level_of_data',
        ])

        if name:
            sheet_name = name
        else:
            sheet_name = file_path.name.split('.')[0].upper()

        # read json from file using helper function
        data = h.read_file(file_path)
        for dd in data.get('dataset', []):  # loop through the datasets contained in data

            dfd = {
                'name': dd.get('identifier', None),
                'title': dd.get('title', None),
                'description': dd.get('description', None),
                'tags': ', '.join(dd['keyword']),
                'modified': dd.get('modified', None),
                'publisher': dd['publisher']['name'],
                'source_url': dd['scraped_from'],
                'data_steward_email': dd['contactPoint']['hasEmail'],
                'access_level': dd.get('accessLevel', None),
                'bureauCode': ', '.join(dd.get('bureauCode', [])),
                'programCode': ', '.join(dd.get('programCode', [])),
                'license': dd.get('license', None),
                'spatial': dd.get('spatial', None),
                'categories': ', '.join(dd.get('theme', [])),
                'level_of_data': ', '.join(dd.get('levelOfData', [])),
            }

            df2 = pd.DataFrame([dfd.values()], columns=dfd.keys())
            logger.debug(f"Dumping data for [{sheet_name}] {dd['identifier']}")
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported equivalent
            df = pd.concat([df, df2], ignore_index=True)

        logger.debug(f"Dumping data for {file_path}")
        _add_to_spreadsheet(os.path.join(OUTPUT_DIR, 'datasets.xlsx'), sheet_name, df)
예제 #11
0
def transform(name, input_file=None):
    """Transform scraped output (plus preprocessed Sources/Collections)
    into a validated data.json catalog, write it to disk and upload it to
    S3 if configured.

    name: scraper name, or None/falsy to process all collected data.
    input_file: optional path to a file listing the output files to
    transform (one per line); falls back to all collected data on error.
    """
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except OSError:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + (name or 'all')

    # keep track/stats for items transformed
    datasets_number = 0
    resources_number = 0
    sources_number = 0
    collections_number = 0

    # loop through the list of filepaths to be transformed
    for file_path in file_list:

        data = read_file(file_path)
        if not data:
            continue

        dataset = _transform_scraped_dataset(data, name)

        if not dataset:  # no dataset was returned (i.e. dataset probably marked for removal)
            continue

        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    # get the list of Sources for this catalog
    catalog_sources = list()
    try:
        # read the list of preprocessed (but still 'raw') Sources from file
        catalog_sources = read_file(
            f"{h.get_output_path('sources')}/{(name or 'all')}.sources.json")
        # transform the list of preprocessed Sources to a list of Source objects acceptable for the catalog object
        catalog_sources = _transform_preprocessed_sources(catalog_sources)
    except Exception:
        # narrowed from a bare except: never swallow SystemExit/KeyboardInterrupt
        logger.warning(
            f'"sources transformer" output file ({(name or "all")}.sources.json) not found. This datajson output will have no "source" field'
        )

    # add the list of Source objects to the catalog
    catalog.sources = catalog_sources or []
    # update the number of transformed Sources
    sources_number = len(catalog_sources or [])

    # get the list of Collections for this catalog
    catalog_collections = list()
    try:
        # read the list of preprocessed (but still 'raw') Collections from file
        catalog_collections = read_file(
            f"{h.get_output_path('collections')}/{(name or 'all')}.collections.json"
        )
        # transform the list of preprocessed Collections to a list of Collection objects acceptable for the catalog object
        catalog_collections = _transform_preprocessed_collections(
            catalog_collections)
    except Exception:
        # BUG FIX: this message previously (and wrongly) blamed the
        # "sources transformer" for a missing collections file
        logger.warning(
            f'"collections transformer" output file ({(name or "all")}.collections.json) not found. This datajson output will have no "collection" field'
        )

    # add the list of Collection objects to the catalog
    catalog.collections = catalog_collections or []
    # update the number of transformed Collections
    collections_number = len(catalog_collections or [])

    # validate the catalog object; abort the transform on failure
    if not catalog.validate_catalog(pls_fix=True):
        logger.error(f"catalog validation Failed! Ending transform process")
        return

    logger.debug('{} Sources transformed.'.format(sources_number))
    logger.debug('{} Collections transformed.'.format(collections_number))
    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{(name or "all")}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
        logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, f'{(name or "all")}.data.json')