Пример #1
0
def add_sources_to_collections_json(name,
                                    graph=GraphWrapper.graph,
                                    output_dir=OUTPUT_DIR):
    """ function writes the information identified in `graph` back to the
    json files on disk.

    Two kinds of files are updated while `graph.graph_lock` is held:

    1. `<output_dir>/transformers/collections/<name>.collections.json`:
       the first entry whose `collection_id` equals the `collection_id` of a
       collection vertex gets a `source` field holding that vertex's
       `in_source` attribute.
    2. every raw dataset json file located at `<output_dir>/<vertex name>`
       for dataset vertices whose `in_collection` is not None: the
       `collection` field is set to the vertex's `in_collection` attribute.

    PARAMETERS
    - name: used to build the `<name>.collections.json` file name
    - graph: object providing `graph_lock` and the `vs` vertex sequence
    - output_dir: root directory below which the json files are located
    """

    # the same file is read before, and written after, the graph traversal
    collections_path = Path(output_dir, 'transformers/collections',
                            f'{name}.collections.json')

    # get the collection datajson
    collections_json = h.read_file(collections_path)

    with graph.graph_lock:
        # select all collection vertices within the graph
        collection_vertex_seq = graph.vs.select(is_collection_eq=True,
                                                name_ne='base_vertex')
        for collection in collection_vertex_seq:
            vertex_collection_id = collection['collection_id']
            # collections within the collections.json matching this vertex
            matching_collections = [
                collection_obj for collection_obj in collections_json
                if collection_obj['collection_id'] == vertex_collection_id
            ]
            # if no collection returned from the datajson, skip this vertex
            if not matching_collections:
                continue
            # assign the source info from the vertex to the first match only
            matching_collections[0]['source'] = collection['in_source']

        # write the updated collection datajson back to file
        h.write_file(collections_path, collections_json)

        # update each raw dataset i.e. each dataset json file
        try:
            dataset_vertex_seq = graph.vs.select(is_dataset_eq=True,
                                                 name_ne='base_vertex',
                                                 in_collection_ne=None)
        except Exception:
            # best effort: a failing selection means there is nothing to update.
            # NOTE(review): presumably raised when a vertex attribute used in
            # the filter does not exist in the graph - confirm.
            dataset_vertex_seq = []

        for dataset_vertex in dataset_vertex_seq:
            dataset_path = Path(output_dir, dataset_vertex['name'])
            # read the raw dataset
            data = h.read_file(dataset_path)
            if not data:
                continue
            # update the raw dataset collection field
            data['collection'] = dataset_vertex['in_collection']
            # write the updated raw dataset back to file
            h.write_file(dataset_path, data)
Пример #2
0
def add_collections_to_raw_datasets(graph=GraphWrapper.graph,
                                    output_dir=OUTPUT_DIR):
    """ function writes the collections which have been identified in `graph`
    to their associated raw dataset json file.

    For every dataset vertex whose `in_collection` attribute is not None, the
    json file at `<output_dir>/<vertex name>` is read, its `collection` field
    is set to the vertex's `in_collection` attribute, and the file is written
    back. Files which cannot be read (falsy result from the read helper) are
    skipped. The whole operation runs while `graph.graph_lock` is held.

    PARAMETERS
    - graph: object providing `graph_lock` and the `vs` vertex sequence
    - output_dir: root directory below which the dataset json files are located
    """

    with graph.graph_lock:
        # select the dataset vertices from the graph
        try:
            dataset_vertex_seq = graph.vs.select(is_dataset_eq=True,
                                                 name_ne='base_vertex',
                                                 in_collection_ne=None)
        except Exception:
            # best effort: a failing selection means there is nothing to update.
            # NOTE(review): presumably raised when a vertex attribute used in
            # the filter does not exist in the graph - confirm.
            dataset_vertex_seq = []

        for dataset in dataset_vertex_seq:
            dataset_path = Path(output_dir, dataset['name'])
            # read the raw dataset
            data = h.read_file(dataset_path)
            if not data:
                continue
            # update the raw dataset
            data['collection'] = dataset['in_collection']
            # write the updated raw dataset back to file
            h.write_file(dataset_path, data)
Пример #3
0
def transform(name=None, input_file=None):
    """ function runs the clean-up steps on every dataset file and writes the
    result back to the same file.

    PARAMETERS
    - name: passed to `h.traverse_output` to obtain the list of dataset files
    when `input_file` is not given (or cannot be read)
    - input_file: optional path of a text file holding one dataset file path
    per line; trailing whitespace of each line is stripped

    Files for which `h.read_file` returns a falsy value are skipped.
    """

    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            # deliberately best effort: any read problem triggers the fallback,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = h.traverse_output(name)

    # keywords handed over to the marking helpers below
    private_search_words = [
        'conference', 'awards', 'user guide', 'applications'
    ]
    removal_search_words = ['photo', 'foto', 'photos', 'fotos']

    # loop through filepath in file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # nothing could be read from this file
            continue

        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data, search_words=private_search_words)

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(data, search_words=removal_search_words)

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from beginning of dataset title
        data = _strip_unwanted_string(data,
                                      r'^table [0-9a-z]+(-?[a-z])?\.',
                                      dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)

        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)

        # write modified dataset back to file
        h.write_file(file_path, data)
Пример #4
0
def transform(name=None, input_file=None):
    """
    function is responsible for transforming raw datasets into Sources

    A source is extracted from the `collection` key of every readable dataset
    file, the extracted sources are reduced to a distinct list (called with
    `min_occurence_counter=2`), and the result is written to
    `<CURRENT_TRANSFORMER_OUTPUT_DIR>/<name or "all">.sources.json`, which is
    then handed to `h.upload_to_s3_if_configured`.

    PARAMETERS
    - name: passed to `h.traverse_output` to obtain the list of dataset files
    when `input_file` is not given (or cannot be read); also used for the
    output file name
    - input_file: optional path of a text file holding one dataset file path
    per line
    """

    if input_file is None:  # no input file specified
        # run through all the files in 'name' directory
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            # deliberately best effort: any read problem triggers the fallback,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = h.traverse_output(name)

    # holds the list of sources acquired from 'name' scraper directory
    sources_list = []
    # loop through filepath in file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # nothing could be read from this file
            continue

        # retrieve source from dataset
        source = extract_source_from(dataset=data, use_key='collection')
        if source:  # skip datasets from which no source could be retrieved
            sources_list.append(source)

    # get a list of non-duplicate Sources
    sources_list = get_distinct_sources_from(sources_list,
                                             min_occurence_counter=2)

    # file name shared by the local copy and the S3 upload
    output_filename = f'{(name or "all")}.sources.json'
    # get the path where the Sources will be saved to on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{output_filename}'
    # write to file the Sources gotten from 'name' scraped output
    h.write_file(file_output_path, sources_list)
    # hand the written file to the S3 upload helper
    h.upload_to_s3_if_configured(file_output_path, output_filename)
Пример #5
0
def transform(name, input_file=None):
    """ function transforms scraped dataset files into one catalog and dumps
    it to `<name or "all">.data.json` in the 'datajson' output directory.

    PARAMETERS
    - name: passed to `traverse_output` to obtain the list of dataset files
    when `input_file` is not given (or cannot be read), and to
    `_transform_scraped_dataset`; also used for the catalog id and the output
    file name ('all' is used for those two when `name` is falsy)
    - input_file: optional path of a text file holding one dataset file path
    per line

    The written file is handed to `h.upload_to_s3_if_configured`.
    """
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            # deliberately best effort: any read problem triggers the fallback,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    # a None name previously raised TypeError on the string concatenation
    catalog_name = name or 'all'

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + catalog_name

    # statistics reported in the debug log
    datasets_number = 0
    resources_number = 0

    for file_path in file_list:

        data = read_file(file_path)
        if not data:  # nothing could be read from this file
            continue

        dataset = _transform_scraped_dataset(data, name)
        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_filename = f'{catalog_name}.data.json'
    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, output_filename)
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
        logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, output_filename)
Пример #6
0
def _domain_from_url(url):
    """ helper returns the host name of `url` with 'www2.' rewritten to 'www.'
    and every 'www.' removed; returns None when `url` is not a string or
    has no host name """
    if not isinstance(url, str):
        return None
    hostname = urllib.parse.urlparse(url).hostname
    if not hostname:
        return None
    return hostname.replace('www2.', 'www.').replace('www.', '')


def transform(name=None,
              input_file=None,
              use_raw_datasets=False) -> pd.DataFrame:
    """ function transforms the datajson/datasets into
    a dataframe/csv containing data to be used for RAG analyses on
    the efficacy of the scraping toolkit to get viable/usable structured data from
    the unstructured data source.

    The function by default operates on/utilises datajson i.e.
    the json that is ready to be ingested by the ckan harvester;
    However, setting 'use_raw_datasets' to True means the function will
    operate on the raw, parsed data which was scraped from the data source.

    PARAMETERS
    - name: if provided must correspond to the name of a scraper.
    if 'use_raw_datasets' is False, file with the format '<name>.data.json'
    will be located in the datajson subdirectory of 'ED_OUTPUT_PATH/transformers'
    and read.
    if 'use_raw_datasets' is True, dataset files contained in the 'name'
    scrapers subdirectory of the 'ED_OUTPUT_PATH/scrapers' will be read

    - input_file: if provided must be a file with list of datajson or dataset
    files to read (one path per line). If it cannot be read, a warning is
    logged and all collected data is used instead.

    - use_raw_datasets: selects raw dataset files instead of datajson files.

    If no parameters are provided, which is the default behaviour;
    then all datajson files contained in datajson subdirectory of
    'ED_OUTPUT_PATH/transformers' will be read.

    The resulting csv is written to OUTPUT_DIR and to a dated subdirectory of
    OUTPUT_DIR, and handed to `h.upload_to_s3_if_configured`.

    function returns the DataFrame containing the transformed datajson/dataset files
    """

    # ---- 1. work out which files to read ----
    if input_file:
        # read input_file, which is a list of files. The `try` wraps `open`
        # too, so a missing/unreadable file triggers the fallback as well
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            if use_raw_datasets:
                file_list = h.traverse_output(None)
            else:
                file_list = list(
                    Path(h.get_output_path('datajson')).glob('*.json'))
    elif use_raw_datasets:
        # loop over <name> scraper output e.g nces, or over everything
        file_list = h.traverse_output(name or None)
    elif name:
        # name of processed datajson is provided so get the file path
        file_list = [Path(h.get_output_path('datajson'), f'{name}.data.json')]
    else:
        # name of processed datajson not provided
        file_list = list(Path(h.get_output_path('datajson')).glob('*.json'))

    # ---- 2. read the files and score the datasets ----
    datasets_list = []  # holds the scored datasets gotten from files
    for file_path in file_list:
        # read json from the file using helper
        data = h.read_file(file_path)
        if not data:  # nothing could be read from this file
            continue
        if use_raw_datasets:
            # a raw file holds exactly one dataset
            compute_score(data, append_score=True, use_raw_datasets=True)
            datasets_list.append(data)
        else:
            # a datajson file holds a list of datasets under 'dataset'
            for dataset_dict in data.get('dataset', []):
                compute_score(dataset_dict,
                              append_score=True,
                              use_raw_datasets=False)
                datasets_list.append(dataset_dict)

    # ---- 3. map the datasets to rows ----
    if use_raw_datasets:
        dataset_rows_list = [[
            dataset.get('publisher'),
            dataset.get('source_url'),
            dataset.get('_weighted_score'),
            dataset.get('_weighted_score_ratio')
        ] for dataset in datasets_list]
    else:
        dataset_rows_list = [[
            # tolerate a missing publisher instead of raising TypeError
            (dataset.get('publisher') or {}).get('name'),
            dataset.get('scraped_from'),
            dataset.get('_weighted_score'),
            dataset.get('_weighted_score_ratio')
        ] for dataset in datasets_list]

    # create the pandas df
    weighted_datasets_scores_df = pd.DataFrame(dataset_rows_list,
                                               columns=[
                                                   'publisher', 'source url',
                                                   'weighted score',
                                                   'weighted score ratio'
                                               ])

    # create a df that incorporates domain info; mapping the column (rather
    # than applying row-wise) also yields a Series for an empty frame
    weighted_datasets_scores_df2 = pd.DataFrame({
        'domain':
        weighted_datasets_scores_df['source url'].map(_domain_from_url),
        'publisher':
        weighted_datasets_scores_df['publisher'],
        'source url':
        weighted_datasets_scores_df['source url'],
        'weighted score':
        weighted_datasets_scores_df['weighted score'],
        'weighted score ratio':
        weighted_datasets_scores_df['weighted score ratio'],
    })

    # ---- 4. write the outputs ----
    # take the date once so year/month/day always belong to the same instant
    today = dt.now()
    output_dated_dir = os.path.join(
        OUTPUT_DIR, f'{today.year}-{today.month}-{today.day}')
    Path(output_dated_dir).mkdir(parents=True, exist_ok=True)

    # create the output csv file name
    if use_raw_datasets:  # use raw datasets
        output_filename = "datasets_weighted_scores_{}_raw.csv".format(
            name or "all")
    else:  # use processed datajson
        output_filename = "datasets_weighted_scores_{}.csv".format(name
                                                                   or "all")

    # create the fullpath where file will be written
    fullpath = os.path.join(OUTPUT_DIR, output_filename)

    # write the dataframe to csv (latest copy and dated copy)
    weighted_datasets_scores_df2.to_csv(fullpath, index=False)
    weighted_datasets_scores_df2.to_csv(os.path.join(output_dated_dir,
                                                     output_filename),
                                        index=False)
    # hand the latest copy to the S3 upload helper
    h.upload_to_s3_if_configured(fullpath, f'{output_filename}')

    return weighted_datasets_scores_df2
Пример #7
0
def transform(name=None, input_file=None):
    """ function dumps the datasets of processed datajson files into the
    `datasets.xlsx` spreadsheet located in OUTPUT_DIR, one sheet per file.

    PARAMETERS
    - name: if provided, only '<name>.data.json' from the 'datajson' output
    directory is read and `name` is used as sheet name; otherwise every
    '*.json' file of that directory is read and the sheet name is the upper
    cased part of the file name before the first '.'
    - input_file: accepted for signature parity with the other transformers;
    not used by this function
    """

    logger.debug(f'Transforming datajson to spreadsheet for: {name}')

    # column order of every sheet
    columns = [
        'title',
        'description',
        'tags',
        'modified',  # a missing comma used to fuse this with 'publisher'
        'publisher',
        'source_url',
        'data_steward_email',
        'name',
        'access_level',
        'bureauCode',
        'programCode',
        'license',
        'spatial',
        'categories',
        'level_of_data',
    ]

    datajson_dir = Path(h.get_output_path('datajson'))
    if name:  # name of processed datajson is provided so get the file path
        file_list = [Path(datajson_dir, f'{name}.data.json')]
    else:  # name of processed datajson not provided
        file_list = list(datajson_dir.glob('*.json'))

    # read the contents in the file_list
    for file_path in file_list:

        sheet_name = name if name else file_path.name.split('.')[0].upper()

        # read json from file using helper function
        data = h.read_file(file_path)
        if not data:  # nothing could be read from this file
            logger.warning(f'No datajson read from {file_path}, skipping!')
            continue

        # collect all rows first; the frame is built once per file
        rows = []
        for dd in data.get('dataset', []):  # loop through the datasets contained in data
            rows.append({
                'name': dd.get('identifier'),
                'title': dd.get('title'),
                'description': dd.get('description'),
                'tags': ', '.join(dd.get('keyword') or []),
                'modified': dd.get('modified'),
                'publisher': (dd.get('publisher') or {}).get('name'),
                'source_url': dd.get('scraped_from'),
                'data_steward_email':
                (dd.get('contactPoint') or {}).get('hasEmail'),
                'access_level': dd.get('accessLevel'),
                'bureauCode': ', '.join(dd.get('bureauCode') or []),
                'programCode': ', '.join(dd.get('programCode') or []),
                'license': dd.get('license'),
                'spatial': dd.get('spatial'),
                'categories': ', '.join(dd.get('theme') or []),
                'level_of_data': ', '.join(dd.get('levelOfData') or []),
            })
            logger.debug(
                f"Dumping data for [{sheet_name}] {dd.get('identifier')}")

        df = pd.DataFrame(rows, columns=columns)

        logger.debug(f"Dumping data for {file_path}")
        _add_to_spreadsheet(os.path.join(OUTPUT_DIR, 'datasets.xlsx'),
                            sheet_name, df)
Пример #8
0
def transform(name=None, input_file=None):
    """ function runs the clean-up rules on every dataset file and writes the
    result back to the same file.

    Datasets are never deleted here; a dataset is flagged by storing
    `{'_remove_dataset': True}` under its `_clean_data` key.

    PARAMETERS
    - name: passed to `h.traverse_output` to obtain the list of dataset files
    when `input_file` is not given (or cannot be read); the value 'edgov'
    additionally enables the ed.gov specific rules
    - input_file: optional path of a text file holding one dataset file path
    per line

    Files for which `h.read_file` returns a falsy value are skipped.
    """

    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            # deliberately best effort: any read problem triggers the fallback,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            logger.warning(f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = h.traverse_output(name)

    # sub domains of ed.gov whose datasets are removed by the edgov rule
    bad_subdomains = ['dashboard', 'rems']
    oese_publishers = ['oese',
                       'Office of Elementary and Secondary Education',
                       'Office of Elementary and Secondary Education (OESE)']
    ies_nces_publishers = ['ies',
                           'Institute of Education Sciences (IES)',
                           'National Center for Education Statistics (NCES)',
                           'nces']

    # loop through filepath in file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # nothing could be read from this file
            continue

        # missing values are treated as empty instead of raising
        source_url = data.get('source_url') or ''
        resources = data.get('resources') or []

        # mark for removal the dataset that has only txt resources
        if _dataset_only_has_txt_resources(data):
            data['_clean_data'] = {'_remove_dataset': True}

        # Remove datasets with no resources or no relevant resources
        if not resources or not _filter_resources_list(resources):
            data['_clean_data'] = {'_remove_dataset': True}

        # Special hacks for ed.gov data
        if name == 'edgov':
            # unmark dataset for removal (overrides the two rules above) ...
            clean_data = {'_remove_dataset': False}

            # ... unless it was scraped from a "bad" URL
            if any(f'{bs}.ed.gov' in source_url for bs in bad_subdomains):
                clean_data['_remove_dataset'] = True  # mark dataset for removal

            data['_clean_data'] = clean_data  # update dataset

        # publisher is either a dict carrying a 'name' or a plain string
        publisher = data.get('publisher')
        publisher_name = ""
        if isinstance(publisher, dict):
            publisher_name = publisher.get('name')
        elif isinstance(publisher, str):
            publisher_name = publisher

        # OESE hack. Remove datasets outside oese.ed.gov domain
        if publisher_name in oese_publishers:
            if _dataset_outside_oese_domain(data):
                data['_clean_data'] = {'_remove_dataset': True}

        # Remove duplicate identifiers generated by duplicate URLs in IES/NCES
        if publisher_name in ies_nces_publishers:
            if source_url.endswith('current=yes'):
                # '_clean_data' only exists when an earlier rule created it
                clean_data = data.get('_clean_data') or {}
                clean_data['_remove_dataset'] = True  # mark dataset for removal
                data['_clean_data'] = clean_data  # update dataset

        # Filter resources
        data = _filter_dataset_resources(data)

        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data, search_words=['conference', 'awards',
                                                 'user guide', 'applications'])

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(data, search_words=['photo', 'foto', 'photos', 'fotos'])

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from beginning of dataset title
        data = _strip_unwanted_string(data, r'^table [0-9a-z]+(-?[a-z])?\.',
                                      dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)

        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)

        # write modified dataset back to file
        h.write_file(file_path, data)
Пример #9
0
def transform(name, input_file=None):
    """ function transforms scraped dataset files, together with the
    preprocessed Sources and Collections, into one catalog and dumps it to
    `<name or "all">.data.json` in the 'datajson' output directory.

    PARAMETERS
    - name: passed to `traverse_output` to obtain the list of dataset files
    when `input_file` is not given (or cannot be read), and to
    `_transform_scraped_dataset`; `name or "all"` is used for the catalog id
    and for the sources/collections/datajson file names
    - input_file: optional path of a text file holding one dataset file path
    per line

    Nothing is written when `catalog.validate_catalog(pls_fix=True)` fails;
    an error is logged and the function returns None. Otherwise the written
    file is handed to `h.upload_to_s3_if_configured`.
    """
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            # deliberately best effort: any read problem triggers the fallback,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + (name or 'all')

    # keep track/stats for items transformed
    datasets_number = 0
    resources_number = 0

    # loop through the list of filepaths to be transformed
    for file_path in file_list:

        data = read_file(file_path)
        if not data:  # nothing could be read from this file
            continue

        dataset = _transform_scraped_dataset(data, name)

        if not dataset:  # no dataset was returned (i.e. dataset probably marked for removal)
            continue

        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    # TODO WORK FROM BELOW HERE
    # get the list of Sources for this catalog
    catalog_sources = list()
    try:
        # read the list of preprocessed (but still 'raw') Sources from file
        catalog_sources = read_file(
            f"{h.get_output_path('sources')}/{(name or 'all')}.sources.json")
        # transform the list of preprocessed Sources to a list of Source objects acceptable for the catalog object
        catalog_sources = _transform_preprocessed_sources(catalog_sources)
    except Exception:
        # best effort: the catalog is still produced, just without Sources
        logger.warning(
            f'"sources transformer" output file ({(name or "all")}.sources.json) not found. This datajson output will have no "source" field'
        )

    # add the list of Source objects to the catalog
    catalog.sources = catalog_sources or []
    # number of transformed Sources
    sources_number = len(catalog_sources or [])

    # get the list of Collections for this catalog
    catalog_collections = list()
    try:
        # read the list of preprocessed (but still 'raw') Collections from file
        catalog_collections = read_file(
            f"{h.get_output_path('collections')}/{(name or 'all')}.collections.json"
        )
        # transform the list of preprocessed Collections to a list of Collection objects acceptable for the catalog object
        catalog_collections = _transform_preprocessed_collections(
            catalog_collections)
    except Exception:
        # best effort: the catalog is still produced, just without Collections
        logger.warning(
            f'"collections transformer" output file ({(name or "all")}.collections.json) not found. This datajson output will have no "collection" field'
        )

    # add the list of Collection objects to the catalog
    catalog.collections = catalog_collections or []
    # number of transformed Collections
    collections_number = len(catalog_collections or [])

    # validate the catalog object
    if not catalog.validate_catalog(pls_fix=True):
        logger.error("catalog validation Failed! Ending transform process")
        return

    logger.debug('{} Sources transformed.'.format(sources_number))
    logger.debug('{} Collections transformed.'.format(collections_number))
    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_filename = f'{(name or "all")}.data.json'
    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, output_filename)
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
        logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, output_filename)