Example #1
0
def _mark_private(dataset: dict,
                  search_words=[],
                  add_word_tag=True,
                  mark_as_private=True) -> dict:
    """ private helper function.
    FUNCTION ONLY adds/modifies the '_clean_data' key of 'dataset' during operations.

    searches the dataset title for provided search_words.
    if any search_words are found, will OPTIONALLY marks the datasets as private and
    adds the search word as a tag to the list of keywords"""

    if dataset.get('title') is None:
        return dataset

    dataset['title'] = dataset['title'].strip()

    # get the '_clean_data' key of dataset
    clean_data = dataset.setdefault('_clean_data', {})
    # get the title key from clean_data or use those from dataset
    title = clean_data.get('title', dataset['title'].strip())
    # get the tags key from clean_data or use those from dataset
    tags = clean_data.get('tags', dataset.get('tags', '').strip())

    for word in search_words:
        # check if word is in dataset title
        if re.search('\\b' + re.escape(word) + '\\b', title, re.IGNORECASE):
            # word is in title
            if add_word_tag and word.lower().replace(' ', '-')\
                not in h.transform_keywords(tags):
                # word needs to be added to tag
                tags = h.transform_keywords(tags)
                tags.append(word.lower().replace(' ', '-'))
                tags = ';'.join(tags)
                clean_data['tags'] = tags
            # mark dataset as 'private;
            if mark_as_private:
                clean_data['accessLevel'] = 'non-public'

    if len(clean_data.keys()) > 0:  # if '_clean_data' has keys
        dataset['_clean_data'] = clean_data  # update dataset
    else:  # else no keys
        del dataset['_clean_data']  # delete '_clean_data' key from dataset

    return dataset
Example #2
0
def _transform_scraped_dataset(data, target_dept):
    """Map a scraped dataset dict onto a ``Dataset`` instance.

    Enforces unique titles and identifiers across the run (via the
    module-level ``dataset_title_list`` / ``dataset_identifier_list``),
    fills publisher/contact/code defaults for ``target_dept``, and
    converts each scraped resource into a distribution.

    Args:
        data: raw scraped dataset mapping.
        target_dept: office/department code used for defaults.

    Returns:
        A populated ``Dataset`` object.
    """

    dataset = Dataset()

    # some source urls carry extra metadata after a '|'; keep the url part
    # (guard against a missing 'source_url' — '|' in None would raise)
    scraped_from = data.get('source_url') or ''
    if '|' in scraped_from:
        scraped_from = scraped_from.split('|')[0]
    dataset.scraped_from = scraped_from

    # strip leading/trailing whitespace; tolerate a missing title
    title = (data.get('title') or '').strip()
    # ensure datasets have a unique title
    if title and title not in dataset_title_list:
        dataset.title = title
        dataset_title_list.append(title)
    else:
        dataset.title = h.transform_dataset_title(title, scraped_from)

    identifier = data.get('name')
    # ensure datasets have a unique identifier
    if identifier in dataset_identifier_list:
        identifier = h.transform_dataset_identifier(title, scraped_from)
    dataset.identifier = identifier
    dataset_identifier_list.append(identifier)

    if data.get('tags'):
        dataset.keyword = h.transform_keywords(data.get('tags'))

    if data.get('notes'):
        dataset.description = data.get('notes')

    if data.get('date'):
        dataset.modified = data.get('date')

    publisher = Organization()
    publisher.name = h.get_office_name(target_dept)
    dataset.publisher = publisher

    contactPoint = {
        "@type": "vcard:Contact",
    }

    # fall back to a placeholder contact name and a departmental mailbox
    contactPoint['fn'] = data.get('contact_person_name') or 'n/a'

    if data.get('contact_person_email'):
        contactPoint['hasEmail'] = "mailto:" + data.get('contact_person_email')
    else:
        contactPoint['hasEmail'] = f'mailto:{target_dept}@ed.gov'

    dataset.contactPoint = contactPoint

    if data.get('accessLevel'):
        dataset.accessLevel = data.get('accessLevel')

    # department-level defaults when the scrape supplied nothing
    if not dataset.bureauCode:
        dataset.bureauCode = ["018:40"]

    if not dataset.programCode:
        dataset.programCode = ["018:000"]

    if not dataset.keyword:
        dataset.keyword = [target_dept]

    # tolerate a missing 'resources' key (iterating None would raise)
    dataset.distribution = [
        _transform_scraped_resource(target_dept, resource)
        for resource in (data.get('resources') or [])
    ]

    return dataset
Example #3
0
def _transform_scraped_dataset(data: dict, target_dept='all'):
    """Map a scraped dataset dict onto a ``Dataset`` instance.

    Honours sanitised values staged under ``data['_clean_data']``
    (returning None when the dataset is flagged for removal), enforces
    unique titles/identifiers via the module-level bookkeeping lists,
    fills departmental defaults, and attaches distributions plus any
    source/collection attributes.

    Args:
        data: raw scraped dataset mapping (mutated in place when
            '_clean_data' values are adopted).
        target_dept: office/department code used for defaults.

    Returns:
        A populated ``Dataset``, or None when flagged for removal.
    """

    # check if 'data' has sanitised data to be adopted
    if data.get('_clean_data', None):
        # there is sanitised data to be adopted into the datajson
        if data['_clean_data'].get('_remove_dataset', False) is True:
            # the 'data' has been flagged for removal
            return None  # exit function with no Dataset instance
        else:
            # update 'data' with the keys/values from _clean_data
            data.update(data['_clean_data'])

    dataset = Dataset()

    # NOTE(review): assumes 'source_url' is always present and a string —
    # '|' in None would raise TypeError; confirm upstream guarantees
    scraped_from = data.get('source_url')
    if '|' in scraped_from:
        # keep only the url part before the '|' separator
        scraped_from = scraped_from.split('|')[0]

    dataset.scraped_from = scraped_from

    # remove leading and trailing whitespace from title
    # NOTE(review): a missing 'title' key would raise AttributeError here
    title = data.get('title').strip()
    # ensure datasets have a unique title
    if title and title not in dataset_title_list:
        dataset.title = title
        dataset_title_list.append(title)
    else:
        # duplicate or empty title: derive a distinct one from the url
        dataset.title = h.transform_dataset_title(title, scraped_from)

    identifier = data.get('name')
    # ensure datasets have a unique identifier
    if identifier in dataset_identifier_list:
        identifier = h.transform_dataset_identifier(title, scraped_from)
    dataset.identifier = identifier
    dataset_identifier_list.append(identifier)

    if data.get('tags'):
        dataset.keyword = h.transform_keywords(data.get('tags'))

    if data.get('notes'):
        dataset.description = data.get('notes')

    if data.get('date'):
        dataset.modified = data.get('date')

    # publisher defaults to the office that owns target_dept
    publisher = Organization()
    publisher.name = h.get_office_name(target_dept)
    dataset.publisher = publisher

    contactPoint = {
        "@type": "vcard:Contact",
    }

    # fall back to a placeholder contact name
    if data.get('contact_person_name'):
        contactPoint['fn'] = data.get('contact_person_name')
    else:
        contactPoint['fn'] = 'n/a'  #h.get_office_name(target_dept)

    # fall back to a departmental mailbox
    if data.get('contact_person_email'):
        contactPoint['hasEmail'] = "mailto:" + data.get('contact_person_email')
    else:
        contactPoint['hasEmail'] = f'mailto:{target_dept}@ed.gov'

    dataset.contactPoint = contactPoint

    if data.get('accessLevel'):
        dataset.accessLevel = data.get('accessLevel')

    # department-level defaults when the scrape supplied nothing
    if not len(dataset.bureauCode) > 0:
        dataset.bureauCode = ["018:40"]

    if not len(dataset.programCode) > 0:
        dataset.programCode = ["018:000"]

    if not len(dataset.keyword) > 0:
        dataset.keyword = [(target_dept or 'all')]

    # convert each scraped resource into a distribution
    # NOTE(review): a missing 'resources' key would make this loop raise
    distributions = []
    resources = data.get('resources')
    for resource in resources:
        distribution = _transform_scraped_resource(target_dept, resource)
        distributions.append(distribution)

    dataset.distribution = distributions

    # get the 'source' attribute for the dataset object
    dataset_source = _transform_scraped_source(data)
    if dataset_source:
        dataset.source.append(dataset_source)

    # get the 'collection' attribute for the dataset object
    dataset_collection = _transform_scraped_collection(data)
    if dataset_collection:
        dataset.collection.append(dataset_collection)

    # get levelOfData
    if data.get('level_of_data', None):
        dataset.levelOfData = data.get('level_of_data')

    return dataset
Example #4
0
def _transform_scraped_dataset(data: dict, target_dept='all'):
    """Map a scraped dataset dict onto a ``Dataset`` instance.

    Honours sanitised values staged under ``data['_clean_data']``
    (returning None when the dataset is flagged for removal), enforces
    unique titles/identifiers via the module-level bookkeeping lists,
    resolves the publisher from either a dict or a plain office name,
    fills departmental defaults, and attaches distributions plus any
    source/collection attributes.

    Args:
        data: raw scraped dataset mapping (mutated in place when
            '_clean_data' values are adopted).
        target_dept: office/department code used for defaults.

    Returns:
        A populated ``Dataset``, or None when flagged for removal.
    """

    # adopt any sanitised data staged by the cleaning pipeline
    if data.get('_clean_data', None):
        if data['_clean_data'].get('_remove_dataset', False) is True:
            # the 'data' has been flagged for removal
            return None  # exit function with no Dataset instance
        # update 'data' with the keys/values from _clean_data
        data.update(data['_clean_data'])

    dataset = Dataset()

    # some source urls carry extra metadata after a '|'; keep the url part
    # (guard against a missing 'source_url' — '|' in None would raise)
    scraped_from = data.get('source_url') or ''
    if '|' in scraped_from:
        scraped_from = scraped_from.split('|')[0]
    dataset.scraped_from = scraped_from

    # strip leading/trailing whitespace; tolerate a missing title
    title = (data.get('title') or '').strip()
    # ensure datasets have a unique title
    if title and title not in dataset_title_list:
        dataset.title = title
        dataset_title_list.append(title)
    else:
        dataset.title = h.transform_dataset_title(title, scraped_from)

    identifier = data.get('name')
    # ensure datasets have a unique identifier
    if identifier in dataset_identifier_list:
        identifier = h.transform_dataset_identifier(title, scraped_from)
    dataset.identifier = identifier
    dataset_identifier_list.append(identifier)

    if data.get('groups'):
        dataset.theme = data.get('groups')

    if data.get('tags'):
        dataset.keyword = h.transform_keywords(data.get('tags'))

    if data.get('notes'):
        dataset.description = data.get('notes')

    if data.get('date'):
        dataset.modified = data.get('date')

    publisher = Organization()
    pub_value = data.get('publisher')
    if isinstance(pub_value, dict):
        publisher.name = pub_value['name']
        publisher.sub_organization_of = pub_value.get('subOrganizationOf', None)
    elif pub_value in h.map_office_name.values():
        # reverse-map a full office name back to its short code
        # (first match, same semantics as list(values()).index(...))
        publisher.name = next(code for code, office in h.map_office_name.items()
                              if office == pub_value)
    else:
        # if no publisher present, use the target_dept - part after last dot,
        # if applicable (e.g. both "oese" and "edgov.oese" yield "oese")
        publisher.name = data.get('publisher', target_dept.split('.')[-1])
    dataset.publisher = publisher

    contactPoint = {
        "@type": "vcard:Contact",
    }

    # fall back to a placeholder contact name
    contactPoint['fn'] = data.get('contact_person_name') or 'n/a'

    if data.get('contact_person_email'):
        contactPoint['hasEmail'] = "mailto:" + data.get('contact_person_email')
    elif target_dept == 'edgov':
        # edgov datasets get a publisher-specific mailbox when possible
        try:
            contactPoint[
                'hasEmail'] = f"mailto:{data['publisher']['name']}@ed.gov"
        except (KeyError, TypeError):
            # publisher is a plain string (or lacks 'name'): use it directly
            contactPoint['hasEmail'] = f"mailto:{data['publisher']}@ed.gov"
    else:
        contactPoint['hasEmail'] = f'mailto:{target_dept}@ed.gov'

    dataset.contactPoint = contactPoint

    if data.get('accessLevel'):
        dataset.accessLevel = data.get('accessLevel')

    # department-level defaults when the scrape supplied nothing
    # NOTE(review): other revisions of this function use "018:40" — confirm
    # which bureau code is intended
    if not dataset.bureauCode:
        dataset.bureauCode = ["018:00"]

    if not dataset.programCode:
        dataset.programCode = ["018:000"]

    if not dataset.keyword:
        dataset.keyword = [(target_dept or 'all')]

    # tolerate a missing 'resources' key (iterating None would raise)
    dataset.distribution = [
        _transform_scraped_resource(target_dept, resource)
        for resource in (data.get('resources') or [])
    ]

    # get levelOfData
    if data.get('level_of_data', None):
        dataset.levelOfData = data.get('level_of_data')

    if data.get('collection'):
        # derive the 'source' attribute for the dataset object
        for collection in data.get('collection', []):
            dataset_source = _transform_scraped_source(
                dict(collection=collection))
            if len(dataset_source) > 0:
                dataset.source.extend(dataset_source)

        # derive the 'collection' attribute for the dataset object
        for collection in data.get('collection', []):
            dataset_collection = _transform_scraped_collection(
                dict(collection=collection))
            if dataset_collection:
                dataset.collection.append(dataset_collection)

    return dataset