def _mark_private(dataset: dict, search_words=None,
                  add_word_tag=True, mark_as_private=True) -> dict:
    """Private helper.

    Only adds/modifies the '_clean_data' key of `dataset`. Searches the
    dataset title for the provided `search_words`; if any word is found,
    optionally marks the dataset as private and adds the matched word as a
    tag to the list of keywords.
    """

    if dataset.get('title') is None:
        return dataset

    dataset['title'] = dataset['title'].strip()
    search_words = search_words or []

    # get the '_clean_data' key of dataset
    clean_data = dataset.setdefault('_clean_data', {})
    # get the title from clean_data, or fall back to the dataset title
    title = clean_data.get('title', dataset['title'])
    # get the tags from clean_data, or fall back to the dataset tags
    tags = clean_data.get('tags', dataset.get('tags', '').strip())

    for word in search_words:
        # check whether the word appears in the dataset title
        if re.search(r'\b' + re.escape(word) + r'\b', title, re.IGNORECASE):
            # word is in the title
            if add_word_tag and word.lower().replace(' ', '-') \
                    not in h.transform_keywords(tags):
                # word needs to be added to the tags
                tags = h.transform_keywords(tags)
                tags.append(word.lower().replace(' ', '-'))
                tags = ';'.join(tags)
                clean_data['tags'] = tags
            # mark dataset as 'private'
            if mark_as_private:
                clean_data['accessLevel'] = 'non-public'

    if clean_data:  # '_clean_data' has keys
        dataset['_clean_data'] = clean_data  # update dataset
    else:  # no keys
        del dataset['_clean_data']  # delete '_clean_data' key from dataset

    return dataset
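# Illustrative only: a sketch of how _mark_private might be used during
# cleaning. The record and search words are hypothetical, and it is assumed
# that h.transform_keywords splits a ';'-separated tag string into a list of
# slugified keywords, as the code above implies.
def _example_mark_private():
    record = {
        'title': 'Student Privacy Complaint Records',
        'tags': 'complaints;records',
    }
    cleaned = _mark_private(record, search_words=['student privacy'])
    # under the assumptions above, the matched phrase is slugified into the
    # tags and the dataset is flagged non-public:
    # cleaned['_clean_data'] == {'tags': 'complaints;records;student-privacy',
    #                            'accessLevel': 'non-public'}
    return cleaned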
def _transform_scraped_dataset(data, target_dept):

    dataset = Dataset()

    scraped_from = data.get('source_url')
    if '|' in scraped_from:
        scraped_from = scraped_from.split('|')[0]
    dataset.scraped_from = scraped_from

    # remove leading and trailing whitespace from the title
    title = (data.get('title') or '').strip()
    # ensure datasets have a unique title
    if title and title not in dataset_title_list:
        dataset.title = title
        dataset_title_list.append(title)
    else:
        dataset.title = h.transform_dataset_title(title, scraped_from)

    identifier = data.get('name')
    # ensure datasets have a unique identifier
    if identifier in dataset_identifier_list:
        identifier = h.transform_dataset_identifier(title, scraped_from)
    dataset.identifier = identifier
    dataset_identifier_list.append(identifier)

    if data.get('tags'):
        dataset.keyword = h.transform_keywords(data.get('tags'))
    if data.get('notes'):
        dataset.description = data.get('notes')
    if data.get('date'):
        dataset.modified = data.get('date')

    publisher = Organization()
    publisher.name = h.get_office_name(target_dept)
    dataset.publisher = publisher

    contactPoint = {
        "@type": "vcard:Contact",
    }
    if data.get('contact_person_name'):
        contactPoint['fn'] = data.get('contact_person_name')
    else:
        contactPoint['fn'] = 'n/a'  # h.get_office_name(target_dept)

    if data.get('contact_person_email'):
        contactPoint['hasEmail'] = "mailto:" + data.get('contact_person_email')
    else:
        contactPoint['hasEmail'] = f'mailto:{target_dept}@ed.gov'
    dataset.contactPoint = contactPoint

    if data.get('accessLevel'):
        dataset.accessLevel = data.get('accessLevel')

    if not dataset.bureauCode:
        dataset.bureauCode = ["018:40"]
    if not dataset.programCode:
        dataset.programCode = ["018:000"]
    if not dataset.keyword:
        dataset.keyword = [target_dept]

    distributions = []
    resources = data.get('resources') or []
    for resource in resources:
        distribution = _transform_scraped_resource(target_dept, resource)
        distributions.append(distribution)
    dataset.distribution = distributions

    return dataset
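# Note (assumption, not shown in this excerpt): the uniqueness checks above
# rely on module-level lists that accumulate the titles and identifiers
# already emitted during a run, presumably initialised once near the top of
# the module, e.g.:
#
#   dataset_title_list = []
#   dataset_identifier_list = []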
def _transform_scraped_dataset(data: dict, target_dept='all'):

    # check whether 'data' carries sanitised data to be adopted
    if data.get('_clean_data', None):
        # there is sanitised data to be adopted into the datajson
        if data['_clean_data'].get('_remove_dataset', False) is True:
            # the 'data' has been flagged for removal
            return None  # exit with no Dataset instance
        else:
            # update 'data' with the keys/values from '_clean_data'
            data.update(data['_clean_data'])

    dataset = Dataset()

    scraped_from = data.get('source_url')
    if '|' in scraped_from:
        scraped_from = scraped_from.split('|')[0]
    dataset.scraped_from = scraped_from

    # remove leading and trailing whitespace from the title
    title = (data.get('title') or '').strip()
    # ensure datasets have a unique title
    if title and title not in dataset_title_list:
        dataset.title = title
        dataset_title_list.append(title)
    else:
        dataset.title = h.transform_dataset_title(title, scraped_from)

    identifier = data.get('name')
    # ensure datasets have a unique identifier
    if identifier in dataset_identifier_list:
        identifier = h.transform_dataset_identifier(title, scraped_from)
    dataset.identifier = identifier
    dataset_identifier_list.append(identifier)

    if data.get('tags'):
        dataset.keyword = h.transform_keywords(data.get('tags'))
    if data.get('notes'):
        dataset.description = data.get('notes')
    if data.get('date'):
        dataset.modified = data.get('date')

    publisher = Organization()
    publisher.name = h.get_office_name(target_dept)
    dataset.publisher = publisher

    contactPoint = {
        "@type": "vcard:Contact",
    }
    if data.get('contact_person_name'):
        contactPoint['fn'] = data.get('contact_person_name')
    else:
        contactPoint['fn'] = 'n/a'  # h.get_office_name(target_dept)

    if data.get('contact_person_email'):
        contactPoint['hasEmail'] = "mailto:" + data.get('contact_person_email')
    else:
        contactPoint['hasEmail'] = f'mailto:{target_dept}@ed.gov'
    dataset.contactPoint = contactPoint

    if data.get('accessLevel'):
        dataset.accessLevel = data.get('accessLevel')

    if not dataset.bureauCode:
        dataset.bureauCode = ["018:40"]
    if not dataset.programCode:
        dataset.programCode = ["018:000"]
    if not dataset.keyword:
        dataset.keyword = [(target_dept or 'all')]

    distributions = []
    resources = data.get('resources') or []
    for resource in resources:
        distribution = _transform_scraped_resource(target_dept, resource)
        distributions.append(distribution)
    dataset.distribution = distributions

    # get the 'source' attribute for the dataset object
    dataset_source = _transform_scraped_source(data)
    if dataset_source:
        dataset.source.append(dataset_source)

    # get the 'collection' attribute for the dataset object
    dataset_collection = _transform_scraped_collection(data)
    if dataset_collection:
        dataset.collection.append(dataset_collection)

    # get levelOfData
    if data.get('level_of_data', None):
        dataset.levelOfData = data.get('level_of_data')

    return dataset
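# Illustrative only (hypothetical records): how the '_clean_data' adoption
# above is expected to behave. A dataset whose '_clean_data' carries the
# '_remove_dataset' flag yields no Dataset instance, while other sanitised
# keys (e.g. an 'accessLevel' set by _mark_private) override the scraped
# values before the Dataset is built. Assumes the module's Dataset and
# Organization models and the 'h' helpers are available and behave as they
# are used in the transform.
def _example_clean_data_adoption():
    flagged = {'title': 'Old Draft', '_clean_data': {'_remove_dataset': True}}
    assert _transform_scraped_dataset(flagged) is None  # flagged for removal

    sanitised = {
        'source_url': 'https://www2.ed.gov/data',
        'name': 'example-dataset',
        'title': 'Example Dataset',
        'resources': [],
        '_clean_data': {'accessLevel': 'non-public'},
    }
    transformed = _transform_scraped_dataset(sanitised, target_dept='oese')
    # the adopted 'accessLevel' carries through to the Dataset instance
    assert transformed.accessLevel == 'non-public'
    return transformed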
def _transform_scraped_dataset(data: dict, target_dept='all'):

    # check whether 'data' carries sanitised data to be adopted
    if data.get('_clean_data', None):
        # there is sanitised data to be adopted into the datajson
        if data['_clean_data'].get('_remove_dataset', False) is True:
            # the 'data' has been flagged for removal
            return None  # exit with no Dataset instance
        else:
            # update 'data' with the keys/values from '_clean_data'
            data.update(data['_clean_data'])

    dataset = Dataset()

    scraped_from = data.get('source_url')
    if '|' in scraped_from:
        scraped_from = scraped_from.split('|')[0]
    dataset.scraped_from = scraped_from

    # remove leading and trailing whitespace from the title
    title = (data.get('title') or '').strip()
    # ensure datasets have a unique title
    if title and title not in dataset_title_list:
        dataset.title = title
        dataset_title_list.append(title)
    else:
        dataset.title = h.transform_dataset_title(title, scraped_from)

    identifier = data.get('name')
    # ensure datasets have a unique identifier
    if identifier in dataset_identifier_list:
        identifier = h.transform_dataset_identifier(title, scraped_from)
    dataset.identifier = identifier
    dataset_identifier_list.append(identifier)

    if data.get('groups'):
        dataset.theme = data.get('groups')
    if data.get('tags'):
        dataset.keyword = h.transform_keywords(data.get('tags'))
    if data.get('notes'):
        dataset.description = data.get('notes')
    if data.get('date'):
        dataset.modified = data.get('date')

    publisher = Organization()
    if isinstance(data.get('publisher'), dict):
        publisher.name = data.get('publisher', {'name': 'edgov'})['name']
        publisher.sub_organization_of = data.get('publisher', {}).get(
            'subOrganizationOf', None)
    else:
        # if no publisher is present, use the target_dept - the part after the
        # last dot, if applicable (e.g. both "oese" and "edgov.oese" yield "oese")
        if data.get('publisher') in h.map_office_name.values():
            publisher.name = list(h.map_office_name.keys())[list(
                h.map_office_name.values()).index(data.get('publisher'))]
        else:
            publisher.name = data.get('publisher', target_dept.split('.')[-1])
    dataset.publisher = publisher

    contactPoint = {
        "@type": "vcard:Contact",
    }
    if data.get('contact_person_name'):
        contactPoint['fn'] = data.get('contact_person_name')
    else:
        contactPoint['fn'] = 'n/a'  # h.get_office_name(target_dept)

    if data.get('contact_person_email'):
        contactPoint['hasEmail'] = "mailto:" + data.get('contact_person_email')
    else:
        if target_dept == 'edgov':
            try:
                contactPoint[
                    'hasEmail'] = f"mailto:{data['publisher']['name']}@ed.gov"
            except Exception:
                contactPoint['hasEmail'] = f"mailto:{data['publisher']}@ed.gov"
        else:
            contactPoint['hasEmail'] = f'mailto:{target_dept}@ed.gov'
    dataset.contactPoint = contactPoint

    if data.get('accessLevel'):
        dataset.accessLevel = data.get('accessLevel')

    if not dataset.bureauCode:
        dataset.bureauCode = ["018:00"]
    if not dataset.programCode:
        dataset.programCode = ["018:000"]
    if not dataset.keyword:
        dataset.keyword = [(target_dept or 'all')]

    distributions = []
    resources = data.get('resources') or []
    for resource in resources:
        distribution = _transform_scraped_resource(target_dept, resource)
        distributions.append(distribution)
    dataset.distribution = distributions

    # get levelOfData
    if data.get('level_of_data', None):
        dataset.levelOfData = data.get('level_of_data')

    if data.get('collection'):
        # get the 'source' attribute for the dataset object
        for collection in data.get('collection', []):
            dataset_source = _transform_scraped_source(
                dict(collection=collection))
            if len(dataset_source) > 0:
                dataset.source.extend(dataset_source)
        # get the 'collection' attribute for the dataset object
        for collection in data.get('collection', []):
            dataset_collection = _transform_scraped_collection(
                dict(collection=collection))
            if dataset_collection:
                dataset.collection.append(dataset_collection)

    return dataset
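# Illustrative only: a minimal, hypothetical record exercising the
# dict-valued 'publisher' handling added in this version. It assumes the
# Dataset/Organization models and the 'h' helpers are available and behave
# as they are used in the transform; all field values are made up.
def _example_publisher_handling():
    record = {
        'source_url': 'https://www2.ed.gov/about/offices/list/oese/data.html',
        'name': 'titlei-allocations',
        'title': 'Title I Allocations',
        'publisher': {'name': 'oese', 'subOrganizationOf': 'edgov'},
        'resources': [],
    }
    transformed = _transform_scraped_dataset(record, target_dept='edgov.oese')
    # the dict publisher is adopted as-is rather than derived from target_dept
    assert transformed.publisher.name == 'oese'
    assert transformed.publisher.sub_organization_of == 'edgov'
    return transformed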