def map(self, key, value, output):
    """Clean one drugs@FDA row: rename/strip fields, resolve type_id to its
    display value, normalize the date, and re-key on application number."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        v = v.strip() if isinstance(v, str) else v
        if k in self.rename_map and v is not None and v != '':
            new_key = self.rename_map[k]
            # A literal '0' title is treated as missing: returning nothing
            # makes transform_dict omit the pair entirely.
            if not (new_key == 'title' and v == '0'):
                return (new_key, v)

    json = common.transform_dict(value, _cleaner)
    # doc_lookup resolves the numeric type_id to its display value.
    json['type'] = self.doc_lookup[json['type_id']]
    del json['type_id']
    # Convert date to format used throughout openFDA (yyyymmdd)
    json['date'] = arrow.get(json['date']).strftime("%Y%m%d")
    json['url'] = common.convert_unicode(json['url'])
    # Assign application number as the key, since all three drugs@FDA files
    # can be joined by this key.
    key = build_submissions_key(json['application_number'], json)
    # These fields are folded into the key and dropped from the record body.
    del json['application_number'], json['submission_number'], json[
        'submission_type']
    output.add(key, json)
def map(self, key, value, output): IGNORE = ['physicalstate', 'technicalmethod', 'targetarea'] # Changing names to match the openFDA naming standard # key = source name, value = replacement name RENAME_MAP = { 'productcode': 'product_code', 'reviewcode': 'review_code', 'regulationnumber': 'regulation_number', 'devicename': 'device_name', 'medicalspecialty': 'medical_specialty', 'thirdpartyflag': 'third_party_flag', 'gmpexemptflag': 'gmp_exempt_flag', 'deviceclass': 'device_class' } MEDICAL_SPECIALTY = device_common.MED_SPECIALTY_ADVISORY_COMMITTEE def _cleaner(k, v): ''' A helper function used for removing and renaming dictionary keys. ''' k = k.lower() if k in IGNORE: return None if k in RENAME_MAP: k = RENAME_MAP[k] if k == 'medical_specialty': tk, tv = 'medical_specialty_description', MEDICAL_SPECIALTY[v] return [(k, v), (tk, tv)] return (k, v) new_value = common.transform_dict(value, _cleaner) output.add(key, new_value)
def reduce(self, key, values, output):
    """Post-join cleanup: drop keys that only existed to drive the join and
    rename the remainder to match the openFDA naming conventions."""
    IGNORE = [
        'key_val', 'address_id', 'address_type_id', 'contact_id', 'reg_key',
        'listing_prop_id', 'listing_prop_name_id', 'registration_listing_id',
        'establishment'
    ]
    RENAME_MAP = {
        'address_line1': 'address_1',
        'address_line2': 'address_2',
        'reg_status_id': 'status_code',
        'state_id': 'state_code'
    }

    def _prune(k, v):
        '''Drop join-only keys; rename everything else to convention.'''
        if k in IGNORE:
            return None
        return (RENAME_MAP.get(k, k), v)

    rows = self._join(values)
    if rows:
        prefix = 'final_result:' + key + ':'
        for idx, row in enumerate(rows):
            output.put(prefix + str(idx), common.transform_dict(row, _prune))
def map(self, key, value, output):
    """Clean one drugs@FDA products row: rename/strip/map values, expand
    active ingredients into objects, split dosage form and route, and
    re-key on application number."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        v = v.strip() if isinstance(v, str) else v
        if k in self.rename_map and v is not None and v != '':
            new_key = self.rename_map[k]
            # Translate coded values (e.g. flags) to their display form.
            if new_key in self.VALUE_MAPPINGS and v in self.VALUE_MAPPINGS[
                    new_key]:
                v = self.VALUE_MAPPINGS[new_key][v]
            return (new_key, v)

    json = common.transform_dict(value, _cleaner)

    # Turn active ingredients into an array of objects as per the mapping.
    if json.get('active_ingredients'):
        # Normalize '; ' separators to ';' before splitting.
        ingredientList = re.sub(';\s+', ';',
                                json['active_ingredients']).split(';')
        json['active_ingredients'] = []
        strengthList = re.sub(
            ';\s+', ';',
            json['strength']).split(';') if json.get('strength') else []
        for idx, name in enumerate(ingredientList):
            ingredient = {'name': name}
            # Strengths pair positionally with ingredients when present.
            if len(strengthList) > idx:
                ingredient['strength'] = strengthList[idx]
            json['active_ingredients'].append(ingredient)
    else:
        # Delete to avoid complaints from Elasticsearch.
        if json.get('active_ingredients') is not None:
            del json['active_ingredients']
        if json.get('strength') is not None:
            del json['strength']

    # Split dosage and form into two distinct fields.
    if json.get('df_and_route') and len(
            json['df_and_route'].split(';')) == 2:
        json['dosage_form'] = json['df_and_route'].split(';')[0].strip()
        json['route'] = json['df_and_route'].split(';')[1].strip()
    # Sometimes the entire entry is Unknown. Indicate this for both df & route.
    elif json.get('df_and_route') and "UNKNOWN" in json['df_and_route']:
        json['dosage_form'] = json['df_and_route']
        json['route'] = json['df_and_route']
    # Sometimes the entry only contains the dosage form.
    # NOTE(review): when df_and_route is absent this sets dosage_form=None
    # too — confirm that is intended.
    else:
        json['dosage_form'] = json['df_and_route']
        json['route'] = None
    # Delete the field either way
    del json['df_and_route']

    # Assign application number as the key, since all three drugs@FDA files
    # can be joined by this key.
    key = build_products_key(json['application_number'], json)
    del json['application_number']
    output.add(key, json)
def map(self, key, value, output):
    """Clean one recall record, attach extracted UPC/NDC codes for drug
    recalls, and emit it keyed by a content hash of the record."""
    def cleaner(k, v):
        # Rename to convention; reformat date fields to YYYYMMDD.
        if k in RENAME_MAP:
            k = RENAME_MAP[k]
        if k in DATE_KEYS:
            # Empty dates are dropped entirely (Elasticsearch cannot
            # handle them); returning None omits the pair.
            if not v:
                return None
            v = arrow.get(v, 'MM/DD/YYYY').format('YYYYMMDD')
        return (k, v)

    val = common.transform_dict(value, cleaner)

    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    if val.get('product-type') == 'Drugs':
        val['upc'] = extract.extract_upc_from_recall(val)
        val['ndc'] = extract.extract_ndc_from_recall(val)

    # There is not a decent ID for the report, so we need to make one
    doc_id = self._hash(json.dumps(val, sort_keys=True))
    val['@id'] = doc_id
    val['@version'] = 1

    # Only write out vals that have required keys and a meaningful date
    if set(logic_keys).issubset(val) and val['report-date'] is not None:
        output.add(doc_id, val)
    else:
        # Fix: logging.warn() is a deprecated alias for logging.warning();
        # also corrected the "Docuemnt" typo in the message.
        logging.warning('Document is missing required fields. %s',
                        json.dumps(val, indent=2, sort_keys=True))
def map(self, key, value, output):
    """Clean one drugs@FDA submissions row, resolve the submission class
    code id to code + description, normalize the status date, and key on
    application number."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        v = common.convert_unicode(v.strip()) if isinstance(v, str) else v
        if k in self.rename_map and v is not None and v != '':
            return (self.rename_map[k], v)

    json = common.transform_dict(value, _cleaner)

    # doc_lookup maps id -> (code, description). (The second `is not None`
    # test is redundant with the truthiness check but kept as-is.)
    if json.get('submission_class_code_id') and json.get(
            'submission_class_code_id') is not None:
        json['submission_class_code'] = self.doc_lookup[
            json['submission_class_code_id']][0]
        descr = self.doc_lookup[
            json['submission_class_code_id']][1].rstrip()
        if descr:
            json['submission_class_code_description'] = descr
        del json['submission_class_code_id']

    # Convert date to format used throughout openFDA (yyyymmdd)
    if json.get('submission_status_date'):
        json['submission_status_date'] = arrow.get(
            json['submission_status_date']).strftime("%Y%m%d")

    # Assign application number as the key, since all three drugs@FDA files
    # can be joined by this key.
    key = build_submissions_key(json['application_number'], json)
    del json['application_number']
    output.add(key, json)
def map(self, key, value, output):
    """Rename device-recall CSV keys to convention, split the submission
    numbers field into typed lists (510(k)/PMA/other), and reformat the
    termination date."""
    RENAME_MAP = {
        'res event number': 'res_event_number',
        'rcl products res number': 'product_res_number',
        'rcl firm fei number': 'firm_fei_number',
        'rcl event info date terminated': 'event_date_terminated',
        'rcl event info root cause description': 'root_cause_description',
        'rcl products product code': 'product_code',
        'rcl products submission numbers': 'product_submission_numbers'
    }

    # Uses renamed fields
    DATE_KEYS = ['event_date_terminated']

    def _cleaner(k, v):
        '''
        A helper function that is used for renaming dictionary keys and
        formatting dates.
        '''
        def is_other(data):
            # True when the entry is neither a PMA nor a 510(k) nor a
            # G-prefixed number.
            if not common.get_p_number(data) and \
               not common.get_k_number(data) and \
               not data.startswith('G'):
                return True
            return False

        k = k.lower()
        if k in RENAME_MAP:
            k = RENAME_MAP[k]

        if k == 'product_submission_numbers':
            # We take this single key, split it on space and then return
            # three new keys and not the original.
            submissions = [s for s in v.split(' ') if s]
            k_numbers = [d for d in submissions if common.get_k_number(d)]
            pma_numbers = [d for d in submissions if common.get_p_number(d)]
            other = ' '.join([d.strip() for d in submissions if is_other(d)])
            return [('k_numbers', k_numbers),
                    ('pma_numbers', pma_numbers),
                    ('other_submission_description', other)]

        # Idiom fix: identity comparison with None (was `v != None`).
        if v is not None:
            if k in DATE_KEYS:
                # Elasticsearch cannot handle null dates; an empty string
                # is omitted by returning None to transform_dict.
                if not v:
                    return None
                v = arrow.get(v, 'YYYY/MM/DD HH:mm:ss').format('YYYY-MM-DD')
            else:
                return (k, v.strip())
        return (k, v)

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
def transform_dates(coll):
    """Lower-case keys, reformat created_date values to YYYY-MM-DD, and
    strip string values; list values pass through untouched."""
    DATE_KEYS = ['created_date']

    def _replace_date(k, v):
        # A missing key cannot be emitted; omit the pair entirely.
        if k is None:
            return
        k = k.lower()
        if v is None:
            return (k, v)
        if k in DATE_KEYS:
            return (k, arrow.get(v, 'MM/DD/YYYY')\
              .format('YYYY-MM-DD'))
        if isinstance(v, list):
            return (k, v)
        # NOTE(review): assumes any remaining value is a string — a
        # non-string scalar here would raise AttributeError on .strip().
        return (k, v.strip())

    return common.transform_dict(coll, _replace_date)
def map(self, key, value, output):
    """Validate one pipe-delimited MAUDE row, reject malformed rows to a
    per-file-type reject stream, attach device problem descriptions for
    mdrfoi records, and emit (file_type, record) keyed by the MDR key."""
    if len(value) < 1:
        return

    mdr_key = value[0]

    # Some of the files have headers, we will apply our own, so a row that
    # starts with this value is safe to skip.
    if 'MDR_REPORT_KEY' in mdr_key:
        return

    file_type = [s for s in CATEGORIES if s in self.filename][0]
    # logging.info('file type: %s, file: %s', file_type, self.filename)

    # TODO(hansnelsen): remove once file format is stable
    if file_type == 'mdrfoi':
        value = self.shuffle_mdrfoi_value(value)

    # We send all data anomalies to a reducer for each file type.
    # These non-conforming data are written to a reject file for review.
    # This file type has variable lengths over time, so it needs its own check.
    if file_type == 'foidev':
        if len(value) not in [28, 45]:
            logging.info('Does not conform to foidev structure. Skipping: %s, %s',
                         mdr_key, '#' * 200)
            output.add(file_type,
                       '%s: missing fields' % mdr_key + ':' + '|'.join(value))
            return
    elif len(value) != len(FILE_HEADERS[file_type]):
        logging.info('Does not conform to %s structure. Skipping: %s, %s',
                     file_type, mdr_key, '#' * 200)
        output.add(file_type,
                   '%s: missing fields' % mdr_key + ':' + '|'.join(value))
        return

    # Bug fix: `not isinstance(int(mdr_key), int)` was always False, and
    # int() raised an uncaught ValueError on non-numeric keys — so the
    # reject path below could never run. Catch the ValueError instead.
    try:
        int(mdr_key)
    except ValueError:
        logging.info('%s is not a number', mdr_key)
        output.add(file_type, '%s: NaN' % mdr_key + ':' + '|'.join(value))
        return

    # If it makes it this far, it is a good record
    new_value = dict(zip(FILE_HEADERS[file_type], value))
    new_value = common.transform_dict(new_value, self.cleaner)

    # https://github.com/FDA/openfda/issues/27
    # We need to see if device problem code is available for this report in
    # the foidevproblem.txt file, resolve it to a problem description, and
    # add it to the master record.
    if file_type == 'mdrfoi':
        problem_codes = self.device_problem_codes.get(mdr_key)
        if problem_codes is not None:
            product_problems = [self.problem_codes_reference.get(code)
                                for code in problem_codes]
            new_value['product_problems'] = product_problems

    output.add(mdr_key, (file_type, new_value))
def transform_dates(coll):
    """Normalize a record: lower-case keys, render created_date as
    YYYY-MM-DD, strip scalar strings, and leave lists alone."""
    DATE_KEYS = ['created_date']

    def _normalize(field, val):
        '''Per-pair transform handed to transform_dict.'''
        if field is None:
            # No key to emit — drop the pair.
            return
        field = field.lower()
        if val is None:
            return (field, val)
        if field in DATE_KEYS:
            reformatted = arrow.get(val, 'MM/DD/YYYY').format('YYYY-MM-DD')
            return (field, reformatted)
        if isinstance(val, list):
            return (field, val)
        return (field, val.strip())

    return common.transform_dict(coll, _normalize)
def map(self, key, value, output):
    """Rename whitelisted keys and trim PRODUCTID down to its second
    underscore-delimited component."""
    def _cleaner(k, v):
        '''Rename keys via the rename map; drop any key not in it.'''
        if k == 'PRODUCTID':
            # Keep only the part after the first underscore — presumably the
            # SPL document id; verify against the source file format.
            v = v.split('_')[1]
        if k in self.rename_map:
            return (self.rename_map[k], v)

    output.add(key, common.transform_dict(value, _cleaner))
def map(self, key, value, output):
    """Clean one NDC product row: rename/strip fields, coerce boolean
    flags, derive spl_id and brand name, and split multi-value fields into
    arrays."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        if k in self.booleans:
            # 'Y'/'y' -> True, other non-empty values -> False,
            # empty/None -> None.
            v = v in ['Y', 'y'] if v is not None and v != '' else None
        v = v.strip() if isinstance(v, str) else v
        if k in self.rename_map and v is not None and v != '':
            return (self.rename_map[k], v)

    json = common.transform_dict(value, _cleaner)

    # SPL ID
    # product_id is of the form "<prefix>_<spl_id>"; keep the second part.
    json['spl_id'] = json['product_id'].split('_')[1]

    # Brand name parsing
    json['brand_name_base'] = json.get('brand_name')
    if json.get('brand_name_suffix'):
        json['brand_name'] = (json.get('brand_name')
                              if json.get('brand_name') is not None else
                              '') + ' ' + json['brand_name_suffix']

    # Route is a multi-value field easily parseable
    if json.get('route'):
        json['route'] = re.sub(';\s+', ';', json['route']).split(';')

    # Pharm class field is a multi-value field easily parseable
    if json.get('pharm_class'):
        json['pharm_class'] = re.sub(',\s+', ',',
                                     json['pharm_class']).split(',')

    # Turn active ingredients into an array of objects as per the mapping.
    if json.get('active_ingredients'):
        ingredientList = re.sub(';\s+', ';',
                                json['active_ingredients']).split(';')
        json['active_ingredients'] = []
        strengthList = re.sub(';\s+', ';', json['strength']).split(
            ';') if json.get('strength') else []
        unitList = re.sub(';\s+', ';', json['unit']).split(
            ';') if json.get('unit') else []
        for idx, name in enumerate(ingredientList):
            ingredient = {'name': name}
            # Strengths and units pair positionally with ingredients.
            if len(strengthList) > idx:
                ingredient['strength'] = strengthList[idx] + (
                    ' ' + unitList[idx] if len(unitList) > idx else '')
            json['active_ingredients'].append(ingredient)
    else:
        # Delete to avoid complaints from Elasticsearch.
        if json.get('active_ingredients') is not None:
            del json['active_ingredients']
        if json.get('strength') is not None:
            del json['strength']
        if json.get('unit') is not None:
            del json['unit']

    output.add(key, json)
def map(self, key, value, output):
    """Rename device-recall CSV keys to convention, split the submission
    numbers field into typed lists (510(k)/PMA/other), and reformat the
    termination date."""
    RENAME_MAP = {
        'res event number': 'res_event_number',
        'rcl products res number': 'product_res_number',
        'rcl firm fei number': 'firm_fei_number',
        'rcl event info date terminated': 'event_date_terminated',
        'rcl event info root cause description': 'root_cause_description',
        'rcl products product code': 'product_code',
        'rcl products submission numbers': 'product_submission_numbers'
    }

    # Uses renamed fields
    DATE_KEYS = ['event_date_terminated']

    def _cleaner(k, v):
        '''
        A helper function that is used for renaming dictionary keys and
        formatting dates.
        '''
        def is_other(data):
            # True when the entry is neither a PMA nor a 510(k) nor a
            # G-prefixed number.
            if not common.get_p_number(data) and \
               not common.get_k_number(data) and \
               not data.startswith('G'):
                return True
            return False

        k = k.lower()
        if k in RENAME_MAP:
            k = RENAME_MAP[k]

        if k == 'product_submission_numbers':
            # We take this single key, split it on space and then return
            # three new keys and not the original.
            submissions = [s for s in v.split(' ') if s]
            k_numbers = [d for d in submissions if common.get_k_number(d)]
            pma_numbers = [d for d in submissions if common.get_p_number(d)]
            other = ' '.join([d.strip() for d in submissions if is_other(d)])
            return [('k_numbers', k_numbers),
                    ('pma_numbers', pma_numbers),
                    ('other_submission_description', other)]

        # Idiom fix: identity comparison with None (was `v != None`).
        if v is not None:
            if k in DATE_KEYS:
                # Elasticsearch cannot handle null dates; an empty string
                # is omitted by returning None to transform_dict.
                if not v:
                    return None
                v = arrow.get(v, 'YYYY/MM/DD HH:mm:ss').format('YYYY-MM-DD')
            else:
                return (k, v.strip())
        return (k, v)

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
def map(self, key, value, output):
    """Coerce boolean flag columns, strip string values, and keep only the
    keys present in the rename map."""
    def _cleaner(k, v):
        '''Per-pair transform: booleans, whitespace, rename-or-purge.'''
        if k in self.booleans:
            # 'Y'/'y' -> True, other non-empty -> False, empty/None -> None.
            v = v in ['Y', 'y'] if v is not None and v != '' else None
        if isinstance(v, str):
            v = v.strip()
        if k in self.rename_map and v is not None and v != '':
            return (self.rename_map[k], v)

    output.add(key, common.transform_dict(value, _cleaner))
def map(self, key, value, output):
    """Keep only the whitelisted RxNorm columns, rename setid, and key on
    SPL set id + version."""
    keepers = ['rxcui', 'rxstring', 'rxtty', 'setid', 'spl_version']
    rename_map = {'setid': 'spl_set_id'}

    def _cleaner(k, v):
        '''Lower-case keys, drop non-whitelisted ones, rename as needed.'''
        k = k.lower()
        if k not in keepers:
            return None
        return (rename_map.get(k, k), v)

    row = common.transform_dict(value, _cleaner)
    # Each SPL version gets its own record.
    output.add(row['spl_set_id'] + ':' + row['spl_version'], row)
def map(self, key, value, output):
    """Project each row down to the whitelisted RxNorm fields and emit it
    under a '<spl_set_id>:<spl_version>' key."""
    keepers = ['rxcui', 'rxstring', 'rxtty', 'setid', 'spl_version']
    rename_map = {'setid': 'spl_set_id'}

    def _cleaner(raw_key, val):
        '''Whitelist filter with optional rename; None drops the pair.'''
        lowered = raw_key.lower()
        if lowered in keepers:
            return (rename_map.get(lowered, lowered), val)
        return None

    cleaned = common.transform_dict(value, _cleaner)
    composite_key = cleaned['spl_set_id'] + ':' + cleaned['spl_version']
    output.add(composite_key, cleaned)
def map(self, key, value, output):
    """Rename fields, rendering date columns as integer strings and
    title-casing proprietary names."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        if k in self.rename_map and v is not None and v != '':
            # Date columns arrive as numerics; render as integer strings.
            if "Date" in k:
                return (self.rename_map[k], str(int(v)))
            # The else below pairs with this if: keys matching neither
            # special case take the plain-rename branch.
            if "Proprietary Name" in k:
                return (self.rename_map[k], str(v).title())
            else:
                return (self.rename_map[k], v)

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
def map(self, key, value, output):
    """Strip and filter fields, then emit the record keyed by its
    drugs@FDA application number."""
    def _cleaner(k, v):
        '''Strip strings; keep only renamed keys with meaningful values.'''
        if isinstance(v, str):
            v = v.strip()
        # 'Null' is a literal placeholder string in the source data.
        if k in self.rename_map and v is not None and v not in ('', 'Null'):
            return (self.rename_map[k], v)

    record = common.transform_dict(value, _cleaner)

    # All three drugs@FDA files join on the application number, so key on it.
    key = build_submissions_key(record['application_number'], record)
    del record['application_number'], record['submission_number'], record['submission_type']
    if record != {}:
        output.add(key, record)
def map(self, key, value, output):
    """Rename tobacco-report fields, reformat dates, and split the three
    multi-value fields into de-duplicated arrays."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        if k in self.rename_map and v is not None:
            # Reformat M/D/YYYY inputs to zero-padded MM/DD/YYYY.
            if "DATE" in k:
                return self.rename_map[k], str(arrow.get(v, 'M/D/YYYY').format("MM/DD/YYYY"))
            # Make arrays of these three fields.
            if k in ["TOBACCO_PRODUCTS", "REPORTED_HEALTH_PROBLEMS", "REPORTED_PRODUCT_PROBLEMS"]:
                # set() de-duplicates; note the resulting order is arbitrary.
                return self.rename_map[k], list(set(v.split(' / ')))
            else:
                return self.rename_map[k], v

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
def reduce(self, key, values, output):
    """Post-join cleanup: drop join-only keys, rename the rest to the
    openFDA convention, and duplicate selected fields into their *_exact
    companions for exact-match indexing."""
    # There are lot of keys that we do not need once all the joining has been
    # done, so we can now transform the output and remove the join keys and
    # change some of the names to be in line with naming conventions.
    IGNORE = ['key_val', 'address_id', 'address_type_id', 'contact_id',
              'reg_key', 'listing_prop_id', 'listing_prop_name_id',
              'registration_listing_id', 'establishment'
              ]
    RENAME_MAP = {
        'address_line1': 'address_1',
        'address_line2': 'address_2',
        'reg_status_id': 'status_code',
        'state_id': 'state_code'
    }
    EXPANSION_MAP = {
        'establishment_type': 'establishment_type_exact',
        'proprietary_name': 'proprietary_name_exact'
    }

    def _prune(k, v):
        '''
        A helper function used for removing and renaming dictionary keys.
        '''
        if k in IGNORE:
            return None
        if k in RENAME_MAP:
            k = RENAME_MAP[k]
        if k in EXPANSION_MAP:
            # Emit the original pair plus its *_exact duplicate.
            ek, ev = EXPANSION_MAP[k], v
            return [(k, v), (ek, ev)]
        return (k, v)

    key_prefix = 'final_result:' + key
    val = self._join(values)
    if val:
        for i, row in enumerate(val):
            new_key = key_prefix + ':' + str(i)
            new_value = common.transform_dict(row, _prune)
            output.put(new_key, new_value)
def map(self, key, value, output):
    """Reformat date fields to M/D/YYYY; every other pair passes through
    unchanged."""
    # Date fields.
    DATE_KEYS = ['date_performed']

    def _cleaner(k, v):
        '''
        A helper function used for formatting dates.
        '''
        # Idiom fix: identity comparison with None (was `v != None`).
        # (A stale comment claiming None is emitted for null dates was
        # removed — this cleaner never returns None.)
        if v is not None:
            if k in DATE_KEYS:
                v = arrow.get(v).format('M/D/YYYY')
        return (k, v)

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
def map(self, key, value, output):
    """Scrub bad Unicode from values, trim PRODUCTID, and keep only the
    keys present in the rename map."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        # See https://github.com/FDA/openfda-dev/issues/6
        # Handle bad Unicode characters that may come from NDC product.txt.
        # NOTE(review): `unicode` exists only under Python 2; on Python 3
        # this raises NameError — confirm the intended runtime.
        if isinstance(v, str):
            v = unicode(v, 'utf8', 'ignore').encode()
        if k == 'PRODUCTID':
            # Keep only the part after the first underscore.
            v = v.split('_')[1]
        if k in self.rename_map:
            return (self.rename_map[k], v)

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
def reduce(self, key, values, output):
    """Final-join cleanup: drop join-only keys, rename to convention, and
    fan selected fields out into their *_exact companions."""
    IGNORE = [
        "key_val", "address_id", "address_type_id", "contact_id", "reg_key",
        "listing_prop_id", "listing_prop_name_id", "registration_listing_id",
        "establishment",
    ]
    RENAME_MAP = {
        "address_line1": "address_1",
        "address_line2": "address_2",
        "reg_status_id": "status_code",
        "state_id": "state_code",
    }
    EXPANSION_MAP = {"establishment_type": "establishment_type_exact",
                     "proprietary_name": "proprietary_name_exact"}

    def _prune(k, v):
        """Remove join keys, rename, and emit *_exact expansions."""
        if k in IGNORE:
            return None
        k = RENAME_MAP.get(k, k)
        if k in EXPANSION_MAP:
            return [(k, v), (EXPANSION_MAP[k], v)]
        return (k, v)

    rows = self._join(values)
    if rows:
        prefix = "final_result:" + key + ":"
        for idx, row in enumerate(rows):
            output.put(prefix + str(idx), common.transform_dict(row, _prune))
def map(self, key, value, output):
    """Keep only the three renamed recall fields, splitting submission
    numbers into typed lists, and key the output on product_res_number."""
    RENAME_MAP = {
        'rcl products res number': 'product_res_number',
        'rcl firm fei number': 'firm_fei_number',
        'rcl products submission numbers': 'product_submission_numbers'
    }

    def _cleaner(k, v):
        '''
        A helper function that is used for renaming dictionary keys and
        formatting dates.
        '''
        def is_other(data):
            # True when the entry is neither a PMA nor a 510(k) nor a
            # G-prefixed number.
            if not common.get_p_number(data) and \
               not common.get_k_number(data) and \
               not data.startswith('G'):
                return True
            return False

        k = k.lower()
        if k in RENAME_MAP:
            k = RENAME_MAP[k]
        else:
            # Any key not in the rename map is purged from the record.
            return None

        if k == 'product_submission_numbers':
            # We take this single key, split it on space and then return
            # three new keys and not the original.
            submissions = [s for s in v.split(' ') if s]
            k_numbers = [d for d in submissions if common.get_k_number(d)]
            pma_numbers = [
                d for d in submissions if common.get_p_number(d)
            ]
            other = ' '.join(
                [d.strip() for d in submissions if is_other(d)])
            return [('k_numbers', k_numbers), ('pma_numbers', pma_numbers),
                    ('other_submission_description', other)]
        if v != None:
            return (k, v.strip())
        return (k, v)

    new_value = common.transform_dict(value, _cleaner)
    # NOTE(review): raises KeyError if a row lacks
    # 'rcl products res number' — confirm every input row carries it.
    output.add(new_value['product_res_number'], new_value)
def map(self, key, value, output):
    """Validate one pipe-delimited MAUDE row, reject malformed rows to a
    per-file-type reject stream, and emit (file_type, record) keyed by the
    MDR key."""
    if len(value) < 1:
        return

    mdr_key = value[0]

    # Some of the files have headers, we will apply our own, so a row that
    # starts with this value is safe to skip.
    if 'MDR_REPORT_KEY' in mdr_key:
        return

    file_type = [s for s in CATEGORIES if s in self.filename][0]
    # logging.info('file type: %s, file: %s', file_type, self.filename)

    # TODO(hansnelsen): remove once file format is stable
    if file_type == 'mdrfoi':
        value = self.shuffle_mdrfoi_value(value)

    # We send all data anomalies to a reducer for each file type.
    # These non-conforming data are written to a reject file for review.
    # This file type has variable lengths over time, so it needs its own check.
    if file_type == 'foidev':
        if len(value) not in [28, 45]:
            logging.info('Does not conform to foidev structure. Skipping: %s, %s',
                         mdr_key, '#' * 200)
            output.add(file_type,
                       '%s: missing fields' % mdr_key + ':' + '|'.join(value))
            return
    elif len(value) != len(FILE_HEADERS[file_type]):
        logging.info('Does not conform to %s structure. Skipping: %s, %s',
                     file_type, mdr_key, '#' * 200)
        output.add(file_type,
                   '%s: missing fields' % mdr_key + ':' + '|'.join(value))
        return

    # Bug fix: `not isinstance(int(mdr_key), int)` was always False, and
    # int() raised an uncaught ValueError on non-numeric keys — so the
    # reject path below could never run. Catch the ValueError instead.
    try:
        int(mdr_key)
    except ValueError:
        logging.info('%s is not a number', mdr_key)
        output.add(file_type, '%s: NaN' % mdr_key + ':' + '|'.join(value))
        return

    # If it makes it this far, it is a good record
    new_value = dict(zip(FILE_HEADERS[file_type], value))
    new_value = common.transform_dict(new_value, self.cleaner)
    output.add(mdr_key, (file_type, new_value))
def map(self, key, value, output):
    """Scrub bad Unicode, then rename fields, rendering date columns as
    integer strings and title-casing proprietary names."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        # See https://github.com/FDA/openfda-dev/issues/6
        # Handle bad Unicode characters that may come from NDC product.txt.
        # NOTE(review): `unicode` exists only under Python 2; on Python 3
        # this raises NameError — confirm the intended runtime.
        if isinstance(v, str):
            v = unicode(v, 'utf8', 'ignore').encode()
        if k in self.rename_map and v is not None:
            # Date columns arrive as numerics; render as integer strings.
            if "Date" in k:
                return (self.rename_map[k], str(int(v)))
            # The else below pairs with this if; other keys take the
            # plain-rename branch.
            if "Proprietary Name" in k:
                return (self.rename_map[k], v.title())
            else:
                return (self.rename_map[k], v)

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
def map(self, key, value, output):
    """Strip/rename fields, drop internal notes, and assemble the
    canonical application_number from its type and number parts."""
    def _cleaner(k, v):
        '''
        Helper function to rename keys and purge any keys that are not in
        the map.
        '''
        v = v.strip() if isinstance(v, str) else v
        if k in self.rename_map and v is not None and v != '':
            return (self.rename_map[k], v)

    json = common.transform_dict(value, _cleaner)

    # Internal notes are not published.
    # Idiom fix: identity comparison with None (was `!= None`).
    if json.get('application_public_notes') is not None:
        del json['application_public_notes']

    # application_number is the type prefix concatenated with the number.
    if json.get('application_no') and json.get('application_type'):
        json['application_number'] = json.get('application_type') + json.get('application_no')
        del json['application_type']
        del json['application_no']

    output.add(key, json)
def map(self, key, value, output):
    """Strip and filter fields, resolve the marketing status id, and emit
    the record keyed by its drugs@FDA application number."""
    def _cleaner(k, v):
        '''Strip strings; keep only renamed keys with non-empty values.'''
        if isinstance(v, str):
            v = v.strip()
        if k in self.rename_map and v is not None and v != '':
            return (self.rename_map[k], v)

    record = common.transform_dict(value, _cleaner)

    # Replace the numeric marketing status id with its looked-up value.
    if record.get('marketing_status_id'):
        record['marketing_status'] = self.doc_lookup[record['marketing_status_id']]
        del record['marketing_status_id']

    # All three drugs@FDA files join on the application number, so key on it.
    key = build_products_key(record['application_number'], record)
    del record['application_number'], record['product_number']
    output.add(key, record)
def handle_device(_, xml):
    """Transform the dictionary, which follows the structure of UDI XML files,
    to a new one that matches the UDI mapping in ES.
    """
    try:
        device = common.transform_dict(xml, transformer_fn)

        # Additional transformation of the dictionary not handled by the
        # transformer_fn function.
        if "id" in device:
            del device["id"]

        # Flatten each device size's nested value dict into flat
        # value/unit fields.
        if "device_sizes" in device:
            for size in device["device_sizes"]:
                if "value" in size:
                    value_dict = size["value"]
                    del size["value"]
                    if "value" in value_dict:
                        size["value"] = value_dict["value"]
                    if "unit" in value_dict:
                        size["unit"] = value_dict["unit"]

        # print(json.dumps(device, indent=4 * ' '))

        # @id will be a concatenation of primary identifier's issuer with the
        # ID number itself.
        for identifier, idenList in iter(xml["identifiers"].items()):
            if type(idenList) == type([]):
                for iden in idenList:
                    if 'deviceIdType' in iden and 'deviceIdIssuingAgency' in iden and 'Primary' == iden[
                            'deviceIdType']:
                        # The guard above already requires the agency key, so
                        # the '' branch of this conditional is effectively dead.
                        device["@id"] = (('' if 'deviceIdIssuingAgency' not in iden else iden['deviceIdIssuingAgency'] + "_") + \
                                         iden["deviceId"]).lower()

        # NOTE(review): if no Primary identifier was found, device["@id"]
        # raises KeyError here, which is logged and re-raised below —
        # confirm every record carries a primary identifier.
        map_output.add(device["@id"], device)
        return True
    except Exception:
        # Log the offending record and full traceback, then propagate.
        logging.error(xml)
        traceback.print_exc()
        logging.error(sys.exc_info()[0])
        raise
def handle_device(_, xml):
    """Transform the dictionary, which follows the structure of UDI XML files,
    to a new one that matches the UDI mapping in ES.
    """
    try:
        device = common.transform_dict(xml, transformer_fn)

        # Additional transformation of the dictionary not handled by the
        # transformer_fn function.
        if "id" in device:
            del device["id"]

        # Flatten each device size's nested value dict into flat
        # value/unit fields.
        if "device_sizes" in device:
            for size in device["device_sizes"]:
                if "value" in size:
                    value_dict = size["value"]
                    del size["value"]
                    if "value" in value_dict:
                        size["value"] = value_dict["value"]
                    if "unit" in value_dict:
                        size["unit"] = value_dict["unit"]

        # print(json.dumps(device, indent=4 * ' '))

        # @id will be a concatenation of primary identifier's issuer with the
        # ID number itself.
        # NOTE(review): .iteritems() is Python 2 only — on Python 3 this
        # raises AttributeError; confirm the intended runtime.
        for identifier, idenList in xml["identifiers"].iteritems():
            if type(idenList) == type([]):
                for iden in idenList:
                    if 'deviceIdType' in iden and 'deviceIdIssuingAgency' in iden and 'Primary' == iden['deviceIdType']:
                        # The guard above already requires the agency key, so
                        # the '' branch of this conditional is effectively dead.
                        device["@id"] = (('' if 'deviceIdIssuingAgency' not in iden else iden['deviceIdIssuingAgency'] + "_") + \
                                         iden["deviceId"]).lower()

        # NOTE(review): if no Primary identifier was found, device["@id"]
        # raises KeyError here, which is logged and re-raised below —
        # confirm every record carries a primary identifier.
        map_output.add(device["@id"], device)
        return True
    except Exception:
        # Log the offending record and full traceback, then propagate.
        logging.error(xml)
        traceback.print_exc()
        logging.error(sys.exc_info()[0])
        raise
def map(self, key, value, output):
    """Clean the row with the instance cleaner and re-key it by its
    report number."""
    cleaned = common.transform_dict(value, self.cleaner)
    output.add(cleaned['report_number'], cleaned)
def transform_device_pma(csv_dict):
    '''Apply the module-level _cleaner to one device PMA CSV row.'''
    return common.transform_dict(csv_dict, _cleaner)
def transform_device_clearance(csv_dict):
    '''Apply the module-level _cleaner to one device 510(k) clearance CSV row.'''
    return common.transform_dict(csv_dict, _cleaner)
def omit_internal_keys(data):
    '''
    Cleaner function to pass to the dump_index command and is used as a
    json.load(..., object_hook=omit_internal_keys).
    '''
    # Delegates the per-key filtering to the module-level basic_cleaner.
    return common.transform_dict(data, basic_cleaner)
def map(self, key, value, output):
    """Scrape one recall record page, merge in fields from the legacy CSV
    database, clean the result, and emit it."""
    def _cleaner(k, v):
        # Drop null values entirely.
        if v == None:
            return None
        if k == 'recall_status':
            # A terminated status embeds the termination date; split it into
            # the status plus an event_date_terminated field.
            fullmatch = self.TERMINATED_RE.fullmatch(v)
            if fullmatch:
                return [('recall_status', 'Terminated'),
                        ('event_date_terminated',
                         self.reformat_date(fullmatch.group(1)))]
        if k == 'recalling_firm':
            return parse_address(v)
        return (k, v)

    def parse_address(addr):
        '''
        Attempt to parse the Recalling Firm/Manufacturer piece into firm name
        and address components using usaddress library. If unable to parse,
        stuff everything back into the 'recalling_firm' element.
        '''
        def _concat(addr, part_keys):
            # Join whichever of the tagged address parts are present.
            parts = []
            for key in part_keys:
                if addr.get(key) is not None:
                    parts.append(addr.get(key))
            return ' '.join(parts)

        # Only multi-line values look like full addresses worth parsing.
        if len(addr.splitlines()) >= 2:
            try:
                tagged = usaddress.tag(addr.replace('\n', ' '))
                # Keep only components that parsed to a non-empty value.
                return [
                    x for x in
                    [('recalling_firm', tagged[0].get('Recipient')),
                     ('address_1', _concat(tagged[0], [
                         'AddressNumberPrefix', 'AddressNumber',
                         'AddressNumberSuffix', 'StreetNamePreModifier',
                         'StreetNamePreDirectional', 'StreetNamePreType',
                         'StreetName', 'StreetNamePostType',
                         'StreetNamePostDirectional', 'USPSBoxType',
                         'USPSBoxID', 'USPSBoxGroupType', 'USPSBoxGroupID'
                     ])),
                     ('address_2', _concat(tagged[0], [
                         'SubaddressType', 'SubaddressIdentifier',
                         'BuildingName', 'OccupancyType',
                         'OccupancyIdentifier', 'CornerOf', 'LandmarkName',
                         'IntersectionSeparator', 'NotAddress'
                     ])),
                     ('city', tagged[0].get('PlaceName')),
                     ('state', tagged[0].get('StateName')),
                     ('postal_code', tagged[0].get('ZipCode')),
                     ('country', tagged[0].get('CountryName'))]
                    if x[1] is not None and x[1] != ''
                ]
            except usaddress.RepeatedLabelError:
                # Ambiguous address — fall through to the raw value.
                pass

        return 'recalling_firm', addr

    id = value['id']
    recall = {}
    url = RECALL_RECORD_DOWNLOAD_URL % (id)
    soup = soup_with_retry(url)

    recall['@id'] = id
    recall_number = self.field_val_str(soup, 'Recall Number')
    recall['product_res_number'] = recall_number
    recall['event_date_initiated'] = self.field_val_date(
        soup, 'Date Initiated by Firm')
    recall['event_date_created'] = self.field_val_date(soup, 'Create Date')
    recall['event_date_posted'] = self.field_val_date(soup, 'Date Posted')
    recall['recall_status'] = self.field_val_str(soup, 'Recall Status1')
    recall['res_event_number'] = self.field_val_str(
        soup, 'Recall Event ID')
    product_code = self.field_val_str(soup, 'Product Classification')
    if product_code:
        # 'ooo' is the site's placeholder for a missing product code.
        recall['product_code'] = re.search(r'Product Code\s+(\S+)',
                                           product_code).group(1).replace(
                                               'ooo', 'N/A')
    recall['k_numbers'] = self.field_val_array(soup, '510(K)Number')
    recall['pma_numbers'] = self.field_val_array(soup, 'PMA Number')
    recall['product_description'] = self.field_val_str(soup, 'Product')
    recall['code_info'] = self.field_val_str(soup, 'Code Information')
    recall['recalling_firm'] = self.field_val_str(
        soup, 'Recalling Firm/Manufacturer')
    recall['additional_info_contact'] = self.field_val_str(
        soup, 'For Additional Information Contact')
    # The odd field labels below ('Reasonfor', 'DeterminedCause') mirror the
    # scraped page markup exactly — do not "fix" them.
    recall['reason_for_recall'] = self.field_val_str(
        soup, 'Manufacturer Reasonfor Recall')
    recall['root_cause_description'] = self.field_val_str(
        soup, 'FDA DeterminedCause 2')
    recall['action'] = self.field_val_str(soup, 'Action')
    recall['product_quantity'] = self.field_val_str(
        soup, 'Quantity in Commerce')
    recall['distribution_pattern'] = self.field_val_str(
        soup, 'Distribution')

    # Add data from the "old" CSV file.
    csv_record = self.csv2json_db.get(recall_number, None)
    if csv_record is not None:
        if len(csv_record['firm_fei_number']) > 1:
            recall['firm_fei_number'] = csv_record['firm_fei_number']
        if len(csv_record['other_submission_description']) > 0:
            recall['other_submission_description'] = csv_record[
                'other_submission_description']
        # Scraped values win; fall back to the CSV only when absent.
        if recall.get('k_numbers') is None and len(
                csv_record.get('k_numbers')) > 0:
            recall['k_numbers'] = csv_record['k_numbers']
        if recall.get('pma_numbers') is None and len(
                csv_record.get('pma_numbers')) > 0:
            recall['pma_numbers'] = csv_record['pma_numbers']

    xformed_recall = common.transform_dict(recall, _cleaner)
    output.add(key, xformed_recall)