def test_extract_ndc_5_4_2(self):
    """An NDC in 5-4-2 format is extracted from the recall XML fixture."""
    source_file = 'ndc-5-4-2.xml'
    parsed = xmltodict.parse(open_data_file(source_file))
    ndcs = extract.extract_ndc_from_recall(parsed['recall-number'])
    self.assertEqual('00591-0369-01', ndcs[0], source_file)
def test_extract_ndc_5_3_2(self):
    """An NDC in 5-3-2 format is extracted from the recall XML fixture."""
    source_file = 'ndc-5-3-2.xml'
    parsed = xmltodict.parse(open_data_file(source_file))
    ndcs = extract.extract_ndc_from_recall(parsed['recall-number'])
    self.assertEqual('10631-106-08', ndcs[0], source_file)
def map(self, key, value, output):
    """Annotate a recall JSON record and emit it keyed by the input key.

    Parses `value` as JSON, adds UPC/NDC annotations for drug records,
    assigns a content-hash `@id` and `@version`, and emits the record only
    when it carries the fields the downstream annotation logic requires.

    Args:
      key: mapper input key, passed through to `output.add`.
      value: JSON-encoded recall record (string).
      output: collector with an `add(key, value)` method.
    """
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    val = json.loads(value)
    # .get() avoids a KeyError on records that lack 'product-type'
    # (matches the sibling mapper that already guards this lookup).
    if val.get('product-type') == 'Drugs':
        val['upc'] = extract.extract_upc_from_recall(val)
        val['ndc'] = extract.extract_ndc_from_recall(val)

    # There is no good ID for the report, so we need to make one by
    # hashing the canonical (sorted-key) serialization of the record.
    doc_id = self._hash(json.dumps(val, sort_keys=True))
    val['@id'] = doc_id
    val['@version'] = 1

    # Only write out vals that have required keys and a meaningful date
    if set(logic_keys).issubset(val) and val['report-date'] is not None:
        output.add(key, val)
    else:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning('Document is missing required fields. %s',
                        json.dumps(val, indent=2, sort_keys=True))
def run(self):
    """Merge current and historic recall event files into one JSON-lines file.

    Reads every `*.json` file from both inputs, annotates drug records with
    UPC and NDC codes, and writes qualifying records to
    `<output>/all_res.json`, one JSON object per line.
    """
    output_dir = self.output().path
    # Create the output directory in-process instead of shelling out
    # to `mkdir -p`; the existence guard keeps this idempotent.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    current_event_files = glob.glob(self.input()[0].path + '/*.json')
    historic_event_files = glob.glob(self.input()[1].path + '/*.json')
    all_files = current_event_files + historic_event_files

    output_file = self.output().path + '/all_res.json'

    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    # `with` guarantees the output handle is flushed and closed even if a
    # record fails to parse (the original leaked both file handles).
    with open(output_file, 'w') as out:
        for filename in all_files:
            with open(filename, 'r') as json_file:
                for row in json_file:
                    record = json.loads(row)
                    if record['product-type'] == 'Drugs':
                        record['upc'] = extract.extract_upc_from_recall(record)
                        record['ndc'] = extract.extract_ndc_from_recall(record)
                    # Only write out records that have required keys
                    # and a meaningful date
                    if (set(logic_keys).issubset(record) and
                            record['report-date'] is not None):
                        out.write(json.dumps(record) + '\n')
def map(self, key, value, output):
    """Normalize a recall record, annotate drug entries, and emit it by hash ID.

    Renames fields via RENAME_MAP, reformats date fields to YYYYMMDD, adds
    UPC/NDC annotations for drug records, assigns a content-hash `@id` and
    `@version`, and emits the record only when the required fields exist.

    Args:
      key: mapper input key (unused; output is keyed by the generated doc_id).
      value: dict-like recall record.
      output: collector with an `add(key, value)` method.
    """
    def cleaner(k, v):
        # Rename legacy field names and convert date strings to YYYYMMDD;
        # empty date values are dropped entirely (return None).
        if k in RENAME_MAP:
            k = RENAME_MAP[k]
        if k in DATE_KEYS:
            if not v:
                return None
            v = arrow.get(v, 'MM/DD/YYYY').format('YYYYMMDD')
        return (k, v)

    val = common.transform_dict(value, cleaner)

    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    if val.get('product-type') == 'Drugs':
        val['upc'] = extract.extract_upc_from_recall(val)
        val['ndc'] = extract.extract_ndc_from_recall(val)

    # There is not a decent ID for the report, so we need to make one by
    # hashing the canonical (sorted-key) serialization of the record.
    doc_id = self._hash(json.dumps(val, sort_keys=True))
    val['@id'] = doc_id
    val['@version'] = 1

    # Only write out vals that have required keys and a meaningful date
    if set(logic_keys).issubset(val) and val['report-date'] is not None:
        output.add(doc_id, val)
    else:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning('Document is missing required fields. %s',
                        json.dumps(val, indent=2, sort_keys=True))
def run(self):
    """Merge current and historic recall event files into one JSON-lines file.

    Reads every `*.json` file from both inputs, annotates drug records with
    UPC and NDC codes, and writes qualifying records to
    `<output>/all_res.json`, one JSON object per line.
    """
    output_dir = self.output().path
    # Create the output directory in-process instead of shelling out
    # to `mkdir -p`; the existence guard keeps this idempotent.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    current_event_files = glob.glob(self.input()[0].path + '/*.json')
    historic_event_files = glob.glob(self.input()[1].path + '/*.json')
    all_files = current_event_files + historic_event_files

    output_file = self.output().path + '/all_res.json'

    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    # `with` guarantees the output handle is flushed and closed even if a
    # record fails to parse (the original leaked both file handles).
    with open(output_file, 'w') as out:
        for filename in all_files:
            with open(filename, 'r') as json_file:
                for row in json_file:
                    record = json.loads(row)
                    if record['product-type'] == 'Drugs':
                        record['upc'] = extract.extract_upc_from_recall(record)
                        record['ndc'] = extract.extract_ndc_from_recall(record)
                    # Only write out records that have required keys
                    # and a meaningful date
                    if (set(logic_keys).issubset(record) and
                            record['report-date'] is not None):
                        out.write(json.dumps(record) + '\n')
def map(self, key, value, output):
    """Annotate each recall in a record's `recall-number` list and emit by hash ID.

    For each recall entry: adds UPC/NDC annotations for drug records, copies
    the `@id` attribute into a `recall-number` field (the attribute repeats
    and is not a reliable ID), assigns a content-hash `@id` and `@version`,
    and emits the entry only when the required fields exist.

    Args:
      key: mapper input key (unused; output is keyed by the generated doc_id).
      value: dict with a `recall-number` list of recall entries.
      output: collector with an `add(key, value)` method.
    """
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    for val in value['recall-number']:
        if val['product-type'] == 'Drugs':
            val['upc'] = extract.extract_upc_from_recall(val)
            val['ndc'] = extract.extract_ndc_from_recall(val)

        # Copy the recall-number attribute value to an actual field.
        # The recall-number is not a reliable id, since it repeats.
        val['recall-number'] = val['@id']

        # There is no good ID for the report, so we need to make one by
        # hashing the canonical (sorted-key) serialization of the record.
        doc_id = self._hash(json.dumps(val, sort_keys=True))
        val['@id'] = doc_id
        val['@version'] = 1

        # Only write out vals that have required keys and a meaningful date
        if set(logic_keys).issubset(val) and val['report-date'] is not None:
            output.add(doc_id, val)
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning('Document is missing required fields. %s',
                            json.dumps(val, indent=2, sort_keys=True))