Example #1
 def test_extract_ndc_5_4_2(self):
   expected_ndc = '00591-0369-01'
   xml = open_data_file('ndc-5-4-2.xml')
   xml_dict = xmltodict.parse(xml)
   extracted_ndcs = extract.extract_ndc_from_recall(xml_dict['recall-number'])
   self.assertEqual(expected_ndc, extracted_ndcs[0],
                    'ndc-5-4-2.xml')
Example #2
 def test_extract_ndc_5_3_2(self):
   expected_ndc = '10631-106-08'
   xml = open_data_file('ndc-5-3-2.xml')
   xml_dict = xmltodict.parse(xml)
   extracted_ndcs = extract.extract_ndc_from_recall(xml_dict['recall-number'])
   self.assertEqual(expected_ndc, extracted_ndcs[0],
                    'ndc-5-3-2.xml')
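
Both tests rely on an open_data_file helper that these examples do not show. A minimal sketch of what it might look like, assuming the XML fixtures live in a data/ directory next to the test module (the directory layout is an assumption; only the helper name comes from the tests):

import os

def open_data_file(filename):
  # Assumption: fixtures such as ndc-5-4-2.xml sit in a data/ directory
  # beside this test module.
  data_dir = os.path.join(os.path.dirname(__file__), 'data')
  with open(os.path.join(data_dir, filename), 'rb') as f:
    return f.read()

xmltodict.parse() accepts the returned bytes directly, so the tests can feed the file contents straight into the parser.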
Example #3
  def map(self, key, value, output):
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = [
      'code-info',
      'report-date',
      'product-description'
    ]

    val = json.loads(value)

    if val['product-type'] == 'Drugs':
      val['upc'] = extract.extract_upc_from_recall(val)
      val['ndc'] = extract.extract_ndc_from_recall(val)

    # There is no good ID for the report, so we need to make one
    doc_id = self._hash(json.dumps(val, sort_keys=True))
    val['@id'] = doc_id
    val['@version'] = 1

    # Only write out vals that have required keys and a meaningful date
    if set(logic_keys).issubset(val) and val['report-date'] is not None:
      output.add(key, val)
    else:
      logging.warning('Document is missing required fields. %s',
                      json.dumps(val, indent=2, sort_keys=True))
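
The _hash helper that mints the document ID is not shown. One plausible implementation on the mapper class, assuming any stable digest of the sorted-key JSON will do (the choice of sha256 is an assumption, not something the examples confirm):

import hashlib

def _hash(self, serialized_doc):
  # Identical records serialize identically (sort_keys=True), so they
  # receive the same ID; the specific digest algorithm is an assumption.
  return hashlib.sha256(serialized_doc.encode('utf-8')).hexdigest()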
Example #4
  def run(self):
    output_dir = self.output().path
    os.makedirs(output_dir, exist_ok=True)
    current_event_files = glob.glob(self.input()[0].path + '/*.json')
    historic_event_files = glob.glob(self.input()[1].path + '/*.json')
    all_files = current_event_files + historic_event_files
    output_file = self.output().path + '/all_res.json'
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = [
      'code-info',
      'report-date',
      'product-description'
    ]

    with open(output_file, 'w') as out:
      for filename in all_files:
        with open(filename, 'r') as json_file:
          for row in json_file:
            record = json.loads(row)
            if record['product-type'] == 'Drugs':
              record['upc'] = extract.extract_upc_from_recall(record)
              record['ndc'] = extract.extract_ndc_from_recall(record)
            # Only write out records that have required keys and a meaningful date
            if set(logic_keys).issubset(record) and record['report-date'] is not None:
              out.write(json.dumps(record) + '\n')
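
The required-key test works because iterating a dict yields its keys, so issubset() against the record checks key membership. A quick illustration:

record = {'code-info': 'A', 'report-date': '20120321', 'product-description': 'B'}
print(set(['code-info', 'report-date']).issubset(record))   # True
print(set(['code-info', 'missing-key']).issubset(record))   # False

The check also short-circuits: record['report-date'] is only read once issubset() has confirmed the key exists.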
Example #5
    def map(self, key, value, output):
        def cleaner(k, v):
            if k in RENAME_MAP:
                k = RENAME_MAP[k]

            if k in DATE_KEYS:
                if not v:
                    return None
                v = arrow.get(v, 'MM/DD/YYYY').format('YYYYMMDD')

            return (k, v)

        val = common.transform_dict(value, cleaner)

        # These keys must exist in the JSON records for the annotation logic to work
        logic_keys = ['code-info', 'report-date', 'product-description']

        if val.get('product-type') == 'Drugs':
            val['upc'] = extract.extract_upc_from_recall(val)
            val['ndc'] = extract.extract_ndc_from_recall(val)

        # There is no good ID for the report, so we need to make one
        doc_id = self._hash(json.dumps(val, sort_keys=True))
        val['@id'] = doc_id
        val['@version'] = 1

        # Only write out vals that have required keys and a meaningful date
        if set(logic_keys).issubset(val) and val['report-date'] is not None:
            output.add(doc_id, val)
        else:
            logging.warning('Document is missing required fields. %s',
                            json.dumps(val, indent=2, sort_keys=True))
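
common.transform_dict is not shown in these examples. Judging from the cleaner above, it walks the record, applies the callback to each key/value pair, and drops any pair for which the callback returns None (that is how empty date values disappear before the report-date check). A minimal sketch under those assumptions, ignoring lists for brevity:

def transform_dict(coll, cleaner):
    # Assumed contract: cleaner(k, v) returns a (new_key, new_value)
    # pair, or None to drop the pair entirely.
    result = {}
    for k, v in coll.items():
        if isinstance(v, dict):
            v = transform_dict(v, cleaner)
        cleaned = cleaner(k, v)
        if cleaned is not None:
            new_k, new_v = cleaned
            result[new_k] = new_v
    return result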
Example #6
    def run(self):
        output_dir = self.output().path
        os.makedirs(output_dir, exist_ok=True)
        current_event_files = glob.glob(self.input()[0].path + '/*.json')
        historic_event_files = glob.glob(self.input()[1].path + '/*.json')
        all_files = current_event_files + historic_event_files
        output_file = self.output().path + '/all_res.json'
        # These keys must exist in the JSON records for the annotation logic to work
        logic_keys = ['code-info', 'report-date', 'product-description']

        with open(output_file, 'w') as out:
            for filename in all_files:
                with open(filename, 'r') as json_file:
                    for row in json_file:
                        record = json.loads(row)
                        if record['product-type'] == 'Drugs':
                            record['upc'] = extract.extract_upc_from_recall(record)
                            record['ndc'] = extract.extract_ndc_from_recall(record)
                        # Only write out records that have required keys and a meaningful date
                        if set(logic_keys).issubset(record) and record['report-date'] is not None:
                            out.write(json.dumps(record) + '\n')
Example #7
    def map(self, key, value, output):
        # These keys must exist in the JSON records for the annotation logic to work
        logic_keys = ['code-info', 'report-date', 'product-description']

        for val in value['recall-number']:
            if val['product-type'] == 'Drugs':
                val['upc'] = extract.extract_upc_from_recall(val)
                val['ndc'] = extract.extract_ndc_from_recall(val)

            # Copy the recall-number attribute value to an actual field
            # The recall-number is not a reliable id, since it repeats
            val['recall-number'] = val['@id']

            # There is no good ID for the report, so we need to make one
            doc_id = self._hash(json.dumps(val, sort_keys=True))
            val['@id'] = doc_id
            val['@version'] = 1

            # Only write out vals that have required keys and a meaningful date
            if set(logic_keys).issubset(val) and val['report-date'] is not None:
                output.add(doc_id, val)
            else:
                logging.warning('Document is missing required fields. %s',
                                json.dumps(val, indent=2, sort_keys=True))
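
The '@id' read in Example #7 follows xmltodict's default convention of prefixing XML attributes with '@', which is how the recall-number element's attribute survives parsing. A small illustration (the element name mirrors the examples; the attribute value is made up):

import xmltodict

doc = xmltodict.parse('<recall-number id="D-0001-2012"/>')
print(doc['recall-number']['@id'])  # D-0001-2012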