예제 #1
0
    def map(self, key, value, output):
        ''' Normalize one document row and emit it under its join key.

            Fields are renamed through ``self.rename_map``, the document type
            id is resolved through ``self.doc_lookup``, the date is rewritten
            as yyyymmdd and the record is added to ``output`` under the key
            produced by ``build_submissions_key``.
        '''
        def _rename_or_drop(field, raw):
            ''' Return ``(new_name, value)`` for a field worth keeping;
                return None for unmapped or empty fields.
            '''
            cleaned = raw.strip() if isinstance(raw, str) else raw
            if field not in self.rename_map:
                return None
            if cleaned is None or cleaned == '':
                return None
            target = self.rename_map[field]
            # A title that is exactly '0' is discarded.
            if target == 'title' and cleaned == '0':
                return None
            return (target, cleaned)

        doc = common.transform_dict(value, _rename_or_drop)

        # Replace the type id with the entry it maps to in doc_lookup.
        doc['type'] = self.doc_lookup[doc['type_id']]
        del doc['type_id']

        # Convert date to format used throughout openFDA (yyyymmdd)
        parsed_date = arrow.get(doc['date'])
        doc['date'] = parsed_date.strftime("%Y%m%d")
        doc['url'] = common.convert_unicode(doc['url'])

        # The application number is the key shared by all three drugs@FDA
        # files; build the key before the join fields are removed.
        key = build_submissions_key(doc['application_number'], doc)
        for join_field in ('application_number', 'submission_number',
                           'submission_type'):
            del doc[join_field]

        output.add(key, doc)
예제 #2
0
파일: pipeline.py 프로젝트: ColMac/openfda
  def map(self, key, value, output):
    ''' Drop, rename and enrich the fields of one record, then emit the
        result under the incoming key.
    '''
    # Source columns that are not carried over to the output.
    skipped = ('physicalstate', 'technicalmethod', 'targetarea')

    # Source column name -> name following the openFDA naming standard.
    standard_names = {
      'productcode': 'product_code',
      'reviewcode': 'review_code',
      'regulationnumber': 'regulation_number',
      'devicename': 'device_name',
      'medicalspecialty': 'medical_specialty',
      'thirdpartyflag': 'third_party_flag',
      'gmpexemptflag': 'gmp_exempt_flag',
      'deviceclass': 'device_class'
    }

    specialty_lookup = device_common.MED_SPECIALTY_ADVISORY_COMMITTEE

    def _normalize(name, val):
      ''' Return None for skipped columns, a single (key, value) pair for
          ordinary columns, and two pairs for the medical specialty column
          (the value itself plus its looked-up description).
      '''
      name = name.lower()
      if name in skipped:
        return None
      name = standard_names.get(name, name)
      if name != 'medical_specialty':
        return (name, val)
      description = specialty_lookup[val]
      return [(name, val), ('medical_specialty_description', description)]

    output.add(key, common.transform_dict(value, _normalize))
예제 #3
0
    def map(self, key, value, output):
        ''' Clean up the field names of one record, attach the medical
            specialty description, and emit the record under ``key``.
        '''
        # Lower-cased source columns that are left out of the output.
        dropped = ('physicalstate', 'technicalmethod', 'targetarea')

        # key = source name, value = openFDA standard name
        canonical = {
            'productcode': 'product_code',
            'reviewcode': 'review_code',
            'regulationnumber': 'regulation_number',
            'devicename': 'device_name',
            'medicalspecialty': 'medical_specialty',
            'thirdpartyflag': 'third_party_flag',
            'gmpexemptflag': 'gmp_exempt_flag',
            'deviceclass': 'device_class'
        }

        specialty_names = device_common.MED_SPECIALTY_ADVISORY_COMMITTEE

        def _tidy(field, val):
            ''' Remove, rename or expand a single key/value pair. '''
            field = field.lower()
            if field in dropped:
                return None
            if field in canonical:
                field = canonical[field]
            if field == 'medical_specialty':
                # Emit the code and, alongside it, its description.
                extra = ('medical_specialty_description',
                         specialty_names[val])
                return [(field, val), extra]
            return (field, val)

        cleaned = common.transform_dict(value, _tidy)
        output.add(key, cleaned)
예제 #4
0
    def reduce(self, key, values, output):
        ''' Join the values collected for ``key`` and write each joined row,
            stripped of join-only fields, under ``final_result:<key>:<index>``.
        '''
        # Keys that are only needed for joining and are removed from the
        # final output.
        join_only = (
            'key_val', 'address_id', 'address_type_id', 'contact_id',
            'reg_key', 'listing_prop_id', 'listing_prop_name_id',
            'registration_listing_id', 'establishment'
        )

        # Names changed to be in line with naming conventions.
        renamed = {
            'address_line1': 'address_1',
            'address_line2': 'address_2',
            'reg_status_id': 'status_code',
            'state_id': 'state_code'
        }

        def _prune(name, val):
            ''' Drop join-only keys and rename the keys listed in ``renamed``. '''
            if name in join_only:
                return None
            return (renamed.get(name, name), val)

        prefix = 'final_result:' + key
        joined = self._join(values)
        if not joined:
            return
        for index, row in enumerate(joined):
            row_key = prefix + ':' + str(index)
            output.put(row_key, common.transform_dict(row, _prune))
예제 #5
0
    def map(self, key, value, output):
        ''' Normalize one product row and emit it under its join key.

            Fields are renamed through ``self.rename_map`` (unmapped and empty
            fields are purged), values are translated through
            ``self.VALUE_MAPPINGS``, active ingredients become a list of
            objects, ``df_and_route`` is split into ``dosage_form`` and
            ``route``, and the record is added to ``output`` under the key
            produced by ``build_products_key``.

            Raises KeyError when the row has no ``application_number``.
        '''
        def _cleaner(k, v):
            ''' Helper function to rename keys and purge any keys that are not
                in the map.
            '''
            v = v.strip() if isinstance(v, str) else v
            if k in self.rename_map and v is not None and v != '':
                new_key = self.rename_map[k]
                mapping = self.VALUE_MAPPINGS.get(new_key) \
                    if new_key in self.VALUE_MAPPINGS else None
                if mapping is not None and v in mapping:
                    v = mapping[v]
                return (new_key, v)

        def _split_list(text):
            ''' Split a ';'-separated string, dropping whitespace that directly
                follows each separator.
            '''
            # Raw string: '\s' is a regex class, not a string escape.
            return re.sub(r';\s+', ';', text).split(';')

        doc = common.transform_dict(value, _cleaner)

        # Turn active ingredients into an array of objects as per the mapping.
        if doc.get('active_ingredients'):
            names = _split_list(doc['active_ingredients'])
            strengths = _split_list(
                doc['strength']) if doc.get('strength') else []

            ingredients = []
            for idx, name in enumerate(names):
                ingredient = {'name': name}
                if len(strengths) > idx:
                    ingredient['strength'] = strengths[idx]
                ingredients.append(ingredient)
            doc['active_ingredients'] = ingredients
        elif doc.get('active_ingredients') is not None:
            # Delete to avoid complaints from Elasticsearch.
            del doc['active_ingredients']

        if doc.get('strength') is not None:
            del doc['strength']

        # Split dosage and form into two distinct fields. The source field is
        # removed either way; it may be absent altogether because the cleaner
        # purges empty values, so it is popped with a default instead of
        # indexed.
        df_and_route = doc.pop('df_and_route', None)
        parts = df_and_route.split(';') if df_and_route else []
        if len(parts) == 2:
            doc['dosage_form'] = parts[0].strip()
            doc['route'] = parts[1].strip()
        # Sometimes the entire entry is Unknown. Indicate this for both df & route.
        elif df_and_route and "UNKNOWN" in df_and_route:
            doc['dosage_form'] = df_and_route
            doc['route'] = df_and_route
        # Sometimes the entry only contains the dosage form (or nothing).
        else:
            doc['dosage_form'] = df_and_route
            doc['route'] = None

        # Assign application number as the key, since all three drugs@FDA files can be joined by this key.
        key = build_products_key(doc['application_number'], doc)
        del doc['application_number']

        output.add(key, doc)
예제 #6
0
    def map(self, key, value, output):
        ''' Clean one recall record, annotate it, give it a generated id and
            emit it when it carries the fields the annotation logic needs.

            Keys are renamed through ``RENAME_MAP`` and the ``DATE_KEYS``
            fields are rewritten from MM/DD/YYYY to YYYYMMDD (empty dates are
            dropped). Records of product type 'Drugs' get ``upc`` and ``ndc``
            fields. Records missing a required key or without a report date
            are logged instead of written.
        '''
        def cleaner(k, v):
            ''' Rename a key and reformat date values; None drops the pair. '''
            k = RENAME_MAP.get(k, k)

            if k in DATE_KEYS:
                if not v:
                    return None
                v = arrow.get(v, 'MM/DD/YYYY').format('YYYYMMDD')

            return (k, v)

        val = common.transform_dict(value, cleaner)

        # These keys must exist in the JSON records for the annotation logic to work
        logic_keys = ['code-info', 'report-date', 'product-description']

        if val.get('product-type') == 'Drugs':
            val['upc'] = extract.extract_upc_from_recall(val)
            val['ndc'] = extract.extract_ndc_from_recall(val)

        # There is not a decent ID for the report, so we need to make one.
        # The hash is taken before '@id' and '@version' are added.
        doc_id = self._hash(json.dumps(val, sort_keys=True))
        val['@id'] = doc_id
        val['@version'] = 1

        # Only write out vals that have required keys and a meaningful date
        if set(logic_keys).issubset(val) and val['report-date'] is not None:
            output.add(doc_id, val)
        else:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling.
            logging.warning('Docuemnt is missing required fields. %s',
                            json.dumps(val, indent=2, sort_keys=True))
예제 #7
0
    def map(self, key, value, output):
        ''' Normalize one submission row and emit it under its join key.

            Fields are renamed through ``self.rename_map``, the submission
            class code id is expanded through ``self.doc_lookup`` into a code
            and an optional description, the status date is rewritten as
            yyyymmdd, and the record is added to ``output`` under the key
            produced by ``build_submissions_key``.
        '''
        def _rename_or_drop(field, raw):
            ''' Return ``(new_name, value)`` for mapped, non-empty fields;
                return None for everything else.
            '''
            if isinstance(raw, str):
                raw = common.convert_unicode(raw.strip())
            if field not in self.rename_map:
                return None
            if raw is None or raw == '':
                return None
            return (self.rename_map[field], raw)

        doc = common.transform_dict(value, _rename_or_drop)

        class_code_id = doc.get('submission_class_code_id')
        if class_code_id:
            # Element 0 of the lookup entry is the code, element 1 its
            # description.
            doc['submission_class_code'] = self.doc_lookup[class_code_id][0]
            description = self.doc_lookup[class_code_id][1].rstrip()
            if description:
                doc['submission_class_code_description'] = description
            del doc['submission_class_code_id']

        # Convert date to format used throughout openFDA (yyyymmdd)
        status_date = doc.get('submission_status_date')
        if status_date:
            doc['submission_status_date'] = arrow.get(status_date).strftime(
                "%Y%m%d")

        # The application number is the key shared by all three drugs@FDA
        # files; it is removed from the record once the key is built.
        key = build_submissions_key(doc['application_number'], doc)
        del doc['application_number']

        output.add(key, doc)
예제 #8
0
    def map(self, key, value, output):
        ''' Rename the fields of one recall row, split the submission numbers
            into separate fields, reformat dates and emit the row under ``key``.
        '''
        RENAME_MAP = {
            'res event number': 'res_event_number',
            'rcl products res number': 'product_res_number',
            'rcl firm fei number': 'firm_fei_number',
            'rcl event info date terminated': 'event_date_terminated',
            'rcl event info root cause description': 'root_cause_description',
            'rcl products product code': 'product_code',
            'rcl products submission numbers': 'product_submission_numbers'
        }

        # Uses renamed fields
        DATE_KEYS = ['event_date_terminated']

        def _cleaner(k, v):
            ''' A helper function that is used for renaming dictionary keys and
                formatting dates.
            '''
            def is_other(data):
                ''' True when ``data`` is neither a PMA number nor a K number
                    and does not start with 'G'.
                '''
                return not (common.get_p_number(data) or
                            common.get_k_number(data) or
                            data.startswith('G'))

            k = k.lower()
            k = RENAME_MAP.get(k, k)
            if k == 'product_submission_numbers':
                # We take this single key, split it on space and then return three new
                # keys and not the original. A missing (None) value is treated
                # like an empty string instead of raising AttributeError.
                submissions = [s for s in (v or '').split(' ') if s]
                k_numbers = [d for d in submissions if common.get_k_number(d)]
                pma_numbers = [
                    d for d in submissions if common.get_p_number(d)
                ]
                other = ' '.join(
                    [d.strip() for d in submissions if is_other(d)])
                return [('k_numbers', k_numbers), ('pma_numbers', pma_numbers),
                        ('other_submission_description', other)]

            if v is None:
                return (k, v)
            if k in DATE_KEYS:
                # Elasticsearch cannot handle null dates, emit None so the wrapper
                # function `transform_dict` will omit it from its transformation
                if not v:
                    return None
                parsed = arrow.get(v, 'YYYY/MM/DD HH:mm:ss')
                return (k, parsed.format('YYYY-MM-DD'))
            return (k, v.strip())

        new_value = common.transform_dict(value, _cleaner)
        output.add(key, new_value)
예제 #9
0
파일: pipeline.py 프로젝트: ColMac/openfda
 def transform_dates(coll):
   ''' Lower-case the keys of ``coll``, rewrite its date fields from
       MM/DD/YYYY to YYYY-MM-DD and strip the remaining non-list values.
   '''
   date_fields = ['created_date']

   def _replace_date(name, val):
     # Pairs without a key are not returned at all.
     if name is None:
       return None
     name = name.lower()
     if val is None:
       return (name, val)
     if name in date_fields:
       parsed = arrow.get(val, 'MM/DD/YYYY')
       return (name, parsed.format('YYYY-MM-DD'))
     # Lists pass through untouched; anything else is stripped.
     if isinstance(val, list):
       return (name, val)
     return (name, val.strip())

   return common.transform_dict(coll, _replace_date)
예제 #10
0
파일: pipeline.py 프로젝트: FDA/openfda
  def map(self, key, value, output):
    ''' Validate one delimited row and emit it keyed by its MDR report key.

        The file type is the first ``CATEGORIES`` entry contained in
        ``self.filename``. Rows that do not match the expected column count,
        or whose report key is not an integer, are emitted under the file type
        as reject strings instead. Good rows are zipped with
        ``FILE_HEADERS[file_type]``, cleaned with ``self.cleaner`` and emitted
        as ``(file_type, record)`` under the report key.
    '''
    if len(value) < 1:
      return

    mdr_key = value[0]

    # Some of the files have headers, we will apply our own, so row that starts
    # with this value is safe to skip
    if 'MDR_REPORT_KEY' in mdr_key:
      return

    file_type = [s for s in CATEGORIES if s in self.filename][0]

    # TODO(hansnelsen): remove once file format is stable
    if file_type == 'mdrfoi':
      value = self.shuffle_mdrfoi_value(value)

    def _reject(reason):
      ''' Send the raw row to the reject stream of its file type. '''
      output.add(file_type, '%s: %s' % (mdr_key, reason) + ':' + '|'.join(value))

    # We send all data anomalies to a reducer for each file type.
    # These non-conforming data are written to a reject file for review.
    # This file type has variable lengths over time, so it needs its own check
    if file_type == 'foidev':
      if len(value) not in [28, 45]:
        logging.info('Does not conform to foidev structure. Skipping: %s, %s',
          mdr_key, '#' * 200)
        _reject('missing fields')
        return

    elif len(value) != len(FILE_HEADERS[file_type]):
      logging.info('Does not conform to %s structure. Skipping: %s, %s',
        file_type, mdr_key, '#' * 200)
      _reject('missing fields')
      return

    # int() raises ValueError for a non-numeric key, so the check has to be a
    # try/except; an isinstance() test on its result can never fail.
    try:
      int(mdr_key)
    except ValueError:
      logging.info('%s is not a number', mdr_key)
      _reject('NaN')
      return

    # If it makes it this far, it is a good record
    new_value = dict(zip(FILE_HEADERS[file_type], value))
    new_value = common.transform_dict(new_value, self.cleaner)

    # https://github.com/FDA/openfda/issues/27
    # We need to see if device problem code is available for this report in the
    # foidevproblem.txt file, resolve it to a problem description, and add it to the
    # master record.
    if file_type == 'mdrfoi':
      problem_codes = self.device_problem_codes.get(mdr_key)
      if problem_codes is not None:
        product_problems = [self.problem_codes_reference.get(code) for code in problem_codes]
        new_value['product_problems'] = product_problems

    output.add(mdr_key, (file_type, new_value))
예제 #11
0
 def transform_dates(coll):
   ''' Return ``coll`` transformed so that keys are lower-cased, date fields
       go from MM/DD/YYYY to YYYY-MM-DD and other non-list values are stripped.
   '''
   date_keys = ['created_date']

   def _replace_date(field, raw):
     if field is None:
       return None
     field = field.lower()
     if raw is None:
       return (field, raw)
     if field not in date_keys:
       # Lists are kept as they are; every other value is stripped.
       kept = raw if isinstance(raw, list) else raw.strip()
       return (field, kept)
     reformatted = arrow.get(raw, 'MM/DD/YYYY').format('YYYY-MM-DD')
     return (field, reformatted)

   return common.transform_dict(coll, _replace_date)
예제 #12
0
  def map(self, key, value, output):
    ''' Check one delimited row against its file type and emit it under its
        MDR report key, or emit a reject string under the file type.

        Rejected rows are those with an unexpected number of columns or with
        a report key that is not an integer. Accepted rows are turned into a
        dict using ``FILE_HEADERS[file_type]``, cleaned with ``self.cleaner``
        and, for 'mdrfoi' rows, extended with resolved device problem codes.
    '''
    if len(value) < 1:
      return

    mdr_key = value[0]

    # Some of the files have headers, we will apply our own, so row that starts
    # with this value is safe to skip
    if 'MDR_REPORT_KEY' in mdr_key:
      return

    file_type = [s for s in CATEGORIES if s in self.filename][0]

    # TODO(hansnelsen): remove once file format is stable
    if file_type == 'mdrfoi':
      value = self.shuffle_mdrfoi_value(value)

    def _emit_reject(reason):
      ''' Write the raw row, tagged with ``reason``, to the reject stream. '''
      output.add(file_type, '%s: %s' % (mdr_key, reason) + ':' + '|'.join(value))

    # We send all data anomalies to a reducer for each file type.
    # These non-conforming data are written to a reject file for review.
    # This file type has variable lengths over time, so it needs its own check
    if file_type == 'foidev':
      if len(value) not in [28, 45]:
        logging.info('Does not conform to foidev structure. Skipping: %s, %s',
          mdr_key, '#' * 200)
        _emit_reject('missing fields')
        return

    elif len(value) != len(FILE_HEADERS[file_type]):
      logging.info('Does not conform to %s structure. Skipping: %s, %s',
        file_type, mdr_key, '#' * 200)
      _emit_reject('missing fields')
      return

    # A non-numeric key makes int() raise ValueError; catch it so the row is
    # rejected rather than aborting the mapper.
    try:
      int(mdr_key)
    except ValueError:
      logging.info('%s is not a number', mdr_key)
      _emit_reject('NaN')
      return

    # If it makes it this far, it is a good record
    new_value = dict(zip(FILE_HEADERS[file_type], value))
    new_value = common.transform_dict(new_value, self.cleaner)

    # https://github.com/FDA/openfda/issues/27
    # We need to see if device problem code is available for this report in the
    # foidevproblem.txt file, resolve it to a problem description, and add it to the
    # master record.
    if file_type == 'mdrfoi':
      problem_codes = self.device_problem_codes.get(mdr_key)
      if problem_codes is not None:
        product_problems = [self.problem_codes_reference.get(code) for code in problem_codes]
        new_value['product_problems'] = product_problems

    output.add(mdr_key, (file_type, new_value))
예제 #13
0
    def map(self, key, value, output):
        ''' Keep only the fields listed in ``self.rename_map``, under their
            new names, and emit the result under the incoming key.
        '''
        def _keep_mapped(field, raw):
            ''' Return ``(new_name, value)`` for mapped fields, else None. '''
            # Only the part after the first underscore of PRODUCTID is kept.
            if field == 'PRODUCTID':
                raw = raw.split('_')[1]
            if field not in self.rename_map:
                return None
            return (self.rename_map[field], raw)

        output.add(key, common.transform_dict(value, _keep_mapped))
예제 #14
0
파일: pipeline.py 프로젝트: ColMac/openfda
  def map(self, key, value, output):
    ''' Rename the mapped fields of one record, purge the rest, and emit
        the record under ``key``.
    '''
    def _select(name, val):
      ''' Give back the renamed pair for keys present in the rename map;
          other keys yield None.
      '''
      if name == 'PRODUCTID':
        # Keep the second '_'-separated piece of the product id.
        pieces = val.split('_')
        val = pieces[1]
      if name in self.rename_map:
        renamed = self.rename_map[name]
        return (renamed, val)
      return None

    cleaned = common.transform_dict(value, _select)
    output.add(key, cleaned)
예제 #15
0
  def map(self, key, value, output):
    ''' Clean one product row, derive the SPL id and brand-name fields,
        expand the multi-value fields and emit the record under ``key``.

        Raises KeyError when the cleaned row has no ``product_id``.
    '''
    def _cleaner(k, v):
      ''' Helper function to rename keys and purge any keys that are not in
          the map.
      '''
      if k in self.booleans:
        # Empty and missing flags become None; otherwise 'Y'/'y' means True.
        v = v in ['Y', 'y'] if v is not None and v != '' else None
      v = v.strip() if isinstance(v, str) else v
      if k in self.rename_map and v is not None and v != '':
        return (self.rename_map[k], v)

    def _split_multi(text, sep):
      ''' Split ``text`` on ``sep``, dropping whitespace that directly
          follows each separator.
      '''
      # Raw string: '\s' is a regex class, not a string escape.
      return re.sub(sep + r'\s+', sep, text).split(sep)

    doc = common.transform_dict(value, _cleaner)

    # SPL ID
    doc['spl_id'] = doc['product_id'].split('_')[1]

    # Brand name parsing
    doc['brand_name_base'] = doc.get('brand_name')
    if doc.get('brand_name_suffix'):
      base = doc.get('brand_name')
      doc['brand_name'] = (base if base is not None else '') + ' ' + doc['brand_name_suffix']

    # Route is a multi-value field easily parseable
    if doc.get('route'):
      doc['route'] = _split_multi(doc['route'], ';')

    # Pharm class field is a multi-value field easily parseable
    if doc.get('pharm_class'):
      doc['pharm_class'] = _split_multi(doc['pharm_class'], ',')

    # Turn active ingredients into an array of objects as per the mapping.
    if doc.get('active_ingredients'):
      names = _split_multi(doc['active_ingredients'], ';')
      strengths = _split_multi(doc['strength'], ';') if doc.get('strength') else []
      units = _split_multi(doc['unit'], ';') if doc.get('unit') else []

      ingredients = []
      for idx, name in enumerate(names):
        ingredient = {'name': name}
        if len(strengths) > idx:
          # The unit is appended only when one exists at the same position.
          ingredient['strength'] = strengths[idx] + (' ' + units[idx] if len(units) > idx else '')
        ingredients.append(ingredient)
      doc['active_ingredients'] = ingredients
    elif doc.get('active_ingredients') is not None:
      # Delete to avoid complaints from Elasticsearch.
      del doc['active_ingredients']

    if doc.get('strength') is not None:
      del doc['strength']
    if doc.get('unit') is not None:
      del doc['unit']

    output.add(key, doc)
예제 #16
0
파일: pipeline.py 프로젝트: ColMac/openfda
  def map(self, key, value, output):
    ''' Rename the fields of one recall row, break the submission numbers
        out into separate fields, reformat dates and emit the row under
        ``key``.
    '''
    RENAME_MAP = {
      'res event number': 'res_event_number',
      'rcl products res number': 'product_res_number',
      'rcl firm fei number': 'firm_fei_number',
      'rcl event info date terminated': 'event_date_terminated',
      'rcl event info root cause description': 'root_cause_description',
      'rcl products product code': 'product_code',
      'rcl products submission numbers': 'product_submission_numbers'
    }

    # Uses renamed fields
    DATE_KEYS = ['event_date_terminated']

    def _cleaner(k, v):
      ''' A helper function that is used for renaming dictionary keys and
          formatting dates.
      '''
      def is_other(data):
        ''' True when ``data`` is neither a PMA number nor a K number and
            does not start with 'G'.
        '''
        return not (common.get_p_number(data) or
                    common.get_k_number(data) or
                    data.startswith('G'))

      k = k.lower()
      k = RENAME_MAP.get(k, k)
      if k == 'product_submission_numbers':
        # We take this single key, split it on space and then return three new
        # keys and not the original. A missing (None) value is handled like
        # an empty string rather than raising AttributeError.
        submissions = [s for s in (v or '').split(' ') if s]
        k_numbers = [d for d in submissions if common.get_k_number(d)]
        pma_numbers = [d for d in submissions if common.get_p_number(d)]
        other = ' '.join([d.strip() for d in submissions if is_other(d)])
        return [('k_numbers', k_numbers),
                ('pma_numbers', pma_numbers),
                ('other_submission_description', other)]

      if v is None:
        return (k, v)
      if k in DATE_KEYS:
        # Elasticsearch cannot handle null dates, emit None so the wrapper
        # function `transform_dict` will omit it from its transformation
        if not v:
          return None
        return (k, arrow.get(v, 'YYYY/MM/DD HH:mm:ss').format('YYYY-MM-DD'))
      return (k, v.strip())

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
예제 #17
0
    def map(self, key, value, output):
        ''' Rename the mapped fields of one record, convert its boolean
            flags, purge unmapped or empty fields and emit it under ``key``.
        '''
        def _convert(field, raw):
            ''' Return ``(new_name, value)`` for mapped, non-empty fields;
                return None otherwise.
            '''
            if field in self.booleans:
                # Missing or empty flags become None; otherwise the flag is
                # True exactly for 'Y' / 'y'.
                if raw is None or raw == '':
                    raw = None
                else:
                    raw = raw in ['Y', 'y']
            if isinstance(raw, str):
                raw = raw.strip()
            if field not in self.rename_map:
                return None
            if raw is None or raw == '':
                return None
            return (self.rename_map[field], raw)

        output.add(key, common.transform_dict(value, _convert))
예제 #18
0
파일: pipeline.py 프로젝트: ColMac/openfda
 def map(self, key, value, output):
   ''' Keep a fixed set of fields from one record and emit it under
       ``<spl_set_id>:<spl_version>``.
   '''
   wanted = ['rxcui', 'rxstring', 'rxtty', 'setid', 'spl_version']
   renames = {'setid': 'spl_set_id'}

   def _select(name, val):
     # Keys are compared lower-cased; unlisted keys yield None.
     name = name.lower()
     if name not in wanted:
       return None
     return (renames.get(name, name), val)

   record = common.transform_dict(value, _select)
   record_key = record['spl_set_id'] + ':' + record['spl_version']
   output.add(record_key, record)
예제 #19
0
    def map(self, key, value, output):
        ''' Reduce one record to the fields of interest and emit it keyed by
            its SPL set id and SPL version joined with ':'.
        '''
        kept_fields = ['rxcui', 'rxstring', 'rxtty', 'setid', 'spl_version']
        new_names = {'setid': 'spl_set_id'}

        def _filter(field, val):
            ''' Lower-case the key; return the (possibly renamed) pair for
                kept fields and None for all others.
            '''
            field = field.lower()
            if field in kept_fields:
                if field in new_names:
                    field = new_names[field]
                return (field, val)
            return None

        slim = common.transform_dict(value, _filter)
        composite_key = slim['spl_set_id'] + ':' + slim['spl_version']
        output.add(composite_key, slim)
예제 #20
0
    def map(self, key, value, output):
        ''' Rename the mapped, non-empty fields of one record, normalizing
            date and proprietary-name values, and emit it under ``key``.
        '''
        def _convert(field, raw):
            ''' Return ``(new_name, value)`` for mapped, non-empty fields;
                return None otherwise.
            '''
            if field not in self.rename_map or raw is None or raw == '':
                return None
            new_name = self.rename_map[field]
            if "Date" in field:
                # Dates are converted to an integer and then to its string.
                raw = str(int(raw))
            elif "Proprietary Name" in field:
                raw = str(raw).title()
            return (new_name, raw)

        output.add(key, common.transform_dict(value, _convert))
예제 #21
0
파일: pipeline.py 프로젝트: FDA/openfda
  def map(self, key, value, output):
    ''' Rename the mapped fields of one row, strip out the join fields and
        emit what is left, if anything, under the submissions key.
    '''
    def _rename_or_drop(field, raw):
      ''' Return ``(new_name, value)`` for mapped fields whose value is not
          None, empty or the literal 'Null'; return None otherwise.
      '''
      if isinstance(raw, str):
        raw = raw.strip()
      if field not in self.rename_map:
        return None
      if raw is None or raw in ('', 'Null'):
        return None
      return (self.rename_map[field], raw)

    doc = common.transform_dict(value, _rename_or_drop)

    # The application number is the key shared by all three drugs@FDA files;
    # build the key before the join fields are removed.
    key = build_submissions_key(doc['application_number'], doc)
    for join_field in ('application_number', 'submission_number', 'submission_type'):
      del doc[join_field]
    # Nothing is emitted when only the join fields were present.
    if doc:
      output.add(key, doc)
예제 #22
0
파일: pipeline.py 프로젝트: FDA/openfda
  def map(self, key, value, output):
    ''' Rename the mapped fields of one report, reformat its dates, turn
        the multi-value fields into lists and emit it under ``key``.
    '''
    # Fields holding ' / '-separated values that become arrays.
    multi_value = ["TOBACCO_PRODUCTS", "REPORTED_HEALTH_PROBLEMS", "REPORTED_PRODUCT_PROBLEMS"]

    def _convert(field, raw):
      ''' Return ``(new_name, value)`` for mapped fields with a value;
          return None otherwise.
      '''
      if field not in self.rename_map or raw is None:
        return None
      new_name = self.rename_map[field]
      if "DATE" in field:
        # M/D/YYYY input is re-emitted zero-padded as MM/DD/YYYY.
        raw = str(arrow.get(raw, 'M/D/YYYY').format("MM/DD/YYYY"))
      elif field in multi_value:
        # Duplicates are removed by going through a set.
        raw = list(set(raw.split(' / ')))
      return new_name, raw

    output.add(key, common.transform_dict(value, _convert))
예제 #23
0
파일: pipeline.py 프로젝트: ColMac/openfda
  def reduce(self, key, values, output):
    ''' Join the values collected for ``key`` and write each joined row,
        pruned and expanded, under ``final_result:<key>:<index>``.
    '''
    # Keys only needed for joining; they are removed from the final output.
    join_only = ['key_val',
      'address_id',
      'address_type_id',
      'contact_id',
      'reg_key',
      'listing_prop_id',
      'listing_prop_name_id',
      'registration_listing_id',
      'establishment'
    ]

    # Names changed to be in line with naming conventions.
    renamed = {
      'address_line1': 'address_1',
      'address_line2': 'address_2',
      'reg_status_id': 'status_code',
      'state_id': 'state_code'
    }

    # Fields whose value is also emitted under a second, "_exact" key.
    duplicated = {
      'establishment_type': 'establishment_type_exact',
      'proprietary_name': 'proprietary_name_exact'
    }

    def _prune(name, val):
      ''' Drop join-only keys, rename keys and duplicate the expanded ones. '''
      if name in join_only:
        return None
      name = renamed.get(name, name)
      if name not in duplicated:
        return (name, val)
      return [(name, val), (duplicated[name], val)]

    prefix = 'final_result:' + key
    joined = self._join(values)
    if not joined:
      return
    for index, row in enumerate(joined):
      row_key = prefix + ':' + str(index)
      output.put(row_key, common.transform_dict(row, _prune))
예제 #24
0
  def map(self, key, value, output):
    ''' Reformat the date fields of one record as M/D/YYYY and emit the
        record under ``key``. All other fields pass through unchanged.
    '''
    # Date fields.
    DATE_KEYS = ['date_performed']

    def _cleaner(k, v):
      ''' A helper function that is used for formatting dates. Every pair is
          returned; None values are passed through as they are.
      '''
      if v is not None and k in DATE_KEYS:
        v = arrow.get(v).format('M/D/YYYY')
      return (k, v)

    new_value = common.transform_dict(value, _cleaner)
    output.add(key, new_value)
예제 #25
0
  def map(self, key, value, output):
    ''' Rename the mapped fields of one record, purge the rest, and emit
        the record under ``key``.
    '''
    def _keep_mapped(field, raw):
      ''' Return ``(new_name, value)`` for keys in the rename map, else None.
      '''
      # See https://github.com/FDA/openfda-dev/issues/6
      # Handle bad Unicode characters that may come from NDC product.txt.
      # NOTE(review): `unicode` is a Python 2 builtin; this branch looks like
      # it would raise NameError on Python 3 - confirm the target interpreter.
      if isinstance(raw, str):
        raw = unicode(raw, 'utf8', 'ignore').encode()

      if field == 'PRODUCTID':
        # Keep the second '_'-separated piece of the product id.
        raw = raw.split('_')[1]
      if field not in self.rename_map:
        return None
      return (self.rename_map[field], raw)

    output.add(key, common.transform_dict(value, _keep_mapped))
예제 #26
0
    def reduce(self, key, values, output):
        """ Join the values collected for ``key`` and store every joined row,
            cleaned of join keys, under ``final_result:<key>:<index>``.
        """
        # Join keys that are not needed once all the joining has been done.
        unwanted = [
            "key_val",
            "address_id",
            "address_type_id",
            "contact_id",
            "reg_key",
            "listing_prop_id",
            "listing_prop_name_id",
            "registration_listing_id",
            "establishment",
        ]

        # Old name -> name in line with the naming conventions.
        replacements = {
            "address_line1": "address_1",
            "address_line2": "address_2",
            "reg_status_id": "status_code",
            "state_id": "state_code",
        }

        # Fields that are emitted twice: as-is and under an "_exact" key.
        exact_copies = {
            "establishment_type": "establishment_type_exact",
            "proprietary_name": "proprietary_name_exact",
        }

        def _prune(field, val):
            """ Remove, rename or duplicate a single key/value pair. """
            if field in unwanted:
                return None
            if field in replacements:
                field = replacements[field]
            if field in exact_copies:
                twin = (exact_copies[field], val)
                return [(field, val), twin]
            return (field, val)

        base_key = "final_result:" + key
        rows = self._join(values)
        if rows:
            for position, row in enumerate(rows):
                row_key = base_key + ":" + str(position)
                pruned = common.transform_dict(row, _prune)
                output.put(row_key, pruned)
예제 #27
0
    def map(self, key, value, output):
        ''' Keep the mapped fields of one recall row, split the submission
            numbers into separate fields and emit the row keyed by its
            ``product_res_number``.

            Raises KeyError when the row has no product res number field.
        '''
        RENAME_MAP = {
            'rcl products res number': 'product_res_number',
            'rcl firm fei number': 'firm_fei_number',
            'rcl products submission numbers': 'product_submission_numbers'
        }

        def _cleaner(k, v):
            ''' A helper function that is used for renaming dictionary keys;
                keys that are not in the map are dropped.
            '''
            def is_other(data):
                ''' True when ``data`` is neither a PMA number nor a K number
                    and does not start with 'G'.
                '''
                return not (common.get_p_number(data) or
                            common.get_k_number(data) or
                            data.startswith('G'))

            k = k.lower()
            if k not in RENAME_MAP:
                return None
            k = RENAME_MAP[k]

            if k == 'product_submission_numbers':
                # We take this single key, split it on space and then return three new
                # keys and not the original. A missing (None) value is treated
                # like an empty string instead of raising AttributeError.
                submissions = [s for s in (v or '').split(' ') if s]
                k_numbers = [d for d in submissions if common.get_k_number(d)]
                pma_numbers = [
                    d for d in submissions if common.get_p_number(d)
                ]
                other = ' '.join(
                    [d.strip() for d in submissions if is_other(d)])
                return [('k_numbers', k_numbers), ('pma_numbers', pma_numbers),
                        ('other_submission_description', other)]

            if v is not None:
                return (k, v.strip())
            return (k, v)

        new_value = common.transform_dict(value, _cleaner)
        output.add(new_value['product_res_number'], new_value)
예제 #28
0
  def map(self, key, value, output):
    ''' Validate one parsed row and emit it either as a good record or as a
        reject message.

        Good rows are emitted as `output.add(mdr_key, (file_type, record))`,
        where `record` is the row zipped with FILE_HEADERS[file_type] and
        passed through `self.cleaner`. Non-conforming rows are emitted as
        `output.add(file_type, message)` so they can be reviewed later.

        Args:
          key: Incoming record key; not used here.
          value: Sequence of string fields for one row; the first field is
            used as the MDR report key.
          output: Collector receiving the emitted pairs.
    '''
    if len(value) < 1:
      return

    mdr_key = value[0]

    # Some of the files have headers. We apply our own, so a row that starts
    # with this value is safe to skip.
    if 'MDR_REPORT_KEY' in mdr_key:
      return

    # The file type is the first category name contained in the file name.
    file_type = [s for s in CATEGORIES if s in self.filename][0]

    # TODO(hansnelsen): remove once file format is stable
    if file_type == 'mdrfoi':
      value = self.shuffle_mdrfoi_value(value)

    # We send all data anomalies to a reducer for each file type.
    # These non-conforming data are written to a reject file for review.
    # This file type has variable lengths over time, so it needs its own check.
    if file_type == 'foidev':
      if len(value) not in [28, 45]:
        logging.info('Does not conform to foidev structure. Skipping: %s, %s',
          mdr_key, '#' * 200)
        output.add(file_type, '%s: missing fields' % mdr_key + ':' +  '|'.join(value))
        return

    elif len(value) != len(FILE_HEADERS[file_type]):
      logging.info('Does not conform to %s structure. Skipping: %s, %s',
        file_type, mdr_key, '#' * 200)
      output.add(file_type, '%s: missing fields' % mdr_key + ':' +  '|'.join(value))
      return

    # int() raises ValueError on a non-numeric key rather than returning a
    # non-int, so the check has to be done by catching the exception.
    try:
      int(mdr_key)
    except ValueError:
      logging.info('%s is not a number', mdr_key)
      output.add(file_type, '%s: NaN' % mdr_key + ':' +  '|'.join(value))
      return

    # If it makes it this far, it is a good record
    new_value = dict(zip(FILE_HEADERS[file_type], value))
    new_value = common.transform_dict(new_value, self.cleaner)

    output.add(mdr_key, (file_type, new_value))
예제 #29
0
  def map(self, key, value, output):
    ''' Rename the columns of `value` through `self.rename_map` and emit the
        result under the unchanged `key`.
    '''
    def _cleaner(k, v):
      ''' Return the renamed (key, value) pair, or None when the key is not
          in the rename map or the value is None.
      '''
      # See https://github.com/FDA/openfda-dev/issues/6
      # Handle bad Unicode characters that may come from NDC product.txt.
      # (`unicode` is the Python 2 builtin.)
      if isinstance(v, str):
        v = unicode(v, 'utf8', 'ignore').encode()

      if k not in self.rename_map or v is None:
        return None

      renamed = self.rename_map[k]
      if "Date" in k:
        # Date columns are normalized through int() and back to a string.
        return (renamed, str(int(v)))
      if "Proprietary Name" in k:
        return (renamed, v.title())
      return (renamed, v)

    cleaned = common.transform_dict(value, _cleaner)
    output.add(key, cleaned)
예제 #30
0
파일: pipeline.py 프로젝트: FDA/openfda
  def map(self, key, value, output):
    ''' Clean one application row and emit it under the unchanged `key`.

        After renaming, any non-None `application_public_notes` entry is
        dropped, and `application_type` + `application_no` are merged into a
        single `application_number` when both are present and non-empty.
    '''
    def _cleaner(k, v):
      ''' Strip string values; keep only keys found in `self.rename_map`
          whose value is neither None nor the empty string.
      '''
      if isinstance(v, str):
        v = v.strip()
      keep = k in self.rename_map and v is not None and v != ''
      return (self.rename_map[k], v) if keep else None

    record = common.transform_dict(value, _cleaner)

    if record.get('application_public_notes') is not None:
      del record['application_public_notes']

    app_no = record.get('application_no')
    app_type = record.get('application_type')
    if app_no and app_type:
      record['application_number'] = app_type + app_no
      del record['application_type']
      del record['application_no']

    output.add(key, record)
예제 #31
0
파일: pipeline.py 프로젝트: FDA/openfda
  def map(self, key, value, output):
    ''' Pass `value` through a renaming cleaner and emit it with the same
        `key` it arrived with.
    '''
    def _cleaner(k, v):
      ''' Map a source column onto its name in `self.rename_map`. Columns
          missing from the map, and None values, produce no output.
      '''
      # See https://github.com/FDA/openfda-dev/issues/6
      # Handle bad Unicode characters that may come from NDC product.txt.
      # (`unicode` is the Python 2 builtin.)
      if isinstance(v, str):
        v = unicode(v, 'utf8', 'ignore').encode()

      wanted = k in self.rename_map and v is not None
      if not wanted:
        return None

      if "Date" in k:
        out_value = str(int(v))
      elif "Proprietary Name" in k:
        out_value = v.title()
      else:
        out_value = v
      return (self.rename_map[k], out_value)

    renamed_row = common.transform_dict(value, _cleaner)
    output.add(key, renamed_row)
예제 #32
0
파일: pipeline.py 프로젝트: FDA/openfda
  def map(self, key, value, output):
    ''' Clean one product row, resolve its marketing status and emit it under
        a key built from the application number.
    '''
    def _cleaner(k, v):
      ''' Strip string values; keep only keys found in `self.rename_map`
          whose value is neither None nor the empty string.
      '''
      if isinstance(v, str):
        v = v.strip()
      keep = k in self.rename_map and v is not None and v != ''
      return (self.rename_map[k], v) if keep else None

    product = common.transform_dict(value, _cleaner)

    # Replace the status id with the value looked up in `self.doc_lookup`.
    if product.get('marketing_status_id'):
      product['marketing_status'] = self.doc_lookup[product['marketing_status_id']]
      del product['marketing_status_id']

    # Assign application number as the key, since all three drugs@FDA files can be joined by this key.
    # The key is built before the fields it is derived from are removed.
    key = build_products_key(product['application_number'], product)
    del product['application_number']
    del product['product_number']

    output.add(key, product)
예제 #33
0
파일: pipeline.py 프로젝트: FDA/openfda
        def handle_device(_, xml):
            """Transform the dictionary, which follows the structure of UDI XML files, to a new one that
            matches the UDI mapping in ES, and emit it keyed by its "@id".

            Args:
              _: Ignored.
              xml: Dictionary for one device; must contain an "identifiers" entry.

            Returns:
              True once the device has been added to `map_output`.

            Raises:
              Any exception from the transformation is logged and re-raised. This
              includes the KeyError raised when no primary identifier with an
              issuing agency was found, since "@id" is then never set.
            """
            try:
                device = common.transform_dict(xml, transformer_fn)

                # Additional transformation of the dictionary not handled by the transformer_fn function.
                if "id" in device:
                    del device["id"]
                if "device_sizes" in device:
                    for size in device["device_sizes"]:
                        if "value" in size:
                            # Flatten the nested {"value": ..., "unit": ...} dict into the size itself.
                            value_dict = size.pop("value")
                            if "value" in value_dict:
                                size["value"] = value_dict["value"]
                            if "unit" in value_dict:
                                size["unit"] = value_dict["unit"]

                # @id will be a concatenation of primary identifier's issuer with the ID number itself.
                for iden_list in xml["identifiers"].values():
                    if not isinstance(iden_list, list):
                        continue
                    for iden in iden_list:
                        if ('deviceIdType' in iden
                                and 'deviceIdIssuingAgency' in iden
                                and iden['deviceIdType'] == 'Primary'):
                            device["@id"] = (iden['deviceIdIssuingAgency'] + "_" +
                                             iden["deviceId"]).lower()

                map_output.add(device["@id"], device)
                return True
            except Exception:
                logging.error(xml)
                traceback.print_exc()
                logging.error(sys.exc_info()[0])
                raise
예제 #34
0
파일: pipeline.py 프로젝트: FDA/openfda
    def handle_device(_, xml):
      """Reshape one UDI XML dictionary into the form of the UDI mapping in ES and
        add it to `map_output` under its "@id". Returns True on success; any
        exception is logged and re-raised.
      """

      try:
        device = common.transform_dict(xml, transformer_fn)

        # Clean-up steps that the transformer_fn function does not cover.
        if "id" in device:
          del device["id"]

        if "device_sizes" in device:
          for size in device["device_sizes"]:
            if "value" not in size:
              continue
            # Lift the nested value/unit pair up into the size entry itself.
            nested = size["value"]
            del size["value"]
            for field in ("value", "unit"):
              if field in nested:
                size[field] = nested[field]

        # @id is the primary identifier's issuing agency joined to the ID number, lower-cased.
        for _name, iden_list in xml["identifiers"].iteritems():
          if type(iden_list) != type([]):
            continue
          for iden in iden_list:
            has_fields = 'deviceIdType' in iden and 'deviceIdIssuingAgency' in iden
            if has_fields and 'Primary' == iden['deviceIdType']:
              issuer_prefix = iden['deviceIdIssuingAgency'] + "_"
              device["@id"] = (issuer_prefix + iden["deviceId"]).lower()

        map_output.add(device["@id"], device)
        return True
      except Exception:
        logging.error(xml)
        traceback.print_exc()
        logging.error(sys.exc_info()[0])
        raise
예제 #35
0
  def map(self, key, value, output):
    ''' Run `value` through `self.cleaner` and emit the result keyed by its
        `report_number`. The incoming `key` is not used.
    '''
    cleaned = common.transform_dict(value, self.cleaner)
    report_number = cleaned['report_number']
    output.add(report_number, cleaned)
예제 #36
0
파일: transform.py 프로젝트: ColMac/openfda
def transform_device_pma(csv_dict):
  ''' Return `csv_dict` transformed by the module-level `_cleaner`. '''
  transformed = common.transform_dict(csv_dict, _cleaner)
  return transformed
예제 #37
0
def transform_device_clearance(csv_dict):
    ''' Return `csv_dict` transformed by the module-level `_cleaner`. '''
    transformed = common.transform_dict(csv_dict, _cleaner)
    return transformed
예제 #38
0
파일: pipeline.py 프로젝트: FDA/openfda
def omit_internal_keys(data):
  ''' Return `data` transformed by `basic_cleaner`.

      Intended for the dump_index command, where it is passed to the JSON
      loader as json.load(..., object_hook=omit_internal_keys).
  '''
  cleaned = common.transform_dict(data, basic_cleaner)
  return cleaned
예제 #39
0
def omit_internal_keys(data):
    ''' Return `data` transformed by `basic_cleaner`.

    Intended for the dump_index command, where it is passed to the JSON
    loader as json.load(..., object_hook=omit_internal_keys).
    '''
    cleaned = common.transform_dict(data, basic_cleaner)
    return cleaned
예제 #40
0
파일: pipeline.py 프로젝트: FDA/openfda
  def map(self, key, value, output):
    ''' Clean the incoming record with `self.cleaner` and emit it under its
        own `report_number`; the incoming `key` is ignored.
    '''
    record = common.transform_dict(value, self.cleaner)
    output.add(record['report_number'], record)
예제 #41
0
    def map(self, key, value, output):
        ''' Build one device recall record from its recall page, merge in
        fields from the "old" CSV data and emit the cleaned record.

        The page for `value['id']` is loaded with `soup_with_retry` and its
        fields are read through the `self.field_val_*` helpers. Values missing
        from the page may be filled in from `self.csv2json_db`, looked up by
        recall number.

        Args:
          key: Key under which the cleaned record is emitted.
          value: Dict holding the recall `id`.
          output: Collector; called as `output.add(key, record)`.
        '''
        def _cleaner(k, v):
            ''' Drop None values, expand a terminated recall status into a
            status plus termination date, and split the recalling firm into
            name and address parts.
            '''
            if v is None:
                return None

            if k == 'recall_status':
                fullmatch = self.TERMINATED_RE.fullmatch(v)
                if fullmatch:
                    return [('recall_status', 'Terminated'),
                            ('event_date_terminated',
                             self.reformat_date(fullmatch.group(1)))]

            if k == 'recalling_firm':
                return parse_address(v)

            return (k, v)

        def parse_address(addr):
            '''
            Attempt to parse the Recalling Firm/Manufacturer piece into firm name and address components using
            usaddress library. If unable to parse, stuff everything back into the 'recalling_firm' element.
            '''
            def _concat(parts, part_keys):
                ''' Join the non-None values of `part_keys` with single spaces. '''
                return ' '.join(
                    parts.get(part_key) for part_key in part_keys
                    if parts.get(part_key) is not None)

            # Only multi-line values are treated as "name + address".
            if len(addr.splitlines()) >= 2:
                try:
                    tagged = usaddress.tag(addr.replace('\n', ' '))
                except usaddress.RepeatedLabelError:
                    # Could not be tagged: fall through and keep the raw text.
                    pass
                else:
                    parts = tagged[0]
                    candidates = [
                        ('recalling_firm', parts.get('Recipient')),
                        ('address_1',
                         _concat(parts, [
                             'AddressNumberPrefix', 'AddressNumber',
                             'AddressNumberSuffix', 'StreetNamePreModifier',
                             'StreetNamePreDirectional', 'StreetNamePreType',
                             'StreetName', 'StreetNamePostType',
                             'StreetNamePostDirectional', 'USPSBoxType',
                             'USPSBoxID', 'USPSBoxGroupType', 'USPSBoxGroupID'
                         ])),
                        ('address_2',
                         _concat(parts, [
                             'SubaddressType', 'SubaddressIdentifier',
                             'BuildingName', 'OccupancyType',
                             'OccupancyIdentifier', 'CornerOf',
                             'LandmarkName', 'IntersectionSeparator',
                             'NotAddress'
                         ])),
                        ('city', parts.get('PlaceName')),
                        ('state', parts.get('StateName')),
                        ('postal_code', parts.get('ZipCode')),
                        ('country', parts.get('CountryName')),
                    ]
                    # Keep only the components that were actually found.
                    return [
                        pair for pair in candidates
                        if pair[1] is not None and pair[1] != ''
                    ]

            return 'recalling_firm', addr

        recall_id = value['id']
        recall = {}
        url = RECALL_RECORD_DOWNLOAD_URL % (recall_id)
        soup = soup_with_retry(url)

        recall['@id'] = recall_id
        recall_number = self.field_val_str(soup, 'Recall Number')
        recall['product_res_number'] = recall_number
        recall['event_date_initiated'] = self.field_val_date(
            soup, 'Date Initiated by Firm')
        recall['event_date_created'] = self.field_val_date(soup, 'Create Date')
        recall['event_date_posted'] = self.field_val_date(soup, 'Date Posted')
        recall['recall_status'] = self.field_val_str(soup, 'Recall Status1')
        recall['res_event_number'] = self.field_val_str(
            soup, 'Recall Event ID')

        product_code = self.field_val_str(soup, 'Product Classification')
        if product_code:
            # re.search returns None when the text has no "Product Code <x>"
            # part; in that case the field is left out instead of failing.
            code_match = re.search(r'Product Code\s+(\S+)', product_code)
            if code_match:
                recall['product_code'] = code_match.group(1).replace(
                    'ooo', 'N/A')

        recall['k_numbers'] = self.field_val_array(soup, '510(K)Number')
        recall['pma_numbers'] = self.field_val_array(soup, 'PMA Number')
        recall['product_description'] = self.field_val_str(soup, 'Product')
        recall['code_info'] = self.field_val_str(soup, 'Code Information')
        recall['recalling_firm'] = self.field_val_str(
            soup, 'Recalling Firm/Manufacturer')
        recall['additional_info_contact'] = self.field_val_str(
            soup, 'For Additional Information Contact')
        recall['reason_for_recall'] = self.field_val_str(
            soup, 'Manufacturer Reasonfor Recall')
        recall['root_cause_description'] = self.field_val_str(
            soup, 'FDA DeterminedCause 2')
        recall['action'] = self.field_val_str(soup, 'Action')
        recall['product_quantity'] = self.field_val_str(
            soup, 'Quantity in Commerce')
        recall['distribution_pattern'] = self.field_val_str(
            soup, 'Distribution')

        # Add data from the "old" CSV file. Missing or None CSV fields are
        # treated as empty rather than raising.
        csv_record = self.csv2json_db.get(recall_number, None)
        if csv_record is not None:
            fei_number = csv_record.get('firm_fei_number') or ''
            # NOTE(review): the threshold is "> 1", not "> 0" - kept as found.
            if len(fei_number) > 1:
                recall['firm_fei_number'] = fei_number

            other_description = csv_record.get(
                'other_submission_description') or ''
            if other_description:
                recall['other_submission_description'] = other_description

            # The CSV submission numbers are only used when the page gave none.
            csv_k_numbers = csv_record.get('k_numbers') or []
            if recall.get('k_numbers') is None and csv_k_numbers:
                recall['k_numbers'] = csv_k_numbers

            csv_pma_numbers = csv_record.get('pma_numbers') or []
            if recall.get('pma_numbers') is None and csv_pma_numbers:
                recall['pma_numbers'] = csv_pma_numbers

        xformed_recall = common.transform_dict(recall, _cleaner)
        output.add(key, xformed_recall)