예제 #1
0
class BuildDrugCanon():
    ''' Determine which package or product NDC is considered _the_
    representative for the same proprietary name.

    The work is organised as a small map/reduce: `_map` turns every OpenFDA
    label into (proprietary name, product NDC, size) candidates, `_reduce`
    keeps the largest candidate per name, and `run` writes the outcome back
    into the NDC white list as an `is_canon` column.
    '''
    def __init__(self):
        # Both collaborators are project types defined outside this block.
        self.sourceNames = BuildNdcWhiteList()
        self.sourceLabels = AcquireOpenFda()

    def __str__(self):
        return 'Determine Canonical Drugs'
    # -------------------------------------------------------------------------

    def _map(self, label):
        ''' A generator function that produces multiple elements from each
        OpenFDA record.

        Yields one dict per (brand name, product NDC) pair found under
        `label['openfda']`, with the keys `proprietary_name`, `ndc` and
        `size`. Records lacking `openfda`, `product_ndc` or `brand_name`
        yield nothing.
        '''
        if 'openfda' not in label:
            return
        openFda = label['openfda']
        if 'product_ndc' not in openFda or 'brand_name' not in openFda:
            return

        # The length of the JSON-serialised record is the score that
        # `_reduce` later compares; it is the same for every pair emitted
        # from this record.
        size = len(json.dumps(label))
        for rawBrand in openFda['brand_name']:
            brand = self.sourceNames.title(rawBrand)
            for rawNdc in openFda['product_ndc']:
                yield {'proprietary_name': brand,
                       'ndc': ProductNdc.parse(rawNdc).format(),
                       'size': size}

    def _reduce(self, dicts):
        ''' Selects the "best" element from a list.

        Returns the element with the greatest `size`, or None when the list
        is empty. On a tie the earliest element wins.
        '''
        if not dicts:
            return None
        # max() returns the first maximal item, which matches a strict
        # "replace only when larger" scan.
        return max(dicts, key=lambda candidate: candidate['size'])

    # -------------------------------------------------------------------------

    def _mapLabels(self):
        ''' A generator function that internally calls `_map` for each label
        returned by `self.sourceLabels.acquire_labels()`.
        '''
        for record in self.sourceLabels.acquire_labels():
            yield from self._map(record)

    def _reduceToCanon(self, partitions):
        ''' A generator that internally calls `_reduce` for each entry.

        `partitions` maps a proprietary name to its list of candidates.
        Yields `(name, ndc)` tuples in sorted name order, skipping names
        that have no candidates.
        '''
        for name in sorted(partitions):
            result = self._reduce(partitions[name])
            if result:
                yield (name, result['ndc'])

    # -------------------------------------------------------------------------

    def run(self):
        ''' Use the size of a record in the FDA data set to determine which
        package or product NDC is considered _the_ representative for the
        same proprietary name.

        Reads `product_ndc.txt`, marks each row with `is_canon` set to
        'true' or 'false', saves the rows to `product_ndc_canon.txt` and
        finally moves that file over the original white list.
        '''
        print('Loading White List')
        whiteListFileName = io.relativeToAbsolute('../../data/product_ndc.txt')
        # newline='' is what the csv module requires of files handed to a
        # reader so that line endings inside fields are parsed correctly.
        with open(whiteListFileName, newline='') as f:
            # DictReader stores the surplus cells of an over-long row under
            # the key None (its default `restkey`); drop that and any other
            # falsy column name.
            records = [{k: v for k, v in row.items() if k}
                       for row in csv.DictReader(f, dialect=csv.excel_tab)]

        partitions = {x['proprietary_name']: [] for x in records}
        products = {x['product_ndc'] for x in records if x['proprietary_name']}

        print('Mapping Labels')
        for node in self._mapLabels():
            nameKey = node['proprietary_name']
            prodKey = node['ndc']
            # Only keep candidates that the white list already knows about
            if nameKey in partitions and prodKey in products:
                partitions[nameKey].append(node)

        print('Reducing to Canon')
        canon = set(self._reduceToCanon(partitions))

        print('Updating NDC Whitelist')
        for row in records:
            key = (row['proprietary_name'], row['product_ndc'])
            if key in canon:
                # consume because multiple package codes map to this key,
                # so only the first matching row is flagged
                canon.remove(key)
                row['is_canon'] = 'true'
            else:
                row['is_canon'] = 'false'

        print('Saving')
        tempRelative = '../../data/product_ndc_canon.txt'
        tempName = io.relativeToAbsolute(tempRelative)
        io.saveAsTabbedText(records, tempRelative)

        # no errors, swap the new file in. os.replace overwrites the target
        # in a single step, so there is no window in which the white list
        # has been deleted but its replacement is not yet in place.
        os.replace(tempName, whiteListFileName)
예제 #2
0
 def __init__(self):
     ''' Create the two collaborators this object keeps: a
     BuildNdcWhiteList stored as `sourceNames` and an AcquireOpenFda stored
     as `sourceLabels`.
     '''
     # NOTE(review): both types are defined outside this snippet; whether
     # constructing them performs any I/O cannot be told from here.
     self.sourceNames = BuildNdcWhiteList()
     self.sourceLabels = AcquireOpenFda()
예제 #3
0
class ExtractOpenFdaFeatures():
    ''' Extract attributes from the OpenFDA label data set and perform some
    initial cleaning of the data
    '''
    def __init__(self):
        self.source = AcquireOpenFda()
        self.whiteList = BuildNdcWhiteList()
        # Must come last: the feature table captures self.whiteList.title
        self.features = self._buildFeatureSet()

    def __str__(self):
        return 'Extract Features from OpenFDA Labels'

    # -------------------------------------------------------------------------

    def _buildFeatureSet(self):
        ''' Create the list of features to extract.

        Returns a list of dicts, one per feature, with the keys `feature`
        (a Feature built from the field path), `column` (the header of the
        value column in the output file) and `transform` (the functions
        applied, in order, to every value before it is written).
        '''
        title = self.whiteList.title
        ignoreSmall = self.titleCaseIgnoreSmall
        # (field path, output column, transforms) in extraction order
        spec = [
            ('openfda.manufacturer_name', 'name', []),
            ('openfda.pharm_class_cs', 'class_name',
             [title, self.trimPharmClassCs]),
            ('openfda.pharm_class_epc', 'class_name',
             [title, self.trimPharmClass3]),
            ('openfda.pharm_class_moa', 'class_name',
             [title, self.trimPharmClass3]),
            ('openfda.pharm_class_pe', 'class_name',
             [title, self.trimPharmClass2]),
            ('openfda.product_type', 'type_name', [ignoreSmall]),
            ('openfda.route', 'route', [title]),
            ('openfda.substance_name', 'name', [ignoreSmall]),
            ('openfda.brand_name', 'name', [ignoreSmall]),
            ('openfda.generic_name', 'name', [ignoreSmall]),
            ('active_ingredient', 'text', []),
            ('inactive_ingredient', 'text', []),
        ]
        return [{'feature': Feature(field),
                 'column': column,
                 'transform': transform}
                for field, column, transform in spec]

    # -------------------------------------------------------------------------
    # Monad-like functions for processing strings
    #

    def trimPharmClassCs(self, s):
        ''' Drop the last 22 characters of `s` when it contains the text
        '[chemical/ingredient]' anywhere; otherwise return `s` unchanged.
        '''
        # 22 = the 21-character marker plus one more character, presumably
        # the separating space -- TODO confirm against the data.
        return s[:-22] if '[chemical/ingredient]' in s else s

    def trimPharmClass2(self, s):
        ''' Drop the last 5 characters of `s` when it ends with ']';
        otherwise (including for an empty string) return `s` unchanged.
        '''
        # presumably removes a ' [PE]' style suffix; verify against the data
        return s[:-5] if s.endswith(']') else s

    def trimPharmClass3(self, s):
        ''' Drop the last 6 characters of `s` when it ends with ']';
        otherwise (including for an empty string) return `s` unchanged.
        '''
        # presumably removes a ' [EPC]' / ' [MoA]' style suffix; verify
        # against the data
        return s[:-6] if s.endswith(']') else s

    def titleCaseIgnoreSmall(self, s):
        ''' Capitalize the first word of `s` and every later word of four or
        more characters; shorter later words are passed through untouched.
        Words are separated by single spaces.
        '''
        first, *rest = s.split(' ')
        words = [first.capitalize()]
        words.extend(w if len(w) < 4 else w.capitalize() for w in rest)
        return " ".join(words)

    # -------------------------------------------------------------------------

    def run(self):
        '''
        Make a key-value map of certain attributes in the Open FDA dataset.

        Every label that has `openfda.product_ndc` is fed to each feature
        once per product NDC, then each feature is written to its own
        tab-separated file named after the feature's fields.
        '''
        print('Acquiring Records')
        for record in self.source.acquire_labels():
            if 'openfda' not in record or 'product_ndc' not in record['openfda']:
                continue
            for entry in record['openfda']['product_ndc']:
                productId = ProductNdc.parse(entry).format()
                for op in self.features:
                    op['feature'].accumulate(productId, record)

        print('Writing Features')
        for op in self.features:
            feature = op['feature']
            baseName = '-'.join(feature.fields)
            fileName = io.relativeToAbsolute('../../data/' + baseName + '.txt')

            with open(fileName, 'w', encoding='utf-8') as f:
                print('product_ndc', op['column'], sep='\t', file=f)
                # Sorting uses the raw values; transforms are applied only
                # at the moment a row is written.
                for pair in sorted(feature.data, key=itemgetter(0, 1)):
                    value = pair[1]
                    for fn in op['transform']:
                        value = fn(value)
                    print(pair[0], value, sep='\t', file=f)
예제 #4
0
 def __init__(self):
     ''' Create an AcquireOpenFda stored as `source` and a BuildNdcWhiteList
     stored as `whiteList`, then store the result of
     `self._buildFeatureSet()` as `features`.
     '''
     self.source = AcquireOpenFda()
     self.whiteList = BuildNdcWhiteList()
     # NOTE(review): `_buildFeatureSet` is not part of this snippet; in the
     # full class it reads `self.whiteList`, so this call has to stay last.
     self.features = self._buildFeatureSet()
class ExtractOpenFdaFeatures():
    ''' Extract attributes from the OpenFDA label data set and perform some
    initial cleaning of the data
    '''
    def __init__(self):
        self.source = AcquireOpenFda()
        self.whiteList = BuildNdcWhiteList()
        # Must come last: the feature table captures self.whiteList.title
        self.features = self._buildFeatureSet()

    def __str__(self):
        return 'Extract Features from OpenFDA Labels'

    # -------------------------------------------------------------------------

    def _buildFeatureSet(self):
        ''' Create the list of features to extract.

        Each entry is a dict holding the Feature to accumulate (`feature`),
        the header used for its value column (`column`) and the ordered
        list of string functions run on every value before it is written
        (`transform`).
        '''
        whiteListTitle = self.whiteList.title
        classNameFields = [
            ('openfda.pharm_class_cs', self.trimPharmClassCs),
            ('openfda.pharm_class_epc', self.trimPharmClass3),
            ('openfda.pharm_class_moa', self.trimPharmClass3),
            ('openfda.pharm_class_pe', self.trimPharmClass2),
        ]

        features = [{'feature': Feature('openfda.manufacturer_name'),
                     'column': 'name',
                     'transform': []}]
        # Pharmacologic class fields share a column name and differ only in
        # the suffix-trimming function applied after title casing.
        for field, trim in classNameFields:
            features.append({'feature': Feature(field),
                             'column': 'class_name',
                             'transform': [whiteListTitle, trim]})
        features.append({'feature': Feature('openfda.product_type'),
                         'column': 'type_name',
                         'transform': [self.titleCaseIgnoreSmall]})
        features.append({'feature': Feature('openfda.route'),
                         'column': 'route',
                         'transform': [whiteListTitle]})
        for field in ('openfda.substance_name', 'openfda.brand_name',
                      'openfda.generic_name'):
            features.append({'feature': Feature(field),
                             'column': 'name',
                             'transform': [self.titleCaseIgnoreSmall]})
        for field in ('active_ingredient', 'inactive_ingredient'):
            features.append({'feature': Feature(field),
                             'column': 'text',
                             'transform': []})
        return features

    # -------------------------------------------------------------------------
    # Monad-like functions for processing strings
    #

    def trimPharmClassCs(self, s):
        ''' Return `s` without its last 22 characters when the text
        '[chemical/ingredient]' occurs anywhere in it, else `s` itself.
        '''
        # The marker is 21 characters long; the 22nd is presumably the
        # space in front of it -- TODO confirm against the data.
        if '[chemical/ingredient]' in s:
            return s[:-22]
        return s

    def trimPharmClass2(self, s):
        ''' Return `s` without its last 5 characters when it ends with ']',
        else `s` itself. An empty string is returned as is.
        '''
        # presumably strips a ' [PE]' style suffix; verify against the data
        if s.endswith(']'):
            return s[:-5]
        return s

    def trimPharmClass3(self, s):
        ''' Return `s` without its last 6 characters when it ends with ']',
        else `s` itself. An empty string is returned as is.
        '''
        # presumably strips a ' [EPC]' / ' [MoA]' style suffix; verify
        # against the data
        if s.endswith(']'):
            return s[:-6]
        return s

    def titleCaseIgnoreSmall(self, s):
        ''' Capitalize the first space-separated word of `s` plus every
        later word that has at least four characters; later words shorter
        than that are left exactly as they were.
        '''
        pieces = s.split(' ')
        result = [pieces[0].capitalize()]
        for piece in pieces[1:]:
            result.append(piece.capitalize() if len(piece) >= 4 else piece)
        return " ".join(result)

    # -------------------------------------------------------------------------

    def run(self):
        '''
        Make a key-value map of certain attributes in the Open FDA dataset.

        Labels carrying `openfda.product_ndc` are passed to every feature
        once per product NDC; afterwards each feature's data is written to
        a tab-separated file whose name is built from the feature's fields.
        '''
        print('Acquiring Records')
        for record in self.source.acquire_labels():
            if 'openfda' in record and 'product_ndc' in record['openfda']:
                for entry in record['openfda']['product_ndc']:
                    productId = ProductNdc.parse(entry).format()
                    for op in self.features:
                        op['feature'].accumulate(productId, record)

        print('Writing Features')
        for op in self.features:
            feature = op['feature']
            baseName = '-'.join(feature.fields)
            fileName = io.relativeToAbsolute('../../data/' + baseName + '.txt')

            with open(fileName, 'w', encoding='utf-8') as f:
                print('product_ndc', op['column'], sep='\t', file=f)
                # Rows are ordered by their untransformed values
                for pair in sorted(feature.data, key=itemgetter(0, 1)):
                    value = pair[1]
                    for fn in op['transform']:
                        value = fn(value)
                    print(pair[0], value, sep='\t', file=f)
 def __init__(self):
     ''' Create an AcquireOpenFda stored as `source` and a BuildNdcWhiteList
     stored as `whiteList`, then store the result of
     `self._buildFeatureSet()` as `features`.
     '''
     self.source = AcquireOpenFda()
     self.whiteList = BuildNdcWhiteList()
     # NOTE(review): `_buildFeatureSet` is not part of this snippet; in the
     # full class it reads `self.whiteList`, so this call has to stay last.
     self.features = self._buildFeatureSet()