def test_pop_and_2_obs_with_all_pv(self):
    """Fill a Pop template once, then reuse a single Obs filler twice.

    The same variable dict is carried through all three fills; only the
    observation-specific entries ('year', 'mprop', 'mval') change between
    the two Obs fills.
    """
    pop_filler = mcf_template_filler.Filler(POP_TEMPLATE,
                                            required_vars=['geo_id'])
    pv_values = {
        'geo_id': 'geoId/06',
        'naics_code': '11',
        'operation_type': 'Manufacturer',
        'tax_status': 'ExemptFromTax'
    }
    expected_pop = """ Node: Pop_payroll_est_geoId/06_11_Manufacturer_ExemptFromTax typeOf: schema:StatisticalPopulation populationType: dcs:USCEstablishment location: dcid:geoId/06 payrollStatus: dcs:WithPayroll naics: dcs:NAICS/11 operationType: dcs:Manufacturer taxStatus: dcs:ExemptFromTax """
    self.assertEqual(pop_filler.fill(pv_values), expected_pop)

    # One Obs filler serves both observations below.
    obs_filler = mcf_template_filler.Filler(
        OBS_TEMPLATE, required_vars=['year', 'mprop', 'mval'])

    pv_values.update({'year': '2000', 'mprop': 'count', 'mval': 0})
    expected_obs_2000 = """ Node: Obs_on_Pop_payroll_est_geoId/06_11_Manufacturer_ExemptFromTax_2000_count typeOf: schema:Observation observedNode: l:Pop_payroll_est_geoId/06_11_Manufacturer_ExemptFromTax observationDate: "2000" measuredProperty: dcs:count measuredValue: 0 """
    self.assertEqual(obs_filler.fill(pv_values), expected_obs_2000)

    pv_values.update({'year': '2001', 'mprop': 'count', 'mval': 144})
    expected_obs_2001 = """ Node: Obs_on_Pop_payroll_est_geoId/06_11_Manufacturer_ExemptFromTax_2001_count typeOf: schema:Observation observedNode: l:Pop_payroll_est_geoId/06_11_Manufacturer_ExemptFromTax observationDate: "2001" measuredProperty: dcs:count measuredValue: 144 """
    self.assertEqual(obs_filler.fill(pv_values), expected_obs_2001)
def test_require_node_name(self):
    """Constructing a Filler raises ValueError for templates lacking a name.

    Covers the two NAMELESS_* templates and a POP_TEMPLATE that has a node
    without a 'Node:' line appended to it.
    """
    for nameless_template in (NAMELESS_POP_TEMPLATE, NAMELESS_OBS_TEMPLATE):
        with self.assertRaises(ValueError):
            mcf_template_filler.Filler(nameless_template)

    unnamed_node = """ typeOf: badNode location: dcid:badPlace """
    with self.assertRaises(ValueError):
        mcf_template_filler.Filler(POP_TEMPLATE + unnamed_node)
def test_example_usage(self):
    """Walk through filling one template with two partial variable dicts.

    Only 'geo_id' is required; each fill supplies a different optional
    variable and the expected output is compared verbatim.
    """
    person_template = """ Node: People_in_geoId_{geo_id}_{race}_{gender}_{random_field} typeOf: schema:StatisticalPopulation populationType: schema:Person location: geoId/{geo_id} race: dcs:{race} gender: dcs:{gender} randomOptionalProperty: {random_field} """
    filler = mcf_template_filler.Filler(person_template,
                                        required_vars=['geo_id'])

    # First fill: race given, gender and random_field omitted.
    race_only = {'geo_id': '05', 'race': 'White'}
    filled_race = filler.fill(race_only)
    expected_race = """ Node: People_in_geoId_05_White__ typeOf: schema:StatisticalPopulation populationType: schema:Person location: geoId/05 race: dcs:White """
    self.assertEqual(filled_race, expected_race)

    # Second fill: gender given, race and random_field omitted.
    gender_only = {'geo_id': '05', 'gender': 'Female'}
    filled_gender = filler.fill(gender_only)
    expected_gender = """ Node: People_in_geoId_05__Female_ typeOf: schema:StatisticalPopulation populationType: schema:Person location: geoId/05 gender: dcs:Female """
    self.assertEqual(filled_gender, expected_gender)
def test_unified_pop_obs_with_missing_optional_pv(self):
    """Fill a concatenated Pop + Obs template without 'operation_type'."""
    # Can combine templates, like Pop + Obs
    combined_filler = mcf_template_filler.Filler(
        POP_TEMPLATE + OBS_TEMPLATE,
        required_vars=['geo_id', 'year', 'mprop', 'mval'])

    # 'operation_type' is deliberately absent from the supplied values.
    pv_values = {
        'geo_id': 'geoId/06',
        'naics_code': '11',
        'tax_status': 'ExemptFromTax',
        'year': '2000',
        'mprop': 'count',
        'mval': 42,
    }
    filled = combined_filler.fill(pv_values)

    expected_mcf = """ Node: Pop_payroll_est_geoId/06_11__ExemptFromTax typeOf: schema:StatisticalPopulation populationType: dcs:USCEstablishment location: dcid:geoId/06 payrollStatus: dcs:WithPayroll naics: dcs:NAICS/11 taxStatus: dcs:ExemptFromTax Node: Obs_on_Pop_payroll_est_geoId/06_11__ExemptFromTax_2000_count typeOf: schema:Observation observedNode: l:Pop_payroll_est_geoId/06_11__ExemptFromTax observationDate: "2000" measuredProperty: dcs:count measuredValue: 42 """
    self.assertEqual(filled, expected_mcf)
def test_pop_with_missing_req_pv(self):
    """fill() raises ValueError when required 'tax_status' is not supplied."""
    pop_filler = mcf_template_filler.Filler(
        POP_TEMPLATE, required_vars=['geo_id', 'tax_status'])

    # Everything except the required 'tax_status' entry.
    incomplete_values = {
        'geo_id': 'geoId/06',
        'naics_code': '11',
        'operation_type': 'Manufacturer',
    }
    with self.assertRaises(ValueError):
        pop_filler.fill(incomplete_values)
def zip_ingred_semi_sep(mcf_file, strength_format_map, row):
    """Zips ingredients and strengths together when the strengths column is
    split per ingredient by semicolons.

    Each semicolon-separated group in the strengths column belongs to the
    ingredient at the same position; within a group the strengths are comma
    separated. One DrugStrength node is written per comma position.

    Ex:
    strengths: 1 mg, 2mg, 3mg; 4mg, 5mg, 6mg
    ingredients: ingred1 ; ingred2
    resulting DrugStrength nodes, whose ActiveIngredientAmount dcids are
    comma separated:
    * Strength 1: 1mg - ingred1, 4mg - ingred2
    * Strength 2: 2mg - ingred1, 5mg - ingred2
    * Strength 3: 3mg - ingred1, 6mg - ingred2

    Args:
        mcf_file: object with a write() method receiving the generated MCF.
        strength_format_map: dict of template variables shared by every
            strength node; must contain 'strength_dcid', which is used as the
            base for the per-position dcids. It is read but not modified.
        row: mapping providing the 'CleanStrength' and
            'CleanActiveIngredient' strings.

    Returns:
        List of the strength node dcids written, each of the form
        '<base dcid>_<position>'.

    Raises:
        IndexError: if a strength group has fewer entries than the first
            group, or there are fewer ingredients than strength groups.
    """
    strength_groups = [
        group.split(',') for group in row['CleanStrength'].split(';')
    ]
    # Split and normalise the ingredient names once instead of per strength.
    ingred_names = [
        ingred.strip().title()
        for ingred in row['CleanActiveIngredient'].split(';')
    ]
    base_dcid = strength_format_map['strength_dcid']

    # The template is constant, so a single filler serves every node.
    # NOTE(review): the maps filled below carry 'strength_dcid', not 'dcid' -
    # confirm required_vars matches the template's variable names.
    strength_templater = mcf_template_filler.Filler(STRENGTH_TEMPLATE,
                                                    required_vars=['dcid'])

    strength_dcids = []
    # The first group determines how many strength nodes are produced.
    for index in range(len(strength_groups[0])):
        node_dcid = base_dcid + '_' + str(index)
        strength_dcids.append(node_dcid)

        # One ActiveIngredientAmount node per ingredient at this position.
        active_ingred_dcids = []
        for ingred_index, group in enumerate(strength_groups):
            ingred_dcid = write_active_ingred_node(mcf_file, group[index],
                                                   ingred_names[ingred_index])
            active_ingred_dcids.append(ingred_dcid)

        # Work on a copy so the caller's dict is not left holding values
        # from one particular strength position.
        node_vars = dict(strength_format_map)
        node_vars['strength_dcid'] = node_dcid
        node_vars['name'] = node_dcid.replace('dcid:bio/', '')
        node_vars['active_ingred_dcids'] = ','.join(active_ingred_dcids)
        # Drop empty values before handing the variables to the template.
        node_vars = {key: value for key, value in node_vars.items() if value}

        mcf_file.write(strength_templater.fill(node_vars))

    return strength_dcids
def write_active_ingred_node(mcf_file, amount, ingredient):
    """Writes one active ingredient node in mcf format and returns its dcid.

    The amount may be a single quantity or a quantity range, and may carry a
    second quantity in parentheses, as in '500mg/25ml (20mg/ml)'. When a
    parenthesised quantity is present, both quantities are formatted and
    joined with a comma.

    Args:
        mcf_file: object with a write() method receiving the generated MCF.
        amount: strength text for the ingredient.
        ingredient: ingredient name; surrounding whitespace is stripped.

    Returns:
        The dcid of the written node, 'dcid:bio/<name>'.
    """
    # A '-' selects the range formatter unless it comes from one of these
    # two phrases.
    # NOTE(review): the test looks at the whole amount, so the same formatter
    # is applied to both the leading and the parenthesised quantity.
    is_range = not ('-' not in amount or 'OMEGA-3' in amount or
                    'SINGLE-USE' in amount)
    to_qty = get_qty_range_format if is_range else get_qty_format

    pieces = amount.split('(')
    amount_qty = to_qty(pieces[0])
    if len(pieces) > 1:
        # Only the first parenthesised quantity is used.
        second_amount_qty = to_qty(pieces[1].replace(')', ''))
        amount_qty = amount_qty + ',' + second_amount_qty

    clean_ingredient = ingredient.strip()
    name = (clean_ingredient + '_' + amount_qty).strip()
    for special_format, replace_format in INGREDIENT_REPLACEMENTS.items():
        name = name.replace(special_format, replace_format).strip()
    # Keep only letters, digits, '_' and '-' in the node name.
    name = re.sub("[^0-9a-zA-Z_-]+", "", name).title()
    dcid = 'dcid:bio/' + name

    # NOTE(review): the map filled below carries 'active_ingred_dcid', not
    # 'dcid' - confirm required_vars matches the template's variable names.
    ingred_templater = mcf_template_filler.Filler(ACTIVE_INGRED_TEMPLATE,
                                                  required_vars=['dcid'])
    ingred_vars = {
        'active_ingred_dcid': dcid,
        'ingred_amount_qty': amount_qty,
        'ingred_name': clean_ingredient,
        'name': name,
    }
    mcf_file.write(ingred_templater.fill(ingred_vars))
    return dcid
def parse_row(mcf_file, seen_fda_apps, row):
    """Writes every node derived from one row to mcf_file in mcf format.

    Order of output: the FDA Application node (only the first time its
    application number is seen), then the strength nodes produced by
    parse_strength_nodes, and finally a single drug node for the row.

    Args:
        mcf_file: object with a write() method receiving the generated MCF.
        seen_fda_apps: list of application numbers already written; the
            row's number is appended when its application node is written.
        row: mapping of column name to value for one drug product.
    """
    appl_no = row['ApplNo']
    appl_no_text = str(appl_no)
    fda_app = 'dcid:bio/FDA_Application_' + appl_no_text

    if appl_no not in seen_fda_apps:
        app_template_map = {
            'fda_app_dcid': fda_app,
            'appl_num': appl_no_text,
            'name': 'FDA_Application_' + appl_no_text,
            'sponsor_name': row['SponsorName'].title(),
            'appl_type_enums': row['ApplTypeEnum'],
        }
        # Drop empty values before handing the variables to the template.
        app_template_map = {
            key: value for key, value in app_template_map.items() if value
        }
        # NOTE(review): the map carries 'fda_app_dcid', not 'dcid' - confirm
        # required_vars matches the template's variable names.
        fda_app_templater = mcf_template_filler.Filler(FDA_APP_TEMPLATE,
                                                       required_vars=['dcid'])
        mcf_file.write(fda_app_templater.fill(app_template_map))
        seen_fda_apps.append(appl_no)

    # Strength nodes are written before the drug node that points at them.
    strength_dcids = parse_strength_nodes(mcf_file, fda_app, row)

    ingred_name_list = '","'.join(
        ingred.strip() for ingred in row['CleanActiveIngredient'].split(';'))
    drug_ref = row['DrugRef']
    drug_format_map = {
        'drug_ref': 'bio/' + drug_ref,
        'name': drug_ref,
        'synonyms': '","'.join(row['DrugName'].split(';')).title(),
        'strength_dcids': ','.join(strength_dcids),
        'ingred_names': ingred_name_list,
        'dosage_form_enum': row['DosageFormEnums'],
        'admin_route_enum': row['AdminRouteEnums'],
        'additional_info': row['AdditionalInfo'],
    }

    # 0 becomes 'False', a positive value becomes 'True'; any other value
    # leaves the variable unset.
    flag_columns = (
        ('ReferenceStandard', 'is_ref_std'),
        ('ReferenceDrug', 'is_available_generically'),
    )
    for column, template_var in flag_columns:
        flag = row[column]
        if flag == 0:
            drug_format_map[template_var] = 'False'
        if flag and flag > 0:
            drug_format_map[template_var] = 'True'

    # Drop empty values before handing the variables to the template.
    drug_format_map = {
        key: value for key, value in drug_format_map.items() if value
    }
    drug_templater = mcf_template_filler.Filler(DRUG_TEMPLATE,
                                                required_vars=['dcid'])
    drug_mcf = drug_templater.fill(drug_format_map)
    mcf_file.write(drug_mcf)
def parse_strength_nodes(mcf_file, fda_app, row):
    """Writes the drug strength node(s) for a row and returns their dcids.

    When the Strength and ActiveIngredient columns can be zipped, the work is
    delegated to zip_ingred_semi_sep or zip_ingred_comma_sep, so the drug
    strength nodes point to Active Ingredient Amount nodes via dcids.

    If the columns cannot be zipped, a single drug strength node is written
    to mcf_file. That node carries the strengths as a list of quantities and
    the active ingredients as a list of strings.

    Args:
        mcf_file: object with a write() method receiving the generated MCF.
        fda_app: dcid of the FDA application node the strengths belong to.
        row: mapping of column name to value for one drug product.

    Returns:
        List of dcids of the drug strength nodes for this row.
    """
    product_no = str(row['ProductNo'])
    node_name = (row['DrugRef'] + '_Strength-' + str(row['ApplNo']) + '-' +
                 product_no)
    strength_dcid = 'dcid:bio/' + node_name

    ingred_name_list = '","'.join(
        ingred.strip() for ingred in row['CleanActiveIngredient'].split(';'))

    strength_format_map = {
        'strength_dcid': strength_dcid,
        'fda_app_dcid': fda_app,
        'fda_prod_no': product_no,
        'name': node_name,
        'ingred_names': ingred_name_list,
        'te_enums': row['TECodes'],
        'ms_enums': row['MarketStatus'],
        'course_qty': row['DrugCourse'],
        'is_single_dose': row['SingleDose'],
        'sponsor': row['SponsorName'].title(),
        'final_vol_qty': row['FinalVolQty'],
    }

    strengths = row['CleanStrength']
    active_ingreds = row['CleanActiveIngredient']

    if active_ingreds and strengths:
        strength_groups = strengths.split(';')
        ingred_count = len(active_ingreds.split(';'))

        # One semicolon-separated strength group per ingredient.
        if len(strength_groups) == ingred_count:
            return zip_ingred_semi_sep(mcf_file, strength_format_map, row)

        # One comma-separated strength per ingredient in the first group.
        if strength_groups[0].count(',') == ingred_count - 1:
            return zip_ingred_comma_sep(mcf_file, strength_format_map, row)

    # Not zippable: emit one node holding the strengths as quantities.
    strength_format_map['strength_qty'] = get_strength_qtys(strengths)
    # Drop empty values before handing the variables to the template.
    filled_vars = {
        key: value for key, value in strength_format_map.items() if value
    }
    # NOTE(review): the map carries 'strength_dcid', not 'dcid' - confirm
    # required_vars matches the template's variable names.
    strength_templater = mcf_template_filler.Filler(STRENGTH_TEMPLATE,
                                                    required_vars=['dcid'])
    mcf_file.write(strength_templater.fill(filled_vars))

    return [strength_dcid]