def create_product_concepts(dataframe):
    """
    Build one "DSP" concept per row of an NHPID product table.

    Only the ``product_id`` and ``product_name`` columns are read. Each
    concept holds a single preferred "SY" atom whose term is the product
    name with every run of whitespace collapsed to one space and both ends
    trimmed. An ``i/total`` progress counter is printed while iterating.

    :param pandas.DataFrame dataframe: Dataframe containing product data.
    :returns: The concepts created from the dataframe, in row order.
    :rtype: list
    """
    Concept.set_ui_prefix("NHPID")
    products = dataframe[["product_id", "product_name"]].copy()
    total = products.shape[0]
    product_concepts = []
    for index, record in enumerate(products.itertuples()):
        # Carriage return keeps the counter on a single terminal line.
        print(f"{index}/{total}\r", end='')
        fields = record._asdict()
        # NOTE(review): this strips every ".0" occurrence, not only a
        # trailing one -- presumably ids are float-formatted strings such
        # as "123.0"; confirm against the loader.
        source_id = fields["product_id"].replace(".0", '')
        raw_name = str(fields["product_name"])
        clean_name = re.sub(r'\s+', ' ', raw_name).strip()
        name_atom = Atom(
            term=clean_name,
            src="NHPID",
            src_id=source_id,
            term_type="SY",
            is_preferred=True,
        )
        product_concepts.append(Concept(concept_type="DSP", atoms=[name_atom]))
    return product_concepts
def convert_products_to_concepts(json_data):
    """
    Build one "DSP" concept for every DSLD product record.

    Each record is indexed by the keys ``Product_Name``, ``DSLD_ID``,
    ``LanguaL_Product_Type`` and ``ingredients``; every ingredient entry is
    indexed by ``Ingredient_Group_GRP_ID``.

    :param json_data: Iterable of product records (mappings).
    :returns: The product concepts, in input order. Each carries a single
        preferred "SY" atom, an optional ``langual_type`` attribute, and
        one ``has_ingredient`` relationship per listed ingredient.
    :rtype: list
    """
    Concept.set_ui_prefix("DSLD")
    product_concepts = []
    for product in json_data:
        name_atom = Atom(
            term=product["Product_Name"],
            src="DSLD",
            src_id=product["DSLD_ID"],
            term_type="SY",
            is_preferred=True,
        )
        product_concept = Concept(concept_type="DSP", atoms=[name_atom])

        # The LanguaL product type is only recorded when it is non-empty.
        langual_type = product["LanguaL_Product_Type"]
        if langual_type:
            product_concept.add_elements(
                Attribute(
                    subject=product_concept,
                    atr_name="langual_type",
                    atr_value=langual_type,
                    src="DSLD",
                )
            )

        # The relationship object is the ingredient group id, not a Concept.
        for ingredient in product["ingredients"]:
            group_id = ingredient["Ingredient_Group_GRP_ID"]
            product_concept.add_elements(
                Relationship(
                    subject=product_concept,
                    rel_name="has_ingredient",
                    object=group_id,
                    src="DSLD",
                )
            )

        product_concepts.append(product_concept)
    return product_concepts
def generate_idisk_schema(self, value, value_type, prefer_label,
                          concept_type, rel_name, from_concept):
    """
    Link ``from_concept`` to a new Concept built from a piece of herb content.

    Steps:
        1. build the atoms for ``value`` via ``self.generate_atom``
        2. if any atoms were produced, wrap them in a new Concept
        3. relate ``from_concept`` to that Concept via ``self.generate_rel``
        4. write the new (object) Concept to the local file

    When no atoms are produced, nothing is created or written.

    :param str/list value: herb content
    :param str value_type: herb content Atom type
    :param bool prefer_label: whether or not the term is preferred
    :param str concept_type: Concept type for the generated Atom
    :param str rel_name: Relationship type given the generated Concept
    :param Concept from_concept: the subject Concept of this schema
    :return: the subject Concept (as returned by ``self.generate_rel``
        when a relationship was built, otherwise the one passed in)
    :rtype: Concept
    """
    atoms = self.generate_atom(value, prefer_label, value_type)
    # Nothing to link when the content yielded no atoms.
    if len(atoms) == 0:
        return from_concept

    object_concept = Concept(concept_type, atoms=atoms)
    from_concept, object_concept = self.generate_rel(
        from_concept, object_concept, rel_name)
    self.write_to_local_file(object_concept)
    return from_concept
def to_concepts(dataframe):
    """
    Lazily yield one "SDSI" concept per usable row of an ingredient table.

    Rows without a ``proper_name`` and exact duplicate rows are dropped,
    and remaining missing values are replaced by empty strings. Rows whose
    proper name is rejected by ``invalid_ingredient`` are reported on
    stdout and skipped.

    The proper name becomes the preferred "SN" atom. ``proper_name_f`` is
    added as a non-preferred "SN" atom, and ``common_name`` /
    ``common_name_f`` as non-preferred "SY" atoms.

    NOTE: the set of already-used (term, term type) pairs is shared by all
    rows, so a synonym is only emitted the first time that pair is met
    anywhere in the table.

    :param pandas.DataFrame dataframe: Table containing ingredient data.
    :returns: Generator over SDSI concepts.
    :rtype: generator
    """
    Concept.set_ui_prefix("NHPID")
    wanted_columns = [
        "ingredient_id", "proper_name", "proper_name_f",
        "common_name", "common_name_f",
    ]
    table = (
        dataframe[wanted_columns]
        .copy()
        .dropna(subset=["proper_name"], axis=0)
        .drop_duplicates()
        .fillna("")
    )

    synonym_columns = ["proper_name_f", "common_name", "common_name_f"]
    used_pairs = set()
    for record in table.itertuples():
        fields = record._asdict()
        preferred_term = fields["proper_name"]
        ingredient_id = fields["ingredient_id"]
        if invalid_ingredient(preferred_term):
            print(f"Removing {preferred_term}")
            continue

        concept_atoms = [
            Atom(term=preferred_term, src="NHPID", src_id=ingredient_id,
                 term_type="SN", is_preferred=True)
        ]
        used_pairs.add((preferred_term, "SN"))

        # Collect synonyms, skipping empty strings and repeated
        # (term, term type) pairs.
        for column in synonym_columns:
            synonym = fields[column]
            if column == "proper_name_f":
                term_type = "SN"
            else:
                term_type = "SY"
            if not synonym or (synonym, term_type) in used_pairs:
                continue
            used_pairs.add((synonym, term_type))
            concept_atoms.append(
                Atom(term=synonym, src="NHPID", src_id=fields["ingredient_id"],
                     term_type=term_type, is_preferred=False)
            )

        yield Concept.from_atoms(concept_atoms, concept_type="SDSI")
def read_concepts_file(infile):
    """
    Load concepts from a file holding one JSON object per line.

    Each non-blank line is parsed with ``json.loads`` and turned into a
    concept with ``Concept.from_dict``. Blank or whitespace-only lines are
    skipped, so stray empty lines do not abort the whole load with a
    ``json.JSONDecodeError``.

    :param str infile: Path to the file to read.
    :returns: The concepts, in file order.
    :rtype: list
    """
    concepts = []
    with open(infile, 'r') as inF:
        for line in inF:
            # json.loads rejects empty input, so skip blank lines.
            if not line.strip():
                continue
            concepts.append(Concept.from_dict(json.loads(line)))
    return concepts
def convert_ingredients_to_concepts(dataframe):
    """
    Create one "SDSI" Concept for each ingredient row of a DSLD table.

    Rows are read through the attributes ``group_name``, ``group_id``,
    ``synonyms`` and ``ingredient_category``. Rows whose group name
    (case-insensitively) is "header", "fat calories" or
    "polyunsaturated fat" are skipped.

    The group name is the preferred atom. Each non-empty synonym that
    differs (case-insensitively) from the terms already used in the same
    row is added as a non-preferred atom. A non-empty ingredient category
    is attached, stripped, as an ``ingredient_category`` attribute.

    :param pd.Dataframe dataframe: Table containing ingredients data.
    :returns: The SDSI concepts, in row order.
    :rtype: list
    """
    excluded_names = ("header", "fat calories", "polyunsaturated fat")
    term_type = "SY"  # All DSLD terms have term type SY
    Concept.set_ui_prefix("DSLD")

    ingredient_concepts = []
    for record in dataframe.itertuples():
        group_name = record.group_name
        if group_name.lower() in excluded_names:
            continue
        group_id = str(record.group_id)

        # Preferred term first, then every previously unseen synonym.
        concept_atoms = [
            Atom(term=group_name, src="DSLD", src_id=group_id,
                 term_type=term_type, is_preferred=True)
        ]
        lowered_terms = {group_name.lower()}
        for synonym in record.synonyms:
            if not synonym or synonym.lower() in lowered_terms:
                continue
            concept_atoms.append(
                Atom(term=synonym, src="DSLD", src_id=group_id,
                     term_type=term_type, is_preferred=False)
            )
            lowered_terms.add(synonym.lower())

        ingredient_concept = Concept(concept_type="SDSI", atoms=concept_atoms)
        if record.ingredient_category:
            ingredient_concept.add_elements(
                Attribute(
                    subject=ingredient_concept,
                    atr_name="ingredient_category",
                    atr_value=record.ingredient_category.strip(),
                    src="DSLD",
                )
            )
        ingredient_concepts.append(ingredient_concept)
    return ingredient_concepts
def create_ingredient_concepts(dataframe):
    """
    Build one "SDSI" concept per usable row of an NHPID ingredient table.

    Rows without a ``proper_name`` and exact duplicate rows are dropped,
    and remaining missing values become empty strings. Names are
    whitespace-normalised (runs collapsed to one space, ends trimmed).
    Rows whose proper name is rejected by ``invalid_ingredient_name`` are
    reported on stdout and skipped. An ``i/total`` progress counter is
    printed while iterating.

    Every concept receives:
      * a preferred "SN" atom for the proper name, plus non-preferred
        atoms for ``proper_name_f`` ("SN") and ``common_name`` /
        ``common_name_f`` ("SY");
      * a ``source_material`` attribute taken from ``source_material`` or,
        failing that, ``source_material_f``, when either is non-empty;
      * an ``ingredient_of`` relationship whose object is the product id.

    NOTE: the set of already-used (term, term type) pairs is shared by all
    rows, so a synonym is only emitted the first time that pair is met
    anywhere in the table.

    :param pandas.DataFrame dataframe: Dataframe containing ingredient data.
    :returns: The concepts created from the dataframe, in row order.
    :rtype: list
    """
    Concept.set_ui_prefix("NHPID")
    wanted_columns = [
        "ingredient_id", "product_id",
        "proper_name", "proper_name_f",
        "common_name", "common_name_f",
        "source_material", "source_material_f",
    ]
    table = (
        dataframe[wanted_columns]
        .copy()
        .dropna(subset=["proper_name"], axis=0)
        .drop_duplicates()
        .fillna("")
    )
    total = table.shape[0]

    synonym_columns = ["proper_name_f", "common_name", "common_name_f"]
    used_pairs = set()
    ingredient_concepts = []
    for index, record in enumerate(table.itertuples()):
        # Carriage return keeps the counter on a single terminal line.
        print(f"{index}/{total}\r", end='')
        fields = record._asdict()
        preferred_term = re.sub(r'\s+', ' ', fields["proper_name"]).strip()
        # NOTE(review): strips every ".0" occurrence, not only a trailing
        # one -- presumably ids are float-formatted strings; confirm.
        ingredient_id = fields["ingredient_id"].replace(".0", '')
        if invalid_ingredient_name(preferred_term):
            print(f"Removing invalid ingredient with name '{preferred_term}'")
            continue

        concept_atoms = [
            Atom(term=preferred_term, src="NHPID", src_id=ingredient_id,
                 term_type="SN", is_preferred=True)
        ]
        used_pairs.add((preferred_term, "SN"))

        # Collect synonyms, skipping empty strings and repeated
        # (term, term type) pairs.
        for column in synonym_columns:
            synonym = re.sub(r'\s+', ' ', fields[column]).strip()
            if column == "proper_name_f":
                term_type = "SN"
            else:
                term_type = "SY"
            if not synonym:
                continue
            if (synonym, term_type) in used_pairs:
                continue
            used_pairs.add((synonym, term_type))
            concept_atoms.append(
                Atom(term=synonym, src="NHPID", src_id=ingredient_id,
                     term_type=term_type, is_preferred=False)
            )

        ingredient_concept = Concept(concept_type="SDSI", atoms=concept_atoms)

        # Source attribute: use source_material, else source_material_f
        # (the French translation), when either is non-empty.
        source_material = fields["source_material"] or fields["source_material_f"]
        if source_material:
            ingredient_concept.add_elements(
                Attribute(
                    subject=ingredient_concept,
                    atr_name="source_material",
                    atr_value=source_material.strip(),
                    src="NHPID",
                )
            )

        # Link the ingredient to the product it belongs to.
        product_id = fields["product_id"].replace(".0", '')
        ingredient_concept.add_elements(
            Relationship(
                subject=ingredient_concept,
                rel_name="ingredient_of",
                object=product_id,
                src="NHPID",
            )
        )
        ingredient_concepts.append(ingredient_concept)
    return ingredient_concepts
def iterate_mskcc_file(self):
    """
    For MSKCC source data ONLY.

    Iterate the extracted JSONL file (``self.content_file``) and, for each
    line, build the iDISK representation of the herb and write it to the
    local file.

    Mapping applied to every record:

    ======================  =========================  =====================
    MSKCC key               iDISK element              Name / relationship
    ======================  =========================  =====================
    herb_name               preferred Atom ("SY")      --
    scientific_name         Atom ("SN")                --
    common_name             Atom ("CN")                --
    clinical_summary        Attribute                  background
    mechanism_of_action     Attribute                  mechanism_of_action
    warnings                Attribute                  safety
    purported_uses          Concept "DIS" (diseases)   is_effective_for
    adverse_reactions       Concept "SS" (signs /      has_adverse_reaction
                            symptoms)
    herb-drug_interactions  Concept "PD"               interacts_with
                            (pharmacological drug)
    ======================  =========================  =====================

    The herb itself is a Semantic Dietary Supplement Ingredient ("SDSI")
    Concept built from all of its name atoms. Related concepts are written
    by ``generate_idisk_schema`` as they are created; the herb concept is
    written last.
    """
    Concept.set_ui_prefix("MSKCC")

    # (attribute name, record key), applied in this order.
    attribute_sources = (
        ("background", "clinical_summary"),
        ("mechanism_of_action", "mechanism_of_action"),
        ("safety", "warnings"),
    )
    # (record key, concept type, relationship name), applied in this order.
    relation_sources = (
        ("purported_uses", "DIS", "is_effective_for"),
        ("adverse_reactions", "SS", "has_adverse_reaction"),
        ("herb-drug_interactions", "PD", "interacts_with"),
    )

    with open(self.content_file, "r") as jsonl:
        for raw_line in jsonl:
            record = json.loads(raw_line)

            # Names: the herb name is preferred; scientific and common
            # names are appended as extra atoms when present.
            name_atoms = self.generate_atom(record["herb_name"],
                                            prefer_label=True,
                                            term_type="SY")
            scientific_names = record["scientific_name"]
            common_names = record["common_name"]
            if len(scientific_names) > 0:
                scientific_names = self.split_names(scientific_names)
                scientific_atoms = self.generate_atom(
                    scientific_names, False, "SN")
                name_atoms.extend(scientific_atoms)
            if len(common_names) > 0:
                common_names = self.split_names(common_names)
                common_atoms = self.generate_atom(common_names, False, "CN")
                name_atoms.extend(common_atoms)

            herb = Concept("SDSI", atoms=name_atoms)

            # Free-text sections stored as attributes of the herb.
            for attribute_name, key in attribute_sources:
                text = record[key]
                herb = self.generate_attr(herb, attribute_name, text)

            # Sections split into items, each becoming a related concept.
            for key, concept_type, rel_name in relation_sources:
                content = record[key]
                if key == "herb-drug_interactions":
                    # Interaction text gets an extra clean-up pass first.
                    content = self.remove_useless_for_HDI(content)
                pieces = self.split_content(content)
                for piece in pieces:
                    herb = self.generate_idisk_schema(
                        piece, "SY", False, concept_type, rel_name, herb)

            # Finally persist the herb concept itself.
            self.write_to_local_file(herb)