예제 #1
0
def create_product_concepts(dataframe):
    """
    Build one "DSP" concept for every row of a product dataframe.

    Only the "product_id" and "product_name" columns are read. Each
    concept carries a single preferred "SY" atom: its term is the
    product name with whitespace runs collapsed to one space, and its
    source id is the product id with every ".0" occurrence removed.
    A carriage-return progress counter is printed to stdout.

    :param pandas.DataFrame dataframe: Dataframe containing product data.
    :returns: The concepts created from the dataframe, in row order.
    :rtype: list
    """
    Concept.set_ui_prefix("NHPID")

    dataframe = dataframe[["product_id", "product_name"]].copy()
    concepts = []
    for i, row in enumerate(dataframe.itertuples()):
        print(f"{i}/{dataframe.shape[0]}\r", end='')
        # Ids appear to be float-formatted strings (e.g. "123.0").
        src_id = row.product_id.replace(".0", '')
        term = re.sub(r'\s+', ' ', str(row.product_name)).strip()
        preferred_atom = Atom(term=term,
                              src="NHPID",
                              src_id=src_id,
                              term_type="SY",
                              is_preferred=True)
        concepts.append(Concept(concept_type="DSP", atoms=[preferred_atom]))
    return concepts
예제 #2
0
def convert_products_to_concepts(json_data):
    """
    Build one "DSP" concept for every DSLD product record.

    Each record contributes a preferred "SY" atom (from "Product_Name"
    and "DSLD_ID"), an optional "langual_type" attribute when
    "LanguaL_Product_Type" is truthy, and one "has_ingredient"
    relationship per entry of "ingredients", pointing at that entry's
    "Ingredient_Group_GRP_ID".

    :param json_data: Iterable of product records (mappings).
    :returns: The concepts, in input order.
    :rtype: list
    """
    Concept.set_ui_prefix("DSLD")
    products = []
    for record in json_data:
        name_atom = Atom(term=record["Product_Name"],
                         src="DSLD",
                         src_id=record["DSLD_ID"],
                         term_type="SY",
                         is_preferred=True)
        product = Concept(concept_type="DSP", atoms=[name_atom])

        # LanguaL Product Type attribute, only when a value is present.
        langual_type = record["LanguaL_Product_Type"]
        if langual_type:
            product.add_elements(Attribute(subject=product,
                                           atr_name="langual_type",
                                           atr_value=langual_type,
                                           src="DSLD"))

        # Link the product to each of its ingredient groups.
        for ingredient in record["ingredients"]:
            group_id = ingredient["Ingredient_Group_GRP_ID"]
            product.add_elements(Relationship(subject=product,
                                              rel_name="has_ingredient",
                                              object=group_id,
                                              src="DSLD"))

        products.append(product)
    return products
예제 #3
0
    def generate_idisk_schema(self, value, value_type, prefer_label,
                              concept_type, rel_name, from_concept):
        """
        Link ``from_concept`` to a new Concept built from ``value``.

        Steps:
        1. generate the Atom(s) for ``value`` via ``self.generate_atom``
        2. if any Atoms were produced, wrap them in a new Concept
        3. relate ``from_concept`` to that Concept via
           ``self.generate_rel``
        4. write the new (object) Concept via ``self.write_to_local_file``

        When no Atoms are produced, nothing is created or written and
        ``from_concept`` is returned as received.

        :param str/list value: herb content
        :param str value_type: herb content Atom type
        :param bool prefer_label: whether or not the term is preferred
        :param str concept_type: Concept type for the generated Atom
        :param str rel_name: Relationship type given the generated Concept
        :param Concept from_concept: the subject Concept of this schema

        :return: the subject Concept of this Relationship
        :rtype: Concept
        """
        atoms = self.generate_atom(value, prefer_label, value_type)
        # Nothing to link when the content yields no atoms.
        if len(atoms) == 0:
            return from_concept

        object_concept = Concept(concept_type, atoms=atoms)
        from_concept, object_concept = self.generate_rel(
            from_concept, object_concept, rel_name)
        self.write_to_local_file(object_concept)
        return from_concept
def to_concepts(dataframe):
    """
    Lazily yield one "SDSI" concept per usable ingredient row.

    Rows without a "proper_name" and exact duplicate rows are dropped
    first. The proper name becomes the preferred "SN" atom; the French
    proper name ("SN") and both common-name columns ("SY") become
    non-preferred atoms. Rows for which ``invalid_ingredient`` is truthy
    are reported on stdout and skipped.

    The set of already-used (term, term type) pairs is shared by all
    rows, so a synonym emitted for one concept is not emitted again for
    a later concept.

    :param pandas.DataFrame dataframe: Table containing ingredient data.
    :returns: Generator over the created concepts.
    :rtype: generator
    """
    Concept.set_ui_prefix("NHPID")

    wanted_columns = [
        "ingredient_id", "proper_name", "proper_name_f", "common_name",
        "common_name_f"
    ]
    dataframe = dataframe[wanted_columns].copy()
    dataframe.dropna(subset=["proper_name"], axis=0, inplace=True)
    dataframe.drop_duplicates(inplace=True)
    dataframe.fillna("", inplace=True)

    # Synonym columns paired with the term type their atoms receive.
    synonym_types = (
        ("proper_name_f", "SN"),
        ("common_name", "SY"),
        ("common_name_f", "SY"),
    )

    used_pairs = set()
    for record in dataframe.itertuples():
        pref_term = record.proper_name
        ingredient_id = record.ingredient_id
        if invalid_ingredient(pref_term):
            print(f"Removing {pref_term}")
            continue

        atoms = [Atom(term=pref_term,
                      src="NHPID",
                      src_id=ingredient_id,
                      term_type="SN",
                      is_preferred=True)]
        used_pairs.add((pref_term, "SN"))

        # Add the synonyms, skipping empty strings and any
        # (term, term type) pair that has been used before.
        for column, tty in synonym_types:
            term = getattr(record, column)
            if not term or (term, tty) in used_pairs:
                continue
            used_pairs.add((term, tty))
            atoms.append(Atom(term=term,
                              src="NHPID",
                              src_id=ingredient_id,
                              term_type=tty,
                              is_preferred=False))

        yield Concept.from_atoms(atoms, concept_type="SDSI")
예제 #5
0
def read_concepts_file(infile):
    """
    Load concepts from a JSON Lines file.

    Every non-blank line of ``infile`` is parsed as JSON and converted
    into a concept with ``Concept.from_dict``.

    :param str infile: Path to the JSON Lines file.
    :returns: The concepts, in file order.
    :rtype: list
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(infile, 'r') as inF:
        # Blank lines (e.g. a trailing newline at the end of the file)
        # hold no record; skip them instead of failing in json.loads.
        return [Concept.from_dict(json.loads(line))
                for line in inF if line.strip()]
예제 #6
0
def convert_ingredients_to_concepts(dataframe):
    """
    Create an "SDSI" Concept for each ingredient row of the dataframe.

    Rows whose lower-cased ``group_name`` is one of a small set of
    excluded names are skipped. For the others, ``group_name`` becomes
    the preferred atom and every non-empty entry of ``synonyms`` that
    has not been seen for this row (compared case-insensitively) becomes
    a non-preferred atom. A truthy ``ingredient_category`` is attached,
    stripped, as an "ingredient_category" attribute.

    :param pd.Dataframe dataframe: Table containing ingredients data.
    :returns: The SDSI concepts, in row order.
    :rtype: list
    """
    excluded_names = ("header", "fat calories", "polyunsaturated fat")

    Concept.set_ui_prefix("DSLD")
    term_type = "SY"  # All DSLD terms have term type SY
    ingredients = []
    for record in dataframe.itertuples():
        group_name = record.group_name
        lowered_name = group_name.lower()
        if lowered_name in excluded_names:
            continue

        group_id = str(record.group_id)
        # The preferred term comes first, followed by its synonyms.
        atoms = [Atom(term=group_name,
                      src="DSLD",
                      src_id=group_id,
                      term_type=term_type,
                      is_preferred=True)]
        seen_terms = {lowered_name}
        for synonym in record.synonyms:
            if not synonym:
                continue
            lowered_synonym = synonym.lower()
            if lowered_synonym in seen_terms:
                continue
            atoms.append(Atom(term=synonym,
                              src="DSLD",
                              src_id=group_id,
                              term_type=term_type,
                              is_preferred=False))
            seen_terms.add(lowered_synonym)

        ingredient = Concept(concept_type="SDSI", atoms=atoms)

        if record.ingredient_category:
            ingredient.add_elements(
                Attribute(subject=ingredient,
                          atr_name="ingredient_category",
                          atr_value=record.ingredient_category.strip(),
                          src="DSLD"))
        ingredients.append(ingredient)
    return ingredients
예제 #7
0
def create_ingredient_concepts(dataframe):
    """
    Build one "SDSI" concept for every usable ingredient row.

    Rows without a "proper_name" and exact duplicate rows are dropped
    first. For each remaining row whose whitespace-normalised proper
    name passes ``invalid_ingredient_name``:

    * the proper name becomes the preferred "SN" atom;
    * the French proper name ("SN") and both common-name columns ("SY")
      become non-preferred atoms, unless empty or already used as the
      same (term, term type) pair -- the set of used pairs is shared by
      all rows;
    * a "source_material" attribute is added from "source_material",
      falling back to "source_material_f" when the former is empty;
    * an "ingredient_of" relationship points at the row's product id.

    Ids have every ".0" occurrence removed. A carriage-return progress
    counter is printed to stdout.

    :param pandas.DataFrame dataframe: Dataframe containing ingredient data.
    :returns: The concepts created from the dataframe, in row order.
    :rtype: list
    """
    Concept.set_ui_prefix("NHPID")

    wanted_columns = [
        "ingredient_id", "product_id", "proper_name", "proper_name_f",
        "common_name", "common_name_f", "source_material", "source_material_f"
    ]
    dataframe = dataframe[wanted_columns].copy()
    dataframe.dropna(subset=["proper_name"], axis=0, inplace=True)
    dataframe.drop_duplicates(inplace=True)
    dataframe.fillna("", inplace=True)

    # Synonym columns paired with the term type their atoms receive.
    synonym_types = (
        ("proper_name_f", "SN"),
        ("common_name", "SY"),
        ("common_name_f", "SY"),
    )

    concepts = []
    used_pairs = set()
    for i, record in enumerate(dataframe.itertuples()):
        print(f"{i}/{dataframe.shape[0]}\r", end='')
        pref_term = re.sub(r'\s+', ' ', record.proper_name).strip()
        ingredient_id = record.ingredient_id.replace(".0", '')
        if invalid_ingredient_name(pref_term):
            print(f"Removing invalid ingredient with name '{pref_term}'")
            continue

        atoms = [Atom(term=pref_term,
                      src="NHPID",
                      src_id=ingredient_id,
                      term_type="SN",
                      is_preferred=True)]
        used_pairs.add((pref_term, "SN"))

        # Add the synonyms, skipping empty strings and any
        # (term, term type) pair that has been used before.
        for column, tty in synonym_types:
            term = re.sub(r'\s+', ' ', getattr(record, column)).strip()
            if term and (term, tty) not in used_pairs:
                used_pairs.add((term, tty))
                atoms.append(Atom(term=term,
                                  src="NHPID",
                                  src_id=ingredient_id,
                                  term_type=tty,
                                  is_preferred=False))

        ingredient = Concept(concept_type="SDSI", atoms=atoms)

        # Source attribute: the English value, else its French translation.
        raw_source = record.source_material or record.source_material_f
        if raw_source:
            ingredient.add_elements(Attribute(subject=ingredient,
                                              atr_name="source_material",
                                              atr_value=raw_source.strip(),
                                              src="NHPID"))

        # Link the ingredient to the product it belongs to.
        product_id = record.product_id.replace(".0", '')
        ingredient.add_elements(Relationship(subject=ingredient,
                                             rel_name="ingredient_of",
                                             object=product_id,
                                             src="NHPID"))

        concepts.append(ingredient)
    return concepts
예제 #8
0
    def iterate_mskcc_file(self):
        """
        Convert the extracted MSKCC JSONL file into iDISK output.

        For MSKCC source data ONLY. Each line of ``self.content_file``
        is parsed as JSON and processed in this order:

        JSON key                 Handling
        herb_name                preferred "SY" Atom of the herb Concept
        scientific_name          split by ``split_names``; "SN" Atoms
        common_name              split by ``split_names``; "CN" Atoms
        clinical_summary         ``generate_attr`` as "background"
        mechanism_of_action      ``generate_attr`` as "mechanism_of_action"
        warnings                 ``generate_attr`` as "safety"
        purported_uses           "DIS" Concepts, "is_effective_for"
        adverse_reactions        "SS" Concepts, "has_adverse_reaction"
        herb-drug_interactions   "PD" Concepts, "interacts_with"

        The herb Concept has type "SDSI". The last three keys are split
        with ``split_content`` (herb-drug interactions are first passed
        through ``remove_useless_for_HDI``) and each piece is handed to
        ``generate_idisk_schema``. Finally the herb Concept is passed to
        ``write_to_local_file``.
        """
        # (attribute name, JSON key) pairs, in processing order.
        attribute_fields = (
            ("background", "clinical_summary"),
            ("mechanism_of_action", "mechanism_of_action"),
            ("safety", "warnings"),
        )
        # (JSON key, Concept type, Relationship name), in processing order.
        related_fields = (
            ("purported_uses", "DIS", "is_effective_for"),
            ("adverse_reactions", "SS", "has_adverse_reaction"),
            ("herb-drug_interactions", "PD", "interacts_with"),
        )

        Concept.set_ui_prefix("MSKCC")
        with open(self.content_file, "r") as source:
            for raw_line in source:
                record = json.loads(raw_line)
                herb_atoms = self.generate_atom(record["herb_name"],
                                                prefer_label=True,
                                                term_type="SY")
                # Scientific names first, then common names.
                name_fields = (
                    (record["scientific_name"], "SN"),
                    (record["common_name"], "CN"),
                )
                for names, term_type in name_fields:
                    if len(names) != 0:
                        split = self.split_names(names)
                        herb_atoms.extend(
                            self.generate_atom(split, False, term_type))

                # Build the herb Concept from all of its names.
                herb_concept = Concept("SDSI", atoms=herb_atoms)

                for attr_name, key in attribute_fields:
                    herb_concept = self.generate_attr(herb_concept,
                                                      attr_name, record[key])

                for key, concept_type, rel_name in related_fields:
                    content = record[key]
                    if key == "herb-drug_interactions":
                        content = self.remove_useless_for_HDI(content)
                    for piece in self.split_content(content):
                        herb_concept = self.generate_idisk_schema(
                            piece, "SY", False,
                            concept_type, rel_name,
                            herb_concept)

                # Write the finished herb Concept to the local file.
                self.write_to_local_file(herb_concept)