def load_schemaorg_model(model_path):
    """Load a schema into a new SchemaExplorer, render it, and return the explorer.

    Args:
        model_path: location of the schema, passed unchanged to
            ``SchemaExplorer.load_schema``.

    Returns:
        The SchemaExplorer instance that the schema was loaded into.

    Side effects:
        Renders the full schema graph under the name "schema.org.model.pdf"
        with ``view=True``.
    """
    explorer = SchemaExplorer()
    explorer.load_schema(model_path)

    # Draw the complete schema graph using the "fdp" layout engine.
    graph = explorer.full_schema_graph()
    graph.engine = "fdp"
    output_name = os.path.basename("schema.org.model.pdf")
    graph.render(filename=output_name, view=True)

    return explorer
class MetadataModel(object):
    """Metadata model wrapper around a schema.org specification graph.

    Provides basic utilities to

    1) manipulate the metadata model;
    2) generate metadata model views:
        - generate a manifest view of the metadata model
          - usage: getModelManifest(rootNode)
        - generate a validation schemas view of the metadata model;
          - TODO: not currently part of the specification; to be defined.
    """

    def __init__(
        self,
        inputMModelLocation: str,
        inputMModelLocationType: str,
    ) -> None:
        """Instantiate a MetadataModel and load the schema it wraps.

        A new SchemaExplorer is created internally and the schema found at
        ``inputMModelLocation`` is loaded into it right away.

        Args:
            inputMModelLocation: local path, uri, synapse entity id
                (e.g. gs://, syn123, /User/x/...); present location
            inputMModelLocationType: one of [local, gs, aws, synapse];
                present location type. It is stored on the instance but is
                not consulted by loadMModel.
        """
        self.se = SchemaExplorer()

        self.inputMModelLocationType = inputMModelLocationType
        self.inputMModelLocation = inputMModelLocation

        self.loadMModel()

    # setting mutators/accessors methods explicitly

    @property
    def inputMModelLocation(self) -> str:
        """Gets or sets the inputMModelLocation path"""
        return self.__inputMModelLocation

    @inputMModelLocation.setter
    def inputMModelLocation(self, inputMModelLocation) -> None:
        self.__inputMModelLocation = inputMModelLocation

    @property
    def inputMModelLocationType(self) -> str:
        """Gets or sets the inputMModelLocationType"""
        return self.__inputMModelLocationType

    @inputMModelLocationType.setter
    def inputMModelLocationType(self, inputMModelLocationType) -> None:
        self.__inputMModelLocationType = inputMModelLocationType

    @property
    def se(self) -> SchemaExplorer:
        """Gets or sets the SchemaExplorer instance"""
        return self.__se

    @se.setter
    def se(self, se: SchemaExplorer) -> None:
        self.__se = se

    # business logic: expose metadata model "views" depending on "controller" logic
    # (somewhat analogous to the Model View Controller pattern for GUI/web applications)
    # i.e. jsonschemas, annotation manifests, metadata/annotation dictionary web explorer
    # are all "views" of the metadata model.
    # The "business logic" in this MetadataModel class provides functions exposing relevant parts
    # of the metadata model needed so that these views can be generated by user facing components;
    # controller components are (loosely speaking) responsible for handling the interaction between views and the model
    # some of these components right now reside in the Bundle class

    def loadMModel(self) -> None:
        """Load the schema at inputMModelLocation into the SchemaExplorer."""
        self.se.load_schema(self.inputMModelLocation)

    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
        """Get a schema subgraph from rootNode descendants on edges/node properties of type subgraphType.

        NOTE: not implemented yet; the body is a placeholder and the call
        currently returns None. The contract below is the intended one.

        Args:
            rootNode: a schema node label (i.e. term)
            subgraphType: the kind of subgraph to traverse (i.e. based on
                node properties or edge labels)

        Returns:
            a directed graph (networkx DiGraph) subgraph of the metadata
            model w/ vertex set root node descendants

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        pass

    def getModelManifest(self, rootNode: str, filenames: list = None) -> str:
        """Get the annotations manifest for a schema node.

        Args:
            rootNode: a schema node label (i.e. term)
            filenames: optional list of file names; when non-empty it is
                handed to the manifest generator as additional metadata
                under the "Filename" key.

        Returns:
            whatever ManifestGenerator.getManifest() returns; a manifest URI
            (assume Google doc for now)

        Raises:
            TODO ValueError: rootNode not found in metadata model.
        """
        additionalMetadata = {}
        if filenames:
            additionalMetadata["Filename"] = filenames

        # TODO: remove reference to HTAN; have a manifestName attribute
        mg = ManifestGenerator(self.se, rootNode, "HTAN_" + rootNode, additionalMetadata)

        return mg.getManifest()

    def validateModelManifest(self, manifestPath: str, rootNode: str) -> list:
        """Check whether a manifest csv satisfies the model requirements for rootNode.

        Every manifest row is validated against the JSON schema generated for
        ``rootNode``. Rows that fail are reported; rows that pass are not.

        Args:
            manifestPath: a path to the manifest csv file containing annotations
            rootNode: a schema node label (i.e. term)

        Returns:
            a list with one tuple per invalid row:
            ``(errorRow, errorTerms, errorValue, allowedValues)`` where

            - errorRow is the row index plus 2,
            - errorTerms is the first bracketed term on the second-to-last
              line of the error text, or an empty list if there is none,
            - errorValue is the first single-quoted token of the error
              message, or an empty string if the message has none,
            - allowedValues is the list of values in the first bracketed
              group of the error message, or an empty list if there is none.

            The list is empty when every row validates. It is also printed
            to stdout before being returned.

        Raises:
            TODO ValueError: rootNode not found in metadata model.
        """
        # get validation schema for a given node in the data model
        jsonSchema = get_JSONSchema_requirements(self.se, rootNode, rootNode + "_validation")

        # get annotations from manifest (array of json annotations corresponding to manifest rows)
        manifest = pd.read_csv(manifestPath).fillna("")
        annotations = json.loads(manifest.to_json(orient='records'))

        # Compiled once, outside the row loop. Raw strings are required here:
        # "\[" in a plain literal is an invalid escape sequence.
        bracketedExp = re.compile(r"\[(.*?)\]")
        quotedExp = re.compile(r"'(.*?)'")

        errorPositions = []
        for i, annotation in enumerate(annotations):
            try:
                validate(instance=annotation, schema=jsonSchema)
            # this error parsing is too brittle; if something changes in the validator code
            # outputting the validation error we'd have to change the logic;
            # TODO: provide a more robust error parsing
            except ValidationError as e:
                # row in the manifest where the error occurred; presumably +1 for
                # the csv header line and +1 for 1-based row numbering
                errorRow = i + 2

                errors = str(e).split("\n")

                # extract wrong value entered; not every validator message quotes
                # a value, so fall back to "" instead of failing on an empty match
                quotedValues = quotedExp.findall(errors[0])
                errorValue = quotedValues[0] if quotedValues else ""

                # extract allowed values, if any, for the term that was erroneously filled in
                allowedValues = bracketedExp.findall(errors[0])
                if allowedValues:
                    allowedValues = allowedValues[0].replace("'", "").split(", ")

                # extract the term(s) that had erroneously filled in values, if any;
                # they are looked up on the second-to-last line of the error text
                errorDetail = errors[-2] if len(errors) > 1 else ""
                errorTerms = bracketedExp.findall(errorDetail)
                if errorTerms:
                    errorTerms = errorTerms[0].replace("'", "").split(", ")[0]

                errorPositions.append((errorRow, errorTerms, errorValue, allowedValues))

        print(errorPositions)
        return errorPositions

    def populateModelManifest(self, manifestPath: str, rootNode: str) -> str:
        """Populate an existing annotations manifest based on a dataframe.

        An empty manifest is generated for ``rootNode`` first and then filled
        in from the csv file at ``manifestPath``.

        Args:
            manifestPath: a path to the manifest csv file containing annotations
            rootNode: a schema node label (i.e. term)

        Returns:
            a link to the filled in model manifest (e.g. google sheet)

        Raises:
            TODO ValueError: rootNode not found in metadata model.
        """
        mg = ManifestGenerator(self.se, rootNode, "HTAN_" + rootNode, {})

        emptyManifestURL = mg.getManifest()

        return mg.populateManifestSpreasheet(manifestPath, emptyManifestURL)
# instantiate schema explorer se = SchemaExplorer() # visualize biothings schema print("Visualizing BioThings schema...") full_schema = se.full_schema_graph() full_schema.render(filename=os.path.join(annotations_path, "biothings_schema.pdf"), view=True) print("Done") # load Sage annotations (that have been converted to JSON-LD; note that although a large set has been already converted # there are still annotation subsets that haven't been included) se.load_schema(os.path.join(annotations_path, base_sage_schema_file)) # visualize default base Sage schema print("Visualizing master Sage extension schema...") full_schema = se.full_schema_graph() full_schema.engine = "fdp" full_schema.render(filename=os.path.join(annotations_path, "master_sage_schema.pdf"), view=True) print("Done") print("Adding psychENCODE nodes...") ''' # add classes matching psychENCODE manifest specifications to biothings base ontology # for now we are hard-coding definitions; however, in the future we should have URI for each term and definition used in a dictionary # e.g. see https://schema.org/docs/schema_org_rdfa.html
""" ############################################### =============================================== ############################################### """ json_schema_output_dir = "./schemas" schemaorg_schema_input_dir = "./data" requires_dependency = "requiresDependency" requires_child = "requiresChildAsValue" if __name__ == "__main__": schemaorg_schema_file_name = "NFSchemaReq.jsonld" json_schema_file_name = "nf_jsonschema.json" se = SchemaExplorer() se.load_schema(os.path.join(schemaorg_schema_input_dir, schemaorg_schema_file_name)) g = se.get_nx_schema() json_schema = get_JSONSchema_requirements(se, "Thing", schema_name = "NFJSONschema") with open(os.path.join(json_schema_output_dir, json_schema_file_name), "w") as s_f: json.dump(json_schema, s_f, indent = 3)