def load_schemaorg_model(model_path):
    """Load a schema into a new SchemaExplorer, render its graph, and return it.

    Args:
        model_path: location of the schema, passed unchanged to
            ``SchemaExplorer.load_schema``.

    Returns:
        The ``SchemaExplorer`` instance holding the loaded schema.
    """
    explorer = SchemaExplorer()
    explorer.load_schema(model_path)

    # Visualize the loaded schema: the full graph is rendered with the "fdp"
    # engine and view=True.
    schema_graph = explorer.full_schema_graph()
    schema_graph.engine = "fdp"
    rendered_name = os.path.basename("schema.org.model.pdf")
    schema_graph.render(filename=rendered_name, view=True)

    return explorer
def __init__(
    self,
    inputMModelLocation: str,
    inputMModelLocationType: str,
) -> None:
    """Set up a MetadataModel object and trigger loading of its model.

    A fresh SchemaExplorer is created and stored on ``self.se``, the two
    location arguments are stored on the instance, and ``loadMModel()`` is
    called last.

    Args:
        inputMModelLocation: local path, uri, synapse entity id
            (e.g. gs://, syn123, /User/x/…); present location
        inputMModelLocationType: one of [local, gs, aws, synapse];
            present location type
    """
    # The explorer must exist before loadMModel() is invoked below.
    self.se = SchemaExplorer()

    self.inputMModelLocation = inputMModelLocation
    self.inputMModelLocationType = inputMModelLocationType

    self.loadMModel()
value_constraint = { 'rdfs:requiresChildAsValue': { '@id': 'sms:' + str(requires_value) } } class_attributes.update(value_constraint) return class_attributes # path to schema metadata (output or input) schema_path = "./schemas" output_schema_name = "scRNASeq" # instantiate schema explorer se = SchemaExplorer() """ ###################################################### # first add the classes w/o dependencies to the schema ###################################################### """ ''' adding children classes to the Biosample class in biothing ''' class_req_add = get_class("BiosampleType",\ description = "The type of source material for the biosample",\ subclass_of = ["Biosample"], requires_value = True ) se.update_class(class_req_add)
value_constraint = { 'rdfs:requiresChildAsValue': { '@id': 'sms:' + str(requires_value) } } class_attributes.update(value_constraint) return class_attributes # path to schema metadata (output or input) schema_path = "./schemas" output_schema_name = "exampleSchemaReq" # instantiate schema explorer se = SchemaExplorer() """ ###################################################### # first add the classes w/o dependencies to the schema ###################################################### """ ''' adding fileFormat as a child of Thing ''' class_req_add = get_class("fileFormat",\ description = "Defined format of the data file, typically corresponding to extension, but sometimes indicating more general group of files produced by the same tool or software",\ subclass_of = "Thing" ) se.update_class(class_req_add) ''' adding resourceType as a child of Thing
#'http://schema.org/domainIncludes':{'@id': 'bts:' + property_class_name}, #'http://schema.org/rangeIncludes':{'@id': 'schema:' + allowed_values}, return new_property def first_upper(s): return s[0].upper() + s[1:] if len(s) > 0 else s annotations_path = "./data" annotations_file = "psychENCODE.json" base_sage_schema_file = "masterSage.jsonld" # instantiate schema explorer se = SchemaExplorer() # visualize biothings schema print("Visualizing BioThings schema...") full_schema = se.full_schema_graph() full_schema.render(filename=os.path.join(annotations_path, "biothings_schema.pdf"), view=True) print("Done") # load Sage annotations (that have been converted to JSON-LD; note that although a large set has been already converted # there are still annotation subsets that haven't been included) se.load_schema(os.path.join(annotations_path, base_sage_schema_file)) # visualize default base Sage schema
} return class_attributes def first_upper(s): return s[0].upper() + s[1:] if len(s) > 0 else s # path to Synapse annotations annotations_path = "./data" annotations_file = "sageCommunity.json" base_schema_org_file = "experimentalData.jsonld" # instantiate schema explorer se = SchemaExplorer() se.load_schema(os.path.join(annotations_path, base_schema_org_file)) # visualize default schema full_schema = se.full_schema_graph() full_schema.render(filename=os.path.join( annotations_path, annotations_file + "biothings_schema.gv.pdf"), view=True) # add adhoc classes; TODO: this should be generated based on a metadata model schema ''' # experimentalData classes new_class = get_class("Assay",\ description = "The technology used to generate the data in this file",\ subclass_of = "Thing"\ )
if requires_value != None: value_constraint = {'rdfs:requiresChildAsValue':{'@id':'sms:' + str(requires_value)}} class_attributes.update(value_constraint) return class_attributes # path to schema metadata (output or input) schema_path = "./schemas" output_schema_name = "HTAPP" # instantiate schema explorer se = SchemaExplorer() """ ###################################################### # first add the classes w/o dependencies to the schema ###################################################### """ class_req_add = get_class("HTAPP",\ description = "HTAPP minimal metadata extension",\ subclass_of = ["Thing"] ) se.update_class(class_req_add) class_req_add = get_class("HTANParticipantID",\
class MetadataModel(object):
    """Metadata model wrapper around schema.org specification graph.

    Provides basic utilities to

    1) manipulate the metadata model;
    2) generate metadata model views:
        - generate manifest view of the metadata model
            - usage getModelManifest(rootNode)
        - generate validation schemas view of the metadata model;
            - TODO: not currently part of the specification; to be defined.
    """

    # Patterns used to pick apart validator error text. They are raw strings
    # (so "\[" is not an invalid string escape) and are compiled once here
    # instead of once per failing manifest row.
    _BRACKETED_PATTERN = re.compile(r"\[(.*?)\]")
    _QUOTED_PATTERN = re.compile(r"'(.*?)'")

    def __init__(
        self,
        inputMModelLocation: str,
        inputMModelLocationType: str,
    ) -> None:
        """Instantiates MetadataModel object and loads the model.

        A new SchemaExplorer is created internally (it is not an argument).

        Args:
            inputMModelLocation: local path, uri, synapse entity id
                (e.g. gs://, syn123, /User/x/…); present location
            inputMModelLocationType: one of [local, gs, aws, synapse];
                present location type
        """
        self.se = SchemaExplorer()

        self.inputMModelLocationType = inputMModelLocationType
        self.inputMModelLocation = inputMModelLocation

        self.loadMModel()

    # setting mutators/accessors methods explicitly
    @property
    def inputMModelLocation(self) -> str:
        """Gets or sets the inputMModelLocation path"""
        return self.__inputMModelLocation

    @inputMModelLocation.setter
    def inputMModelLocation(self, inputMModelLocation) -> None:
        self.__inputMModelLocation = inputMModelLocation

    @property
    def inputMModelLocationType(self) -> str:
        """Gets or sets the inputMModelLocationType"""
        return self.__inputMModelLocationType

    @inputMModelLocationType.setter
    def inputMModelLocationType(self, inputMModelLocationType) -> None:
        self.__inputMModelLocationType = inputMModelLocationType

    @property
    def se(self) -> SchemaExplorer:
        """Gets or sets the SchemaExplorer instance"""
        return self.__se

    @se.setter
    def se(self, se: SchemaExplorer) -> None:
        self.__se = se

    # business logic: expose metadata model "views" depending on "controller" logic
    # (somewhat analogous to Model View Controller pattern for GUI/web applications)
    # i.e. jsonschemas, annotation manifests, metadata/annotation dictionary web explorer
    # are all "views" of the metadata model.
    # The "business logic" in this MetadataModel class provides functions exposing relevant parts
    # of the metadata model needed so that these views can be generated by user facing components;
    # controller components are (loosely speaking) responsible for handling the interaction between views and the model
    # some of these components right now reside in the Bundle class

    def loadMModel(self) -> None:
        """Load the schema at ``inputMModelLocation`` into the SchemaExplorer.

        NOTE(review): ``inputMModelLocationType`` is not consulted here; the
        location is handed to ``SchemaExplorer.load_schema`` as-is.
        """
        self.se.load_schema(self.inputMModelLocation)

    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
        """Get a schema subgraph from rootNode descendants on edges/node properties of type subgraphType.

        Not implemented yet: the body is a stub, so the call currently
        returns None rather than a graph.

        Args:
            rootNode: a schema node label (i.e. term)
            subgraphType: the kind of subgraph to traverse (i.e. based on
                node properties or edge labels)

        Returns:
            (intended) a directed graph (networkx DiGraph) subgraph of the
            metadata model w/ vertex set root node descendants

        Raises:
            (intended) ValueError: rootNode not found in metadata model.
        """
        pass

    def getModelManifest(self, rootNode: str, filenames: list = None) -> str:
        """Get the annotations manifest for a schema node.

        Args:
            rootNode: a schema node label (i.e. term)
            filenames: optional list; when non-empty it is passed to the
                ManifestGenerator as additional metadata under "Filename"

        Returns:
            whatever ``ManifestGenerator.getManifest()`` returns; documented
            upstream as a manifest URI (assume Google doc for now)

        Raises:
            ValueError: rootNode not found in metadata model (stated contract;
                nothing in this method raises it directly).
        """
        additionalMetadata = {}
        if filenames:
            additionalMetadata["Filename"] = filenames

        # TODO: remove reference to HTAN; have a manifestName attribute
        mg = ManifestGenerator(self.se, rootNode, "HTAN_" + rootNode, additionalMetadata)

        return mg.getManifest()

    @classmethod
    def _parseValidationError(cls, error, errorRow: int) -> tuple:
        """Turn one validator error into an (row, term, value, allowed) tuple.

        The parsing works on ``str(error)`` and is therefore tied to the text
        layout the validator produces; TODO: provide a more robust error
        parsing.

        Args:
            error: the exception raised by ``validate``
            errorRow: manifest row number to report for this error

        Returns:
            ``(errorRow, errorTerms, errorValue, allowedValues)`` where
            - errorTerms is the first name found between brackets on the
              second-to-last line of the error text, or ``[]`` if none;
            - errorValue is the first single-quoted token on the first line
              of the error text, or ``""`` if that line has no quoted token;
            - allowedValues is the list of comma-separated entries of the
              first bracketed group on the first line, or ``[]`` if none.
        """
        errorLines = str(error).split("\n")
        summary = errorLines[0]

        # extract wrong value entered; messages without any quoted token
        # used to raise IndexError here and abort the whole validation run
        quotedTokens = cls._QUOTED_PATTERN.findall(summary)
        errorValue = quotedTokens[0] if quotedTokens else ""

        # extract allowed values, if any, for the term that was erroneously filled in
        allowedValues = cls._BRACKETED_PATTERN.findall(summary)
        if allowedValues:
            allowedValues = allowedValues[0].replace("'", "").split(", ")

        # extract the term(s) that had erroneously filled in values, if any;
        # guard against an error text that has fewer than two lines
        errorDetail = errorLines[-2] if len(errorLines) >= 2 else ""
        errorTerms = cls._BRACKETED_PATTERN.findall(errorDetail)
        if errorTerms:
            errorTerms = errorTerms[0].replace("'", "").split(", ")[0]

        return (errorRow, errorTerms, errorValue, allowedValues)

    def validateModelManifest(self, manifestPath: str, rootNode: str) -> list:
        """Check if provided annotations manifest satisfies all model requirements.

        Every row of the manifest csv is validated against the JSON schema
        generated for ``rootNode``; each failing row yields one entry in the
        returned list. The list is also printed to stdout.

        Args:
            manifestPath: a path to the manifest csv file containing annotations
            rootNode: a schema node label (i.e. term)

        Returns:
            a list with one ``(row, term, value, allowedValues)`` tuple per
            invalid manifest record (see ``_parseValidationError``); the list
            is empty when every record validates

        Raises:
            TODO ValueError: rootNode not found in metadata model.
        """
        # get validation schema for a given node in the data model
        jsonSchema = get_JSONSchema_requirements(self.se, rootNode, rootNode + "_validation")

        # get annotations from manifest (array of json annotations corresponding to manifest rows)
        manifest = pd.read_csv(manifestPath).fillna("")
        annotations = json.loads(manifest.to_json(orient='records'))

        errorPositions = []
        for i, annotation in enumerate(annotations):
            try:
                validate(instance=annotation, schema=jsonSchema)
            except ValidationError as e:
                # i is the 0-based record index; the reported manifest row is
                # i + 2 (presumably 1-based numbering plus the header line)
                errorPositions.append(self._parseValidationError(e, i + 2))

        print(errorPositions)
        return errorPositions

    def populateModelManifest(self, manifestPath: str, rootNode: str) -> str:
        """Populate an existing annotations manifest based on a dataframe.

        An empty manifest is generated first and then filled from the csv.

        Args:
            manifestPath: a path to the manifest csv file containing annotations
            rootNode: a schema node label (i.e. term)

        Returns:
            a link to the filled in model manifest (e.g. google sheet)

        Raises:
            TODO ValueError: rootNode not found in metadata model.
        """
        mg = ManifestGenerator(self.se, rootNode, "HTAN_" + rootNode, {})

        emptyManifestURL = mg.getManifest()

        # NOTE(review): "Spreasheet" is the spelling this code has always
        # called; confirm against ManifestGenerator before renaming.
        return mg.populateManifestSpreasheet(manifestPath, emptyManifestURL)
""" ############################################### =============================================== ############################################### """ json_schema_output_dir = "./schemas" schemaorg_schema_input_dir = "./data" requires_dependency = "requiresDependency" requires_child = "requiresChildAsValue" if __name__ == "__main__": schemaorg_schema_file_name = "NFSchemaReq.jsonld" json_schema_file_name = "nf_jsonschema.json" se = SchemaExplorer() se.load_schema(os.path.join(schemaorg_schema_input_dir, schemaorg_schema_file_name)) g = se.get_nx_schema() json_schema = get_JSONSchema_requirements(se, "Thing", schema_name = "NFJSONschema") with open(os.path.join(json_schema_output_dir, json_schema_file_name), "w") as s_f: json.dump(json_schema, s_f, indent = 3)