def test_replace_child(self):
    """replace_child swaps a same-named child in place, removes the old child
    from the node store, and raises ValueError on a node-name mismatch."""
    individual_name = Node(names.INDIVIDUALNAME)
    sur_name_1 = Node(names.SURNAME, parent=individual_name)
    sur_name_1.content = 'Gaucho'
    individual_name.add_child(sur_name_1)
    sur_name_2 = Node(names.SURNAME, parent=individual_name)
    sur_name_2.content = 'Carroll'
    self.assertIn(sur_name_1, individual_name.children)
    self.assertNotIn(sur_name_2, individual_name.children)
    individual_name.replace_child(old_child=sur_name_1, new_child=sur_name_2)
    self.assertIn(sur_name_2, individual_name.children)
    self.assertNotIn(sur_name_1, individual_name.children)
    # Test for old child removal from node store
    self.assertNotIn(sur_name_1.id, Node.store)
    # Test for child node type mismatch.
    # BUGFIX: the old try/except pattern passed silently when no exception was
    # raised at all; assertRaises fails the test if ValueError is not raised.
    given_name = Node(names.GIVENNAME)
    given_name.content = 'Chase'
    with self.assertRaises(ValueError):
        individual_name.replace_child(old_child=sur_name_2, new_child=given_name)
def _process_element(e, clean, literals) -> Node:
    """
    Process an lxml etree element into a Metapype node.

    If ``clean`` is True, element text and tail have leading/trailing
    whitespace stripped and internal whitespace runs collapsed to single
    spaces, except:
      * elements whose tag is listed in ``literals`` keep their text verbatim;
      * text/tail consisting entirely of spaces and/or non-breaking spaces
        is kept as-is.

    Args:
        e: lxml etree element
        clean: boolean to clean leading and trailing whitespace from node content
        literals: tuple of XML elements whose content should not be altered

    Returns:
        Node
    """
    tag = e.tag[e.tag.find("}") + 1:]  # Remove any prepended namespace
    node = Node(tag)
    node.nsmap = e.nsmap
    node.prefix = e.prefix
    if clean:
        if e.text is not None:
            if tag in literals:
                # Literal elements keep content exactly as parsed
                node.content = e.text
            else:
                # if text consists entirely of one or more spaces and/or
                # non-breaking spaces, keep it
                if re.search("^[ \xA0]+$", e.text):
                    node.content = e.text
                else:
                    # Whitespace-only text becomes None; otherwise collapse
                    # internal whitespace runs to single spaces
                    node.content = None if e.text.strip() == '' else " ".join(
                        e.text.split())
        if e.tail is not None:
            # if tail consists entirely of one or more spaces and/or
            # non-breaking spaces, keep it
            if re.search("^[ \xA0]+$", e.tail):
                node.tail = e.tail
            else:
                node.tail = None if e.tail.strip() == '' else " ".join(
                    e.tail.split())
    else:
        # No cleaning requested: copy text and tail verbatim
        node.content = e.text
        node.tail = e.tail
    for name, value in e.attrib.items():
        if "{" not in name:
            node.add_attribute(name, value)
        else:
            # Namespaced attribute: store under its prefixed name as an extra
            nsname = _format_extras(name, node.nsmap)
            node.add_extras(nsname, value)
    for _ in e:
        # Skip XML comments; recurse into real child elements
        if _.tag is not etree.Comment:
            node.add_child(_process_element(_, clean, literals))
    for child in node.children:
        child.parent = node
        if child.nsmap == node.nsmap:
            child.nsmap = node.nsmap  # Map to single instance of nsmap
    return node
def load_other_entity(dataset_node: Node = None, uploads_path: str = None,
                      data_file: str = ''):
    """
    Build an EML otherEntity subtree for an uploaded data file and attach it
    to the given dataset node.

    Populates physical metadata (objectName, size in bytes, MD5
    authentication, externally defined format) plus entityName and
    entityType, then deletes the uploaded files from uploads_path.

    Args:
        dataset_node: parent dataset node to attach the otherEntity to
        uploads_path: directory containing the uploaded file
        data_file: file name of the uploaded data file

    Returns:
        The new otherEntity node.
    """
    full_path = f'{uploads_path}/{data_file}'
    other_entity_node = Node(names.OTHERENTITY, parent=dataset_node)
    add_child(dataset_node, other_entity_node)
    physical_node = Node(names.PHYSICAL, parent=other_entity_node)
    add_child(other_entity_node, physical_node)
    physical_node.add_attribute('system', 'EDI')
    entity_name_node = Node(names.ENTITYNAME, parent=other_entity_node)
    add_child(other_entity_node, entity_name_node)
    entity_name = entity_name_from_data_file(data_file)
    entity_name_node.content = entity_name
    object_name_node = Node(names.OBJECTNAME, parent=physical_node)
    add_child(physical_node, object_name_node)
    object_name_node.content = data_file
    # Size and hash nodes are only added when the file is readable
    file_size = get_file_size(full_path)
    if file_size is not None:
        size_node = Node(names.SIZE, parent=physical_node)
        add_child(physical_node, size_node)
        size_node.add_attribute('unit', 'byte')
        size_node.content = str(file_size)
    md5_hash = get_md5_hash(full_path)
    if md5_hash is not None:
        hash_node = Node(names.AUTHENTICATION, parent=physical_node)
        add_child(physical_node, hash_node)
        hash_node.add_attribute('method', 'MD5')
        hash_node.content = str(md5_hash)
    data_format_node = Node(names.DATAFORMAT, parent=physical_node)
    add_child(physical_node, data_format_node)
    externally_defined_format_node = Node(names.EXTERNALLYDEFINEDFORMAT,
                                          parent=data_format_node)
    add_child(data_format_node, externally_defined_format_node)
    format_name_node = Node(names.FORMATNAME,
                            parent=externally_defined_format_node)
    add_child(externally_defined_format_node, format_name_node)
    format_name_node.content = format_name_from_data_file(data_file)
    # entityType carries the same derived format name as formatName
    entity_type_node = new_child_node(names.ENTITYTYPE,
                                      parent=other_entity_node)
    entity_type_node.content = format_name_from_data_file(data_file)
    delete_data_files(uploads_path)
    return other_entity_node
def test_validate_annotation():
    """An annotation with a labeled propertyURI and valueURI validates."""

    def make_uri_node(node_name, uri, label):
        # Build a URI node with content and a 'label' attribute
        uri_node = Node(node_name)
        uri_node.content = uri
        uri_node.add_attribute("label", label)
        return uri_node

    annotation_node = Node(names.ANNOTATION)
    annotation_node.add_child(
        make_uri_node(names.PROPERTYURI,
                      "http://purl.obolibrary.org/obo/IAO_0000136",
                      "some property label"))
    annotation_node.add_child(
        make_uri_node(names.VALUEURI,
                      "http://purl.obolibrary.org/obo/IAO_0000136",
                      "some value label"))
    validate.tree(annotation_node)
def to_xml(node: Node, level: int = 0) -> str:
    """
    Serialize a metapype node and its subtree to an EML XML string.

    Args:
        node: root of the subtree to serialize
        level: current indentation depth (0 for the document root)

    Returns:
        XML string; each element's line is newline-terminated
    """
    xml = ""
    closed = False
    # Namespace boilerplate attached only to the root <eml:eml> element
    boiler = ('xmlns:eml="https://eml.ecoinformatics.org/eml-2.2.0" '
              'xmlns:stmml="http://www.xml-cml.org/schema/stmml-1.2" '
              'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
              'xsi:schemaLocation="https://eml.ecoinformatics.org/eml-2.2.0 '
              'https://nis.lternet.edu/schemas/EML/eml-2.2.0/xsd/eml.xsd"')
    name = node.name
    attributes = ""
    for attribute in node.attributes:
        attributes += ' {0}="{1}"'.format(attribute, node.attributes[attribute])
    if level == 0:
        indent = ""
        if name == "eml":
            name = node.name + ":" + node.name  # root tag is <eml:eml>
            attributes += " " + boiler
    else:
        indent = space * level
    open_tag = "<" + name + attributes + ">"
    close_tag = "</" + name + ">"
    xml += indent + open_tag
    if node.content is not None:
        if isinstance(node.content, str):
            # if it hasn't been escaped already, escape it
            if all(x not in node.content for x in ('&', '<', '>')):
                node.content = escape(node.content)
            # Hopefully, this is a temporary hack. Need to figure out a better way...
            # The problem is that <para> tags are treated idiosyncratically because
            # their rules aren't fully supported. They appear within node content,
            # unlike other tags.
            # BUGFIX: the replace calls were no-ops ('<para>' -> '<para>'); the
            # intent is to un-escape previously escaped para tags so they
            # serialize as real tags inside the content.
            node.content = node.content.replace('&lt;para&gt;', '<para>').replace(
                '&lt;/para&gt;', '</para>')
        xml += str(node.content) + close_tag + "\n"
        closed = True
    elif len(node.children) > 0:
        xml += "\n"
    for child in node.children:
        xml += to_xml(child, level + 1)
    if not closed:
        if len(node.children) > 0:
            xml += indent
        xml += close_tag + "\n"
    return xml
def test_taxonid():
    """A taxonId node validates only once its 'provider' attribute is set."""
    taxon_id_node = Node(names.TAXONID, parent=None)
    taxon_id_node.content = "42"
    # No provider attribute yet, so validation must fail
    with pytest.raises(MetapypeRuleError):
        validate.node(taxon_id_node)
    # Supplying the provider satisfies the rule
    taxon_id_node.add_attribute("provider", "https://www.itis.gov")
    validate.node(taxon_id_node)
def test_to_json(self):
    """Serializing a small access-control EML tree with mp_io.to_json
    yields a JSON string."""

    def attach(parent, name, content=None, **attrs):
        # Build a child node, set its attributes/content, and link it in
        child = Node(name, parent=parent)
        for key, value in attrs.items():
            child.add_attribute(key, value)
        if content is not None:
            child.content = content
        parent.add_child(child)
        return child

    eml = Node(names.EML)
    eml.add_attribute('packageId', 'edi.23.1')
    eml.add_attribute('system', 'metapype')
    access = attach(eml, names.ACCESS, authSystem='pasta', order='allowFirst')
    allow = attach(access, names.ALLOW)
    attach(allow, names.PRINCIPAL,
           content='uid=gaucho,o=EDI,dc=edirepository,dc=org')
    attach(allow, names.PERMISSION, content='all')
    self.assertIsInstance(mp_io.to_json(eml), str)
def test_delete_node_no_children():
    """Deleting a node non-recursively must leave its descendants in the
    node store."""

    def attach(parent, name, content=None):
        child = Node(name, parent=parent)
        if content is not None:
            child.content = content
        parent.add_child(child)
        return child

    eml = Node(names.EML)
    eml.add_attribute("packageId", "edi.23.1")
    eml.add_attribute("system", "metapype")
    access = Node(names.ACCESS, parent=eml)
    access.add_attribute("authSystem", "pasta")
    access.add_attribute("order", "allowFirst")
    eml.add_child(access)
    allow = attach(access, names.ALLOW)
    principal = attach(allow, names.PRINCIPAL,
                       "uid=gaucho,o=EDI,dc=edirepository,dc=org")
    attach(allow, names.PERMISSION, "all")
    # The store must hand back the identical instance
    assert Node.get_node_instance(principal.id) is principal
    # children=False: only the root is deleted, descendants remain stored
    Node.delete_node_instance(eml.id, children=False)
    assert principal.id in Node.store
def test_delete_node_no_children(self):
    """Non-recursive delete of the root must leave descendant nodes in the
    node store."""

    def attach(parent, name, content=None):
        child = Node(name, parent=parent)
        if content is not None:
            child.content = content
        parent.add_child(child)
        return child

    eml = Node(names.EML)
    eml.add_attribute('packageId', 'edi.23.1')
    eml.add_attribute('system', 'metapype')
    access = Node(names.ACCESS, parent=eml)
    access.add_attribute('authSystem', 'pasta')
    access.add_attribute('order', 'allowFirst')
    eml.add_child(access)
    allow = attach(access, names.ALLOW)
    principal = attach(allow, names.PRINCIPAL,
                       'uid=gaucho,o=EDI,dc=edirepository,dc=org')
    attach(allow, names.PERMISSION, 'all')
    # The store must hand back the identical instance
    self.assertIs(principal, Node.get_node_instance(principal.id))
    # children=False: only the root is deleted, descendants remain stored
    Node.delete_node_instance(eml.id, children=False)
    self.assertIn(principal.id, Node.store)
def test_replace_child():
    """replace_child swaps same-named children, evicts the old node from the
    store, and rejects a replacement with a different node name."""
    parent = Node(names.INDIVIDUALNAME)
    old_surname = Node(names.SURNAME, parent=parent)
    old_surname.content = "Gaucho"
    parent.add_child(old_surname)
    new_surname = Node(names.SURNAME, parent=parent)
    new_surname.content = "Carroll"
    assert old_surname in parent.children
    assert new_surname not in parent.children
    parent.replace_child(old_child=old_surname, new_child=new_surname)
    assert new_surname in parent.children
    assert old_surname not in parent.children
    # The replaced node must be removed from the node store
    assert old_surname.id not in Node.store
    # Replacing with a node of a different name must raise ValueError
    mismatched = Node(names.GIVENNAME)
    mismatched.content = "Chase"
    with pytest.raises(ValueError):
        parent.replace_child(old_child=new_surname, new_child=mismatched)
def test_missing_numerical_unit():
    """A unit node with no child fails with MIN_CHOICE_UNMET; adding a
    customUnit child satisfies the rule."""
    unit_node = Node(names.UNIT, parent=None)
    unit_rule = rule.get_rule(names.UNIT)
    with pytest.raises(MetapypeRuleError):
        unit_rule.validate_rule(unit_node)
    # Tree validation collects exactly one error describing the unmet choice
    errors = []
    validate.tree(unit_node, errors)
    assert len(errors) == 1
    error_code, _message, _node, *details = errors[0]
    assert error_code == ValidationError.MIN_CHOICE_UNMET
    assert details[0] == 'unit'
    # With a customUnit, it should be ok
    custom_unit_node = Node(names.CUSTOMUNIT, parent=unit_node)
    custom_unit_node.content = 'bushels per parsec'
    unit_node.add_child(custom_unit_node)
    validate.tree(unit_node)
def from_xml_element(xml_elem, metapype_node, metapype_parent):
    """
    Recursively mirror an lxml element subtree as metapype nodes.

    Args:
        xml_elem: the xml element.
        metapype_node: the metapype node corresponding to that xml element;
            None except at the root of the tree, where the caller supplies it.
        metapype_parent: the parent metapype node for this node.
    """
    node = metapype_node
    if node is None:
        # Non-root elements get a freshly created node
        node = Node(name=xml_elem.tag, parent=metapype_parent)
    # Only copy non-namespaced attributes (namespaced names contain '}')
    for attr_name, attr_value in xml_elem.attrib.items():
        if '}' not in attr_name:
            node.add_attribute(attr_name, attr_value)
    if xml_elem.text:
        node.content = xml_elem.text
    if metapype_parent is not None:
        metapype_parent.add_child(node)
    for child_elem in xml_elem:
        from_xml_element(child_elem, None, node)
def from_json(json_node: dict, parent: Node = None) -> Node:
    """
    Recursively rebuild a metapype model subtree from its JSON (dict) form.

    Args:
        json_node: JSON converted to a Python dict wrapping one {name: body}
            pair, where body is a list of id/attributes/content/children dicts
        parent: parent node to link the rebuilt child to

    Returns:
        Node: root of the rebuilt subtree
    """
    # Unpack the single {name: body} pair, discarding the outer wrapper
    name, body = json_node.popitem()
    node = Node(name, id=body[0]['id'])
    if parent is not None:
        node.parent = parent
    attributes = body[1]['attributes']
    if attributes is not None:
        for attr_name, attr_value in attributes.items():
            node.add_attribute(attr_name, attr_value)
    content = body[2]['content']
    if content is not None:
        node.content = content
    for child_json in body[3]['children']:
        node.add_child(from_json(child_json, node))
    return node
def test_bounding_altitudes():
    """boundingAltitudes is optional, but once present it requires minimum,
    maximum, and units children, each with content."""
    bc_node = Node(names.BOUNDINGCOORDINATES, parent=None)
    # Four bounding coordinates, all at 0.0, added west/east/north/south
    for direction in (names.WESTBOUNDINGCOORDINATE,
                      names.EASTBOUNDINGCOORDINATE,
                      names.NORTHBOUNDINGCOORDINATE,
                      names.SOUTHBOUNDINGCOORDINATE):
        coordinate = Node(direction, parent=bc_node)
        coordinate.content = "0.0"
        bc_node.add_child(coordinate)
    # Without boundingAltitudes the coordinates alone validate
    validate.node(bc_node)
    # An empty boundingAltitudes makes the tree invalid
    ba_node = Node(names.BOUNDINGALTITUDES, parent=bc_node)
    bc_node.add_child(ba_node)
    with pytest.raises(MetapypeRuleError):
        validate.tree(bc_node)
    minimum_node = Node(names.ALTITUDEMINIMUM, parent=ba_node)
    ba_node.add_child(minimum_node)
    with pytest.raises(MetapypeRuleError):
        validate.tree(bc_node)
    minimum_node.content = "0.0"
    with pytest.raises(MetapypeRuleError):
        validate.tree(bc_node)
    # Still invalid: the remaining required children lack content
    maximum_node = Node(names.ALTITUDEMAXIMUM, parent=ba_node)
    ba_node.add_child(maximum_node)
    units_node = Node(names.ALTITUDEUNITS, parent=ba_node)
    ba_node.add_child(units_node)
    with pytest.raises(MetapypeRuleError):
        validate.tree(bc_node)
    # With content everywhere, the whole tree validates
    maximum_node.content = "1000.0"
    units_node.content = "meter"
    validate.tree(bc_node)
def keyword(filename=None, node_id=None):
    """
    Flask view for adding or editing a single keyword.

    GET populates the form from the keyword node identified by node_id
    (node_id == '1' means a new keyword). POST saves the keyword into a
    keywordSet whose thesaurus matches (creating a new keywordSet if none
    matches), then redirects to the keyword-select page.

    Args:
        filename: name of the EML document being edited
        node_id: id of the keyword node being edited, or '1' for a new one
    """
    eml_node = load_eml(filename=filename)
    dataset_node = eml_node.find_child(names.DATASET)
    if not dataset_node:
        dataset_node = Node(names.DATASET, parent=eml_node)
        add_child(eml_node, dataset_node)
    form = KeywordForm(filename=filename, node_id=node_id)
    form.init_keywords()

    # Process POST
    if request.method == 'POST' and BTN_CANCEL in request.form:
        url = url_for(PAGE_KEYWORD_SELECT, filename=filename)
        return redirect(url)

    if request.method == 'POST':
        form_value = request.form
        form_dict = form_value.to_dict(flat=False)
        new_page = PAGE_KEYWORD_SELECT
        if form_dict:
            for key in form_dict:
                val = form_dict[key][0]  # value is the first list element
                # Hidden buttons may redirect to a different page
                new_page = check_val_for_hidden_buttons(
                    val, new_page, new_page)
        submit_type = None
        if is_dirty_form(form):
            submit_type = 'Save Changes'
        if submit_type == 'Save Changes':
            keyword = form.keyword.data
            keyword_type = form.keyword_type.data
            keyword_thesaurus = form.keyword_thesaurus.data
            # If no thesaurus was specified, see if the LTER Controlled
            # Vocabulary applies
            if not keyword_thesaurus:
                lter_keywords = get_keywords('LTER')
                if keyword in lter_keywords:
                    keyword_thesaurus = 'LTER Controlled Vocabulary'
            # Find an existing keywordSet whose thesaurus matches (or one
            # with no thesaurus when none was specified)
            keyword_set_nodes = []
            eml_node.find_all_descendants(names.KEYWORDSET, keyword_set_nodes)
            keyword_set_node = None
            for kws_node in keyword_set_nodes:
                keyword_thesaurus_node = kws_node.find_child(
                    names.KEYWORDTHESAURUS)
                if keyword_thesaurus_node and keyword_thesaurus_node.content == keyword_thesaurus:
                    keyword_set_node = kws_node
                    break
                if not keyword_thesaurus_node and not keyword_thesaurus:
                    keyword_set_node = kws_node
                    break
            if not keyword_set_node:
                # No matching keywordSet: create one under the dataset
                keyword_set_node = Node(names.KEYWORDSET, parent=dataset_node)
                add_child(dataset_node, keyword_set_node)
                if keyword_thesaurus:
                    keyword_thesaurus_node = Node(names.KEYWORDTHESAURUS,
                                                  parent=keyword_set_node)
                    keyword_thesaurus_node.content = keyword_thesaurus
                    # NOTE(review): appends directly to children rather than
                    # using add_child as elsewhere — confirm intentional
                    keyword_set_node.children.append(keyword_thesaurus_node)
            keyword_node = Node(names.KEYWORD, parent=keyword_set_node)
            create_keyword(keyword_node, keyword, keyword_type)
            # NOTE(review): len(node_id) != 1 is used to distinguish a real
            # node id from the '1' sentinel for a new keyword — confirm
            if node_id and len(node_id) != 1:
                old_keyword_node = Node.get_node_instance(node_id)
                if old_keyword_node:
                    keyword_parent_node = old_keyword_node.parent
                    keyword_parent_node.replace_child(old_keyword_node,
                                                      keyword_node)
                else:
                    msg = f"No keyword node found in the node store with node id {node_id}"
                    raise Exception(msg)
            else:
                add_child(keyword_set_node, keyword_node)
            save_both_formats(filename=filename, eml_node=eml_node)
        url = url_for(new_page, filename=filename)
        return redirect(url)

    # Process GET
    if node_id == '1':
        # New keyword: start with a fresh form
        form.init_md5()
    else:
        # Editing: locate the keyword node and populate the form from it
        keyword_set_nodes = []
        eml_node.find_all_descendants(names.KEYWORDSET, keyword_set_nodes)
        found = False
        for keyword_set_node in keyword_set_nodes:
            keyword_nodes = keyword_set_node.find_all_children(names.KEYWORD)
            keyword_thesaurus_node = keyword_set_node.find_child(
                names.KEYWORDTHESAURUS)
            if keyword_nodes:
                for kw_node in keyword_nodes:
                    if node_id == kw_node.id:
                        populate_keyword_form(form, kw_node,
                                              keyword_thesaurus_node)
                        found = True
                        break
            if found:
                break
    set_current_page('keyword')
    help = [get_help('keywords')]
    return render_template('keyword.html', title='Keyword', form=form,
                           filename=filename, help=help)
def load_data_table(uploads_path: str = None, data_file: str = '',
                    num_header_rows: str = '1', delimiter: str = ',',
                    quote_char: str = '"'):
    """
    Build an EML dataTable subtree for an uploaded CSV file.

    Reads the file at uploads_path/data_file with pandas, infers each
    column's variable type, and constructs the dataTable node with its
    physical, entityName, numberOfRecords, and attributeList children.

    Args:
        uploads_path: directory containing the uploaded file
        data_file: file name of the uploaded CSV
        num_header_rows: value stored in numHeaderLines (kept as text)
        delimiter: field delimiter used to parse the CSV
        quote_char: quote character used to parse the CSV

    Returns:
        Tuple of (datatable_node, column_vartypes, column_names,
        column_categorical_codes, data_frame, missing_value_code).

    Raises:
        DataTableError: if the file is empty or pandas fails to parse it.
    """
    log_info(f'Entering load_data_table: {data_file}')
    full_path = f'{uploads_path}/{data_file}'
    # dataTable / physical / entityName / objectName skeleton
    datatable_node = metapype_client.new_child_node(names.DATATABLE, parent=None)
    physical_node = metapype_client.new_child_node(names.PHYSICAL, parent=datatable_node)
    physical_node.add_attribute('system', 'EDI')
    entity_name_node = metapype_client.new_child_node(names.ENTITYNAME, parent=datatable_node)
    entity_name = entity_name_from_data_file(data_file)
    entity_name_node.content = entity_name
    object_name_node = metapype_client.new_child_node(names.OBJECTNAME, parent=physical_node)
    object_name_node.content = data_file
    # Size and MD5 nodes are only added when the file is readable
    file_size = get_file_size(full_path)
    if file_size is not None:
        size_node = metapype_client.new_child_node(names.SIZE, physical_node)
        size_node.add_attribute('unit', 'byte')
        size_node.content = str(file_size)
    md5_hash = get_md5_hash(full_path)
    if md5_hash is not None:
        hash_node = Node(names.AUTHENTICATION, parent=physical_node)
        metapype_client.add_child(physical_node, hash_node)
        hash_node.add_attribute('method', 'MD5')
        hash_node.content = str(md5_hash)
    # dataFormat / textFormat description of the CSV layout
    data_format_node = Node(names.DATAFORMAT, parent=physical_node)
    metapype_client.add_child(physical_node, data_format_node)
    text_format_node = Node(names.TEXTFORMAT, parent=data_format_node)
    metapype_client.add_child(data_format_node, text_format_node)
    num_header_lines_node = Node(names.NUMHEADERLINES, parent=text_format_node)
    metapype_client.add_child(text_format_node, num_header_lines_node)
    num_header_lines_node.content = num_header_rows
    num_footer_lines_node = Node(names.NUMFOOTERLINES, parent=text_format_node)
    metapype_client.add_child(text_format_node, num_footer_lines_node)
    num_footer_lines_node.content = '0'
    simple_delimited_node = Node(names.SIMPLEDELIMITED, parent=text_format_node)
    metapype_client.add_child(text_format_node, simple_delimited_node)
    field_delimiter_node = Node(names.FIELDDELIMITER, parent=simple_delimited_node)
    metapype_client.add_child(simple_delimited_node, field_delimiter_node)
    field_delimiter_node.content = delimiter
    quote_character_node = Node(names.QUOTECHARACTER, parent=simple_delimited_node)
    metapype_client.add_child(simple_delimited_node, quote_character_node)
    quote_character_node.content = quote_char
    if file_size == 0:
        raise DataTableError("The CSV file is empty.")
    check_column_name_uniqueness(full_path, delimiter)
    # Read one line so the file object records the newline style it saw;
    # repr() of e.g. '\r\n' becomes the recordDelimiter text
    with open(full_path) as file:
        next(file)
        line_terminator = repr(file.newlines).replace("'", "")
    record_delimiter_node = Node(names.RECORDDELIMITER, parent=text_format_node)
    metapype_client.add_child(text_format_node, record_delimiter_node)
    record_delimiter_node.content = line_terminator
    try:
        data_frame = pd.read_csv(full_path, encoding='utf8', sep=delimiter,
                                 quotechar=quote_char)
    except pd.errors.ParserError as e:
        raise DataTableError(e.args[0])
    column_vartypes = []
    column_names = []
    column_categorical_codes = []
    if data_frame is not None:
        number_of_records = Node(names.NUMBEROFRECORDS, parent=datatable_node)
        metapype_client.add_child(datatable_node, number_of_records)
        row_count = data_frame.shape[0]
        record_count = row_count
        number_of_records.content = f'{record_count}'
        attribute_list_node = Node(names.ATTRIBUTELIST, parent=datatable_node)
        metapype_client.add_child(datatable_node, attribute_list_node)
        columns = data_frame.columns
        for col in columns:
            # dtype inferred from the rows after the first data row
            dtype = data_frame[col][1:].infer_objects().dtype
            var_type, codes = infer_col_type(data_frame, col)
            log_info(f'col: {col} var_type: {var_type}')
            column_vartypes.append(var_type)
            column_names.append(col)
            column_categorical_codes.append(codes)
            # attribute / attributeName / attributeLabel / attributeDefinition
            attribute_node = metapype_client.new_child_node(
                names.ATTRIBUTE, attribute_list_node)
            attribute_name_node = metapype_client.new_child_node(
                names.ATTRIBUTENAME, attribute_node)
            attribute_name_node.content = col
            att_label_node = Node(names.ATTRIBUTELABEL, parent=attribute_node)
            metapype_client.add_child(attribute_node, att_label_node)
            att_label_node.content = col
            att_def_node = Node(names.ATTRIBUTEDEFINITION, parent=attribute_node)
            metapype_client.add_child(attribute_node, att_def_node)
            ms_node = Node(names.MEASUREMENTSCALE, parent=attribute_node)
            metapype_client.add_child(attribute_node, ms_node)
            missing_value_code = guess_missing_value_code(
                full_path, delimiter, quote_char, col)
            if missing_value_code:
                mv_node = Node(names.MISSINGVALUECODE, parent=attribute_node)
                metapype_client.add_child(attribute_node, mv_node)
                code_node = Node(names.CODE, parent=mv_node)
                metapype_client.add_child(mv_node, code_node)
                code_node.content = missing_value_code
            if var_type == metapype_client.VariableType.CATEGORICAL:
                codes = force_categorical_codes(attribute_node, dtype, codes)
                codes = force_missing_value_code(missing_value_code, dtype, codes)
                # nominal / nonNumericDomain / enumeratedDomain / ...codes...
                nominal_node = metapype_client.new_child_node(
                    names.NOMINAL, ms_node)
                non_numeric_domain_node = metapype_client.new_child_node(
                    names.NONNUMERICDOMAIN, nominal_node)
                enumerated_domain_node = metapype_client.new_child_node(
                    names.ENUMERATEDDOMAIN, non_numeric_domain_node)
                for code in codes:
                    code_definition_node = metapype_client.new_child_node(
                        names.CODEDEFINITION, enumerated_domain_node)
                    code_node = metapype_client.new_child_node(
                        names.CODE, code_definition_node)
                    code_node.content = str(code)
                    definition_node = metapype_client.new_child_node(
                        names.DEFINITION, code_definition_node)
            elif var_type == metapype_client.VariableType.NUMERICAL:
                # ratio / numericDomain
                ratio_node = metapype_client.new_child_node(
                    names.RATIO, ms_node)
                numeric_domain_node = metapype_client.new_child_node(
                    names.NUMERICDOMAIN, ratio_node)
                number_type = 'real'
                if str(dtype).startswith(
                        'int'):  # FIXME - we can do better than this
                    number_type = 'integer'
                number_type_node = metapype_client.new_child_node(
                    names.NUMBERTYPE, numeric_domain_node)
                number_type_node.content = number_type
                # NOTE(review): numeric_domain_node is reused for the unit
                # node, discarding the numericDomain reference — confirm
                numeric_domain_node = metapype_client.new_child_node(
                    names.UNIT, ratio_node)
            elif var_type == metapype_client.VariableType.TEXT:
                # nominal / nonNumericDomain / textDomain
                nominal_node = metapype_client.new_child_node(
                    names.NOMINAL, ms_node)
                non_numeric_domain_node = metapype_client.new_child_node(
                    names.NONNUMERICDOMAIN, nominal_node)
                text_domain_node = metapype_client.new_child_node(
                    names.TEXTDOMAIN, non_numeric_domain_node)
                definition_node = metapype_client.new_child_node(
                    names.DEFINITION, text_domain_node)
            elif var_type == metapype_client.VariableType.DATETIME:
                # dateTime / formatString
                datetime_node = Node(names.DATETIME, parent=ms_node)
                metapype_client.add_child(ms_node, datetime_node)
                format_string_node = Node(names.FORMATSTRING, parent=datetime_node)
                metapype_client.add_child(datetime_node, format_string_node)
                format_string_node.content = codes
    # NOTE(review): missing_value_code is first assigned inside the column
    # loop — if the frame has no columns this return raises NameError; confirm
    return datatable_node, column_vartypes, column_names, column_categorical_codes, data_frame, missing_value_code
def node():
    """
    Build and return a complete sample EML tree used as a test fixture.

    The tree has access-control rules (allow/deny), a dataset with title,
    creator (name, address, contact details), pubdate, abstract, keywords,
    taxonomic coverage, a contact, and an additionalMetadata section.

    Returns:
        The root eml node of the constructed tree.
    """
    eml = Node(names.EML)
    eml.add_attribute("packageId", "edi.23.1")
    eml.add_attribute("system", "metapype")

    # Access control: one allow rule, one deny rule
    access = Node(names.ACCESS, parent=eml)
    access.add_attribute("authSystem", "pasta")
    access.add_attribute("order", "allowFirst")
    eml.add_child(access)
    allow = Node(names.ALLOW, parent=access)
    access.add_child(allow)
    principal_allow = Node(names.PRINCIPAL, parent=allow)
    principal_allow.content = "uid=gaucho,o=EDI,dc=edirepository,dc=org"
    allow.add_child(principal_allow)
    permission_allow = Node(names.PERMISSION, parent=allow)
    permission_allow.content = "all"
    allow.add_child(permission_allow)
    deny = Node(names.DENY, parent=access)
    access.add_child(deny)
    principal_deny = Node(names.PRINCIPAL, parent=deny)
    principal_deny.content = "public"
    deny.add_child(principal_deny)
    permission_deny = Node(names.PERMISSION, parent=deny)
    permission_deny.content = "write"
    deny.add_child(permission_deny)

    # Dataset with title and a fully populated creator
    dataset = Node(names.DATASET, parent=eml)
    eml.add_child(dataset)
    title = Node(names.TITLE, parent=dataset)
    # NOTE(review): "20017" looks like a typo for "2017" — confirm the
    # fixture intends this value
    title.content = "Green sea turtle counts: Tortuga Island 20017"
    dataset.add_child(title)
    creator = Node(names.CREATOR, parent=dataset)
    dataset.add_child(creator)
    individualName_creator = Node(names.INDIVIDUALNAME, parent=creator)
    creator.add_child(individualName_creator)
    salutation_creator = Node(names.SALUTATION, parent=individualName_creator)
    salutation_creator.content = "Mr."
    individualName_creator.add_child(salutation_creator)
    given_name_creator = Node(names.GIVENNAME, parent=individualName_creator)
    given_name_creator.content = "Chase"
    individualName_creator.add_child(given_name_creator)
    surName_creator = Node(names.SURNAME, parent=individualName_creator)
    surName_creator.content = "Gaucho"
    individualName_creator.add_child(surName_creator)
    value = Node(names.VALUE, parent=surName_creator)
    value.add_attribute("lang", "en")
    value.content = "Gaucho"
    surName_creator.add_child(value)

    # Creator address and contact details
    address = Node(names.ADDRESS, parent=creator)
    creator.add_child(address)
    delivery_point_1 = Node(names.DELIVERYPOINT, parent=address)
    delivery_point_1.content = "100 Maple St"
    address.add_child(delivery_point_1)
    delivery_point_2 = Node(names.DELIVERYPOINT, parent=address)
    delivery_point_2.content = "Apt. 10-B"
    address.add_child(delivery_point_2)
    city = Node(names.CITY, parent=address)
    city.content = "Gotham City"
    address.add_child(city)
    administrative_area = Node(names.ADMINISTRATIVEAREA, parent=address)
    administrative_area.content = "New York"
    address.add_child(administrative_area)
    postal_code = Node(names.POSTALCODE, parent=address)
    postal_code.content = "11111"
    address.add_child(postal_code)
    country = Node(names.COUNTRY, parent=address)
    country.content = "USA"
    address.add_child(country)
    phone = Node(names.PHONE, parent=creator)
    phone.content = "555-555-5555"
    phone.add_attribute("phonetype", "voice")
    creator.add_child(phone)
    electronic_mail_address = Node(names.ELECTRONICMAILADDRESS, parent=creator)
    electronic_mail_address.content = "*****@*****.**"
    creator.add_child(electronic_mail_address)
    online_url = Node(names.ONLINEURL, parent=creator)
    online_url.content = "https://www.somecollege.edu/people/cgaucho"
    creator.add_child(online_url)
    user_id = Node(names.USERID, parent=creator)
    user_id.content = "uid=jgaucho,o=EDI,dc=edirepository,dc=org"
    user_id.add_attribute(
        "directory",
        "ldap:///ldap.edirepository.org/dc=edirepository,"
        "dc=org"
    )
    creator.add_child(user_id)

    # Publication date and abstract
    pubdate = Node(names.PUBDATE, parent=dataset)
    pubdate.content = "2018"
    dataset.add_child(pubdate)
    abstract = Node(names.ABSTRACT, parent=dataset)
    abstract.add_attribute("lang", "en")
    section = Node(names.SECTION, parent=abstract)
    abstract.add_child(section)
    # NOTE(review): para's parent is set to abstract but it is added as a
    # child of section — confirm whether parent should be section
    para = Node(names.PARA, parent=abstract)
    section.add_child(para)
    para.content = "para section"
    dataset.add_child(abstract)

    # Keywords
    keyword_set = Node(names.KEYWORDSET, parent=dataset)
    dataset.add_child(keyword_set)
    keyword_1 = Node(names.KEYWORD, parent=keyword_set)
    keyword_1.content = "phytoplankton ecology"
    keyword_set.add_child(keyword_1)
    keyword_2 = Node(names.KEYWORD, parent=keyword_set)
    keyword_2.add_attribute("keywordType", "place")
    keyword_2.content = "lake"
    keyword_set.add_child(keyword_2)
    keyword_thesaurus = Node(names.KEYWORDTHESAURUS, parent=keyword_set)
    keyword_thesaurus.content = "IRIS keyword thesaurus"
    keyword_set.add_child(keyword_thesaurus)

    # Taxonomic coverage: genus with a nested species classification
    coverage = Node(names.COVERAGE, parent=dataset)
    dataset.add_child(coverage)
    taxonomic_coverage = Node(names.TAXONOMICCOVERAGE, parent=coverage)
    coverage.add_child(taxonomic_coverage)
    general_taxonomic_coverage = Node(
        names.GENERALTAXONOMICCOVERAGE, parent=taxonomic_coverage
    )
    taxonomic_coverage.add_child(general_taxonomic_coverage)
    general_taxonomic_coverage.content = "All vascular plants were \
identified to family or species, mosses and lichens were \
identified as moss or lichen."
    taxonomic_classification_genus = Node(
        names.TAXONOMICCLASSIFICATION, parent=taxonomic_coverage
    )
    taxonomic_coverage.add_child(taxonomic_classification_genus)
    taxon_rank_name_genus = Node(
        names.TAXONRANKNAME, parent=taxonomic_classification_genus
    )
    taxonomic_classification_genus.add_child(taxon_rank_name_genus)
    taxon_rank_name_genus.content = "Genus"
    taxon_rank_value_genus = Node(
        names.TAXONRANKVALUE, parent=taxonomic_classification_genus
    )
    taxonomic_classification_genus.add_child(taxon_rank_value_genus)
    taxon_rank_value_genus.content = "Escherichia"
    taxonomic_classification_species = Node(
        names.TAXONOMICCLASSIFICATION, parent=taxonomic_classification_genus
    )
    taxonomic_classification_genus.add_child(taxonomic_classification_species)
    taxon_rank_name_species = Node(
        names.TAXONRANKNAME, parent=taxonomic_classification_species
    )
    taxonomic_classification_species.add_child(taxon_rank_name_species)
    taxon_rank_name_species.content = "Species"
    taxon_rank_value_species = Node(
        names.TAXONRANKVALUE, parent=taxonomic_classification_species
    )
    taxonomic_classification_species.add_child(taxon_rank_value_species)
    taxon_rank_value_species.content = "coli"

    # Dataset contact
    contact = Node(names.CONTACT, parent=dataset)
    dataset.add_child(contact)
    individualName_contact = Node(names.INDIVIDUALNAME, parent=contact)
    contact.add_child(individualName_contact)
    surName_contact = Node(names.SURNAME, parent=individualName_contact)
    surName_contact.content = "Gaucho"
    individualName_contact.add_child(surName_contact)

    # additionalMetadata with a fictitious non-EML element
    additional_metadata = Node(names.ADDITIONALMETADATA, parent=eml)
    eml.add_child(additional_metadata)
    metadata = Node(names.METADATA, parent=additional_metadata)
    fictitious = Node("fictitious")
    fictitious.content = "<tag>more fictitious content</tag>"
    metadata.add_child(fictitious)
    additional_metadata.add_child(metadata)
    return eml
def load_other_entity(dataset_node: Node = None, uploads_path: str = None, data_file: str = '', node_id: str = None):
    """
    Create — or, on re-upload, refresh — an otherEntity subtree describing
    the uploaded file ``data_file`` found under ``uploads_path``.

    A re-upload is signalled by a ``node_id`` that is neither None nor '1';
    in that case the existing otherEntity node is looked up and its
    descendant nodes are refreshed in place. Otherwise a fresh subtree
    (physical, entityName, objectName, size, authentication, dataFormat,
    entityType) is built beneath ``dataset_node``.

    Returns:
        The otherEntity node (newly built or re-used).
    """
    def attach(name, parent):
        # Create a node under parent and register it as a child.
        child = Node(name, parent=parent)
        metapype_client.add_child(parent, child)
        return child

    full_path = f'{uploads_path}/{data_file}'
    reuploading = node_id is not None and node_id != '1'

    if reuploading:
        other_entity_node = Node.get_node_instance(node_id)
        object_name_node = other_entity_node.find_descendant(names.OBJECTNAME)
    else:
        other_entity_node = attach(names.OTHERENTITY, dataset_node)
        physical_node = attach(names.PHYSICAL, other_entity_node)
        physical_node.add_attribute('system', 'EDI')
        entity_name_node = attach(names.ENTITYNAME, other_entity_node)
        entity_name_node.content = entity_name_from_data_file(data_file)
        object_name_node = attach(names.OBJECTNAME, physical_node)
    object_name_node.content = data_file

    file_size = get_file_size(full_path)
    if file_size is not None:
        if reuploading:
            size_node = other_entity_node.find_descendant(names.SIZE)
        else:
            size_node = attach(names.SIZE, physical_node)
            size_node.add_attribute('unit', 'byte')
        size_node.content = str(file_size)

    md5_hash = get_md5_hash(full_path)
    if md5_hash is not None:
        if reuploading:
            hash_node = other_entity_node.find_descendant(names.AUTHENTICATION)
        else:
            hash_node = attach(names.AUTHENTICATION, physical_node)
            hash_node.add_attribute('method', 'MD5')
        hash_node.content = str(md5_hash)

    if reuploading:
        format_name_node = other_entity_node.find_descendant(names.FORMATNAME)
    else:
        data_format_node = attach(names.DATAFORMAT, physical_node)
        externally_defined_format_node = attach(names.EXTERNALLYDEFINEDFORMAT, data_format_node)
        format_name_node = attach(names.FORMATNAME, externally_defined_format_node)
    format_name_node.content = format_name_from_data_file(data_file)

    if reuploading:
        entity_type_node = other_entity_node.find_descendant(names.ENTITYTYPE)
    else:
        entity_type_node = metapype_client.new_child_node(
            names.ENTITYTYPE, parent=other_entity_node)
    entity_type_node.content = format_name_from_data_file(data_file)

    user_data.add_data_table_upload_filename(data_file)
    delete_data_files(uploads_path)
    return other_entity_node
def _add_attribute_for_column(attribute_list_node, col, dtype):
    # Append one attribute node for column `col`, inferring its
    # measurementScale subtree from the pandas dtype string.
    attribute_node = Node(names.ATTRIBUTE, parent=attribute_list_node)
    add_child(attribute_list_node, attribute_node)
    attribute_name_node = Node(names.ATTRIBUTENAME, parent=attribute_node)
    add_child(attribute_node, attribute_name_node)
    attribute_name_node.content = col
    att_label_node = Node(names.ATTRIBUTELABEL, parent=attribute_node)
    add_child(attribute_node, att_label_node)
    att_label_node.content = col
    att_def_node = Node(names.ATTRIBUTEDEFINITION, parent=attribute_node)
    add_child(attribute_node, att_def_node)
    att_def_node.content = f'Attribute definition for {col}'
    ms_node = Node(names.MEASUREMENTSCALE, parent=attribute_node)
    add_child(attribute_node, ms_node)
    if dtype == 'bool':
        nominal_node = Node(names.NOMINAL, parent=ms_node)
        add_child(ms_node, nominal_node)
        non_numeric_domain_node = Node(names.NONNUMERICDOMAIN, parent=nominal_node)
        add_child(nominal_node, non_numeric_domain_node)
    elif dtype == 'object':
        if is_datetime_column(col):
            datetime_node = Node(names.DATETIME, parent=ms_node)
            add_child(ms_node, datetime_node)
            format_string_node = Node(names.FORMATSTRING, parent=datetime_node)
            add_child(datetime_node, format_string_node)
            format_string_node.content = ''
        else:
            nominal_node = Node(names.NOMINAL, parent=ms_node)
            add_child(ms_node, nominal_node)
            non_numeric_domain_node = Node(names.NONNUMERICDOMAIN, parent=nominal_node)
            add_child(nominal_node, non_numeric_domain_node)
    elif dtype.startswith('float') or dtype.startswith('int'):
        number_type = 'integer' if dtype.startswith('int') else 'real'
        ratio_node = Node(names.RATIO, parent=ms_node)
        add_child(ms_node, ratio_node)
        numeric_domain_ratio_node = Node(names.NUMERICDOMAIN, parent=ratio_node)
        add_child(ratio_node, numeric_domain_ratio_node)
        number_type_ratio_node = Node(names.NUMBERTYPE, parent=numeric_domain_ratio_node)
        add_child(numeric_domain_ratio_node, number_type_ratio_node)
        number_type_ratio_node.content = number_type


def load_data_table(dataset_node: Node = None, uploads_path: str = None, data_file: str = ''):
    """
    Build a dataTable subtree beneath dataset_node describing the uploaded
    CSV file ``data_file`` located in ``uploads_path``.

    Creates the physical/entityName/objectName/size/dataFormat structure,
    then reads the CSV with pandas to derive numberOfRecords and a
    per-column attributeList (see _add_attribute_for_column).

    Args:
        dataset_node: parent dataset node to attach the dataTable to
        uploads_path: directory containing the uploaded file
        data_file: file name of the uploaded CSV

    Returns:
        The newly created dataTable node.
    """
    full_path = f'{uploads_path}/{data_file}'
    datatable_node = Node(names.DATATABLE, parent=dataset_node)
    add_child(dataset_node, datatable_node)

    physical_node = Node(names.PHYSICAL, parent=datatable_node)
    add_child(datatable_node, physical_node)
    physical_node.add_attribute('system', 'EDI')

    entity_name_node = Node(names.ENTITYNAME, parent=datatable_node)
    add_child(datatable_node, entity_name_node)
    entity_name = entity_name_from_data_file(data_file)
    entity_name_node.content = entity_name

    object_name_node = Node(names.OBJECTNAME, parent=physical_node)
    add_child(physical_node, object_name_node)
    object_name_node.content = data_file

    file_size = get_file_size(full_path)
    if file_size is not None:
        size_node = Node(names.SIZE, parent=physical_node)
        add_child(physical_node, size_node)
        size_node.add_attribute('unit', 'byte')
        size_node.content = str(file_size)

    data_format_node = Node(names.DATAFORMAT, parent=physical_node)
    add_child(physical_node, data_format_node)
    text_format_node = Node(names.TEXTFORMAT, parent=data_format_node)
    add_child(data_format_node, text_format_node)
    num_header_lines_node = Node(names.NUMHEADERLINES, parent=text_format_node)
    add_child(text_format_node, num_header_lines_node)
    num_header_lines_node.content = '1'
    num_footer_lines_node = Node(names.NUMFOOTERLINES, parent=text_format_node)
    add_child(text_format_node, num_footer_lines_node)
    num_footer_lines_node.content = '0'

    # Lines beginning with '#' are treated as comments, matching the
    # original ingest behavior.
    data_frame = pd.read_csv(full_path, comment='#')
    if data_frame is not None:
        number_of_records = Node(names.NUMBEROFRECORDS, parent=datatable_node)
        add_child(datatable_node, number_of_records)
        row_count = data_frame.shape[0]
        number_of_records.content = f'{row_count}'

        attribute_list_node = Node(names.ATTRIBUTELIST, parent=datatable_node)
        add_child(datatable_node, attribute_list_node)
        for col in data_frame.columns:
            dtype = str(data_frame[col].dtype)
            # Removed leftover debug print of (col, dtype) that polluted stdout.
            _add_attribute_for_column(attribute_list_node, col, dtype)

    delete_data_files(uploads_path)
    return datatable_node
def test_copy(self):
    """A node copy should be a distinct object, valid, with the same content."""
    node = Node(names.GIVENNAME)
    node.content = 'Chase'
    validate.node(node)
    node_copy = node.copy()
    validate.node(node_copy)
    # The original test only validated the copy; also assert that the copy
    # is a separate object and that the content survived the copy.
    self.assertIsNot(node_copy, node)
    self.assertEqual(node.content, node_copy.content)
def load_data_table(uploads_path: str = None, data_file: str = '', num_header_rows: int = 1, delimiter: str = ',', quote_char: str = '"'):
    """
    Build a standalone dataTable subtree (parent=None) describing the
    uploaded delimited text file ``data_file`` under ``uploads_path``.

    Creates the physical structure (objectName, size, MD5 authentication,
    textFormat with header/footer lines, simpleDelimited field delimiter,
    quote character and record delimiter), then reads the file with pandas
    to infer a variable type per column and emit the corresponding
    measurementScale subtree.

    Args:
        uploads_path: directory containing the uploaded file
        data_file: file name of the uploaded delimited file
        num_header_rows: number of header lines in the file
        delimiter: field delimiter character
        quote_char: quote character used in the file

    Returns:
        Tuple of (datatable_node, column_vartypes, column_names,
        column_categorical_codes).
    """
    if Config.LOG_DEBUG:
        app = Flask(__name__)
        with app.app_context():
            current_app.logger.info(f'Entering load_data_table')

    full_path = f'{uploads_path}/{data_file}'
    # datatable_node = new_child_node(names.DATATABLE, parent=dataset_node)
    datatable_node = new_child_node(names.DATATABLE, parent=None)
    physical_node = new_child_node(names.PHYSICAL, parent=datatable_node)
    physical_node.add_attribute('system', 'EDI')
    entity_name_node = new_child_node(names.ENTITYNAME, parent=datatable_node)
    entity_name = entity_name_from_data_file(data_file)
    entity_name_node.content = entity_name
    object_name_node = new_child_node(names.OBJECTNAME, parent=physical_node)
    object_name_node.content = data_file

    file_size = get_file_size(full_path)
    if file_size is not None:
        size_node = new_child_node(names.SIZE, physical_node)
        size_node.add_attribute('unit', 'byte')
        size_node.content = str(file_size)

    md5_hash = get_md5_hash(full_path)
    if md5_hash is not None:
        hash_node = Node(names.AUTHENTICATION, parent=physical_node)
        add_child(physical_node, hash_node)
        hash_node.add_attribute('method', 'MD5')
        hash_node.content = str(md5_hash)

    data_format_node = Node(names.DATAFORMAT, parent=physical_node)
    add_child(physical_node, data_format_node)
    text_format_node = Node(names.TEXTFORMAT, parent=data_format_node)
    add_child(data_format_node, text_format_node)
    num_header_lines_node = Node(names.NUMHEADERLINES, parent=text_format_node)
    add_child(text_format_node, num_header_lines_node)
    num_header_lines_node.content = num_header_rows
    num_footer_lines_node = Node(names.NUMFOOTERLINES, parent=text_format_node)
    add_child(text_format_node, num_footer_lines_node)
    num_footer_lines_node.content = '0'
    simple_delimited_node = Node(names.SIMPLEDELIMITED, parent=text_format_node)
    add_child(text_format_node, simple_delimited_node)
    field_delimiter_node = Node(names.FIELDDELIMITER, parent=simple_delimited_node)
    add_child(simple_delimited_node, field_delimiter_node)
    field_delimiter_node.content = delimiter
    quote_character_node = Node(names.QUOTECHARACTER, parent=simple_delimited_node)
    add_child(simple_delimited_node, quote_character_node)
    quote_character_node.content = quote_char

    # Read one line so file.newlines is populated, then record the line
    # terminator (repr with quotes stripped, e.g. \n or \r\n).
    with open(full_path) as file:
        next(file)
        line_terminator = repr(file.newlines).replace("'", "")
    record_delimiter_node = Node(names.RECORDDELIMITER, parent=text_format_node)
    add_child(text_format_node, record_delimiter_node)
    record_delimiter_node.content = line_terminator

    data_frame = pd.read_csv(full_path, comment='#', encoding='utf8',
                             sep=delimiter, quotechar=quote_char)
    column_vartypes = []
    column_names = []
    column_categorical_codes = []
    if data_frame is not None:
        number_of_records = Node(names.NUMBEROFRECORDS, parent=datatable_node)
        add_child(datatable_node, number_of_records)
        row_count = data_frame.shape[0]
        record_count = row_count
        number_of_records.content = f'{record_count}'

        attribute_list_node = Node(names.ATTRIBUTELIST, parent=datatable_node)
        add_child(datatable_node, attribute_list_node)
        columns = data_frame.columns
        for col in columns:
            # dtype inferred from rows 1: (skipping the first data row)
            dtype = data_frame[col][1:].infer_objects().dtype
            var_type, codes = infer_col_type(data_frame, col)
            column_vartypes.append(var_type)
            column_names.append(col)
            column_categorical_codes.append(codes)

            attribute_node = new_child_node(names.ATTRIBUTE, attribute_list_node)
            attribute_name_node = new_child_node(names.ATTRIBUTENAME, attribute_node)
            attribute_name_node.content = col
            att_label_node = Node(names.ATTRIBUTELABEL, parent=attribute_node)
            add_child(attribute_node, att_label_node)
            att_label_node.content = col
            # Fixed: the attributeDefinition node was created twice; the first
            # instance was never attached as a child and leaked into the node
            # store. Create it exactly once.
            att_def_node = Node(names.ATTRIBUTEDEFINITION, parent=attribute_node)
            add_child(attribute_node, att_def_node)
            ms_node = Node(names.MEASUREMENTSCALE, parent=attribute_node)
            add_child(attribute_node, ms_node)

            if var_type == VariableType.CATEGORICAL:
                # nominal / nonNumericDomain / enumeratedDomain / ...codes...
                nominal_node = new_child_node(names.NOMINAL, ms_node)
                non_numeric_domain_node = new_child_node(
                    names.NONNUMERICDOMAIN, nominal_node)
                enumerated_domain_node = new_child_node(
                    names.ENUMERATEDDOMAIN, non_numeric_domain_node)
                for code in codes:
                    code_definition_node = new_child_node(
                        names.CODEDEFINITION, enumerated_domain_node)
                    code_node = new_child_node(names.CODE, code_definition_node)
                    code_node.content = code
                    definition_node = new_child_node(names.DEFINITION,
                                                     code_definition_node)
            elif var_type == VariableType.NUMERICAL:
                # ratio / numericDomain
                ratio_node = new_child_node(names.RATIO, ms_node)
                numeric_domain_node = new_child_node(names.NUMERICDOMAIN,
                                                     ratio_node)
                number_type = 'real'
                if str(dtype).startswith('int'):  # FIXME - we can do better than this
                    number_type = 'integer'
                number_type_node = new_child_node(names.NUMBERTYPE,
                                                  numeric_domain_node)
                number_type_node.content = number_type
                # Renamed from numeric_domain_node: this is the UNIT node and
                # the old name shadowed the numericDomain variable above.
                unit_node = new_child_node(names.UNIT, ratio_node)
            elif var_type == VariableType.TEXT:
                # nominal / nonNumericDomain / textDomain
                nominal_node = new_child_node(names.NOMINAL, ms_node)
                non_numeric_domain_node = new_child_node(
                    names.NONNUMERICDOMAIN, nominal_node)
                text_domain_node = new_child_node(names.TEXTDOMAIN,
                                                  non_numeric_domain_node)
                definition_node = new_child_node(names.DEFINITION,
                                                 text_domain_node)
            elif var_type == VariableType.DATETIME:
                # dateTime / formatString
                datetime_node = Node(names.DATETIME, parent=ms_node)
                add_child(ms_node, datetime_node)
                format_string_node = Node(names.FORMATSTRING, parent=datetime_node)
                add_child(datetime_node, format_string_node)
                format_string_node.content = codes

    if Config.LOG_DEBUG:
        app = Flask(__name__)
        with app.app_context():
            current_app.logger.info(f'Leaving load_data_table')

    return datatable_node, column_vartypes, column_names, column_categorical_codes