# Example no. 1
# 0
    def _create_schema_from_graph(self, g):
        """Create (or fetch) the Schema row describing the RDF graph *g*.

        Scans the graph for a subject typed as ontology/class/property to
        use as the schema url, then looks up label and description triples
        on that subject.  Returns the fetched or newly created Schema, or
        None when no schema subject is found in the graph.
        """
        # identify schema attributes
        # TODO: identify rdf_url from graph instead
        # some cases where the url is not the same as the uri

        # most specific type first; the trailing None matches any
        # (s, rdf:type, o) triple as a last resort
        get_schema_keys = [OWL.Ontology, RDFS.Class, RDF.Property, None]

        rdf_url = None
        schema_subject = None

        for key in get_schema_keys:
            try:
                # g.triples returns a generator; next() raises
                # StopIteration when no triple matches this pattern
                schema_subject, _, _ = next(g.triples((None, RDF.type, key)))
                rdf_url = str(schema_subject)
                break
            except StopIteration:
                # no triple of this type in the graph -- try the next key
                pass

        if rdf_url is None:
            return None

        # candidate predicates for human readable metadata, in priority order
        label_keys = [DC.title, RDFS.label, DCTERMS.title]

        description_keys = [DC.description, DCTERMS.description]

        label = "Not available"
        description = "Not available"

        for key in label_keys:
            try:
                # only the object of the triple is of interest here
                _, _, label = next(g.triples((schema_subject, key, None)))
                break
            except StopIteration:
                pass

        for key in description_keys:
            try:
                _, _, description = next(
                    g.triples((schema_subject, key, None)))
                break
            except StopIteration:
                pass

        # only save if it does not exist already
        try:
            self.schema = Schema.objects.get(url=str(rdf_url))
        except Exception:
            self.schema = Schema(label=self.standardize_string(
                label, remove_version=True),
                                 url=str(rdf_url),
                                 description=str(description))
            if self.save_to_db:
                self.schema.save()
        return self.schema
# Example no. 2
# 0
    def test_identify_from_json_data_strava_test(self):
        """Identify schema items from raw strava activity json and check
        that the created instances match the expected label/type pairs.
        """
        from MetaDataApi.metadata.services import (
            RdfSchemaService, DataCleaningService,
            SchemaIdentificationV2, RdfInstanceService)

        from MetaDataApi.metadata.models import Schema, Object, Attribute, ObjectRelation
        from django.contrib.auth.models import User

        rdf_inst = RdfInstanceService()

        # data_cleaning = DataCleaningService()

        LoadTestData.init_foaf()

        user = LoadTestData.init_user()

        # schema = LoadTestData.init_strava_schema_from_file()

        schema = Schema(label="strava")
        schema.save()

        # objects = LoadTestData.init_strava_data_from_file()

        service = SchemaIdentificationV2()

        data = UtilsForTesting.loadStravaActivities()

        objects = service.identify_from_json_data(
            data, schema, user, parrent_label="activities")

        # NOTE(review): metaobj is overwritten on each line; presumably these
        # queries only force evaluation for debugging -- confirm intent
        metaobj = list(Object.objects.all())
        metaobj = list(ObjectRelation.objects.all())
        metaobj = list(Attribute.objects.all())

        # build "label - InstanceTypeName" strings for every created instance
        labels = list(map(lambda x: "%s - %s" %
                          (x.base.label, str(type(x).__name__)), objects))

        RdfSchemaService().export_schema_from_db(schema)

        file = RdfInstanceService().export_instances_to_rdf_file(schema, objects)

        print(schema.url)

        expected = ['activities - ObjectInstance', 'person__to__activities - ObjectRelationInstance', 'athlete - ObjectInstance', 'activities__to__athlete - ObjectRelationInstance', 'start_latlng - ObjectInstance', 'activities__to__start_latlng - ObjectRelationInstance', 'start_latlng - FloatAttributeInstance', 'end_latlng - ObjectInstance', 'activities__to__end_latlng - ObjectRelationInstance', 'end_latlng - FloatAttributeInstance', 'location_city - ObjectInstance', 'activities__to__location_city - ObjectRelationInstance', 'location_state - ObjectInstance', 'activities__to__location_state - ObjectRelationInstance', 'map - ObjectInstance', 'activities__to__map - ObjectRelationInstance', 'gear_id - ObjectInstance', 'activities__to__gear_id - ObjectRelationInstance', 'activities - ObjectInstance', 'resource_state - IntAttributeInstance', 'athlete - ObjectInstance', 'id - IntAttributeInstance', 'resource_state - IntAttributeInstance', 'name - StringAttributeInstance', 'distance - FloatAttributeInstance', 'moving_time - IntAttributeInstance', 'elapsed_time - IntAttributeInstance', 'total_elevation_gain - FloatAttributeInstance', 'type - StringAttributeInstance', 'workout_type - IntAttributeInstance', 'id - IntAttributeInstance', 'external_id - StringAttributeInstance', 'upload_id - IntAttributeInstance', 'start_date - DateTimeAttributeInstance', 'start_date_local - DateTimeAttributeInstance', 'timezone - StringAttributeInstance', 'utc_offset - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'location_country - StringAttributeInstance', 'start_latitude - FloatAttributeInstance', 'start_longitude - FloatAttributeInstance', 'achievement_count - IntAttributeInstance', 'kudos_count - IntAttributeInstance', 'comment_count - IntAttributeInstance', 'athlete_count - IntAttributeInstance', 'photo_count - IntAttributeInstance', 'map - 
ObjectInstance', 'id - StringAttributeInstance', 'summary_polyline - StringAttributeInstance', 'resource_state - IntAttributeInstance', 'trainer - BoolAttributeInstance', 'commute - BoolAttributeInstance', 'manual - BoolAttributeInstance', 'private - BoolAttributeInstance', 'visibility - StringAttributeInstance', 'flagged - BoolAttributeInstance', 'from_accepted_tag - BoolAttributeInstance', 'average_speed - FloatAttributeInstance', 'max_speed - FloatAttributeInstance', 'has_heartrate - BoolAttributeInstance', 'heartrate_opt_out - BoolAttributeInstance', 'display_hide_heartrate_option - BoolAttributeInstance', 'elev_high - FloatAttributeInstance', 'elev_low - FloatAttributeInstance', 'pr_count - IntAttributeInstance', 'total_photo_count - IntAttributeInstance', 'has_kudoed - BoolAttributeInstance', 'activities - ObjectInstance', 'resource_state - IntAttributeInstance', 'athlete - ObjectInstance', 'id - IntAttributeInstance', 'resource_state - IntAttributeInstance', 'name - StringAttributeInstance', 'distance - FloatAttributeInstance', 'moving_time - IntAttributeInstance', 'elapsed_time - IntAttributeInstance', 'total_elevation_gain - FloatAttributeInstance', 'type - StringAttributeInstance', 'workout_type - IntAttributeInstance', 'id - IntAttributeInstance', 'external_id - StringAttributeInstance', 'upload_id - IntAttributeInstance', 'start_date - DateTimeAttributeInstance', 'start_date_local - DateTimeAttributeInstance', 'timezone - StringAttributeInstance', 'utc_offset - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'location_country - StringAttributeInstance', 'start_latitude - FloatAttributeInstance', 'start_longitude - FloatAttributeInstance', 'achievement_count - IntAttributeInstance', 'kudos_count - IntAttributeInstance', 'comment_count - IntAttributeInstance', 'athlete_count - IntAttributeInstance', 'photo_count - 
IntAttributeInstance', 'map - ObjectInstance', 'id - StringAttributeInstance', 'summary_polyline - StringAttributeInstance', 'resource_state - IntAttributeInstance', 'trainer - BoolAttributeInstance', 'commute - BoolAttributeInstance', 'manual - BoolAttributeInstance', 'private - BoolAttributeInstance', 'visibility - StringAttributeInstance', 'flagged - BoolAttributeInstance', 'from_accepted_tag - BoolAttributeInstance', 'average_speed - FloatAttributeInstance', 'max_speed - FloatAttributeInstance', 'has_heartrate - BoolAttributeInstance', 'heartrate_opt_out - BoolAttributeInstance', 'display_hide_heartrate_option - BoolAttributeInstance', 'elev_high - FloatAttributeInstance', 'elev_low - FloatAttributeInstance', 'pr_count - IntAttributeInstance', 'total_photo_count - IntAttributeInstance', 'has_kudoed - BoolAttributeInstance', 'activities - ObjectInstance', 'resource_state - IntAttributeInstance', 'athlete - ObjectInstance', 'id - IntAttributeInstance', 'resource_state - IntAttributeInstance', 'name - StringAttributeInstance', 'distance - FloatAttributeInstance', 'moving_time - IntAttributeInstance', 'elapsed_time - IntAttributeInstance', 'total_elevation_gain - FloatAttributeInstance', 'type - StringAttributeInstance', 'workout_type - IntAttributeInstance', 'id - IntAttributeInstance', 'external_id - StringAttributeInstance', 'upload_id - IntAttributeInstance', 'start_date - DateTimeAttributeInstance', 'start_date_local - DateTimeAttributeInstance', 'timezone - StringAttributeInstance', 'utc_offset - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'location_country - StringAttributeInstance', 'start_latitude - FloatAttributeInstance', 'start_longitude - FloatAttributeInstance', 'achievement_count - IntAttributeInstance', 'kudos_count - IntAttributeInstance', 'comment_count - IntAttributeInstance', 'athlete_count - 
IntAttributeInstance', 'photo_count - IntAttributeInstance', 'map - ObjectInstance', 'id - StringAttributeInstance', 'summary_polyline - StringAttributeInstance', 'resource_state - IntAttributeInstance', 'trainer - BoolAttributeInstance', 'commute - BoolAttributeInstance', 'manual - BoolAttributeInstance', 'private - BoolAttributeInstance', 'visibility - StringAttributeInstance', 'flagged - BoolAttributeInstance', 'from_accepted_tag - BoolAttributeInstance',
                    'average_speed - FloatAttributeInstance', 'max_speed - FloatAttributeInstance', 'has_heartrate - BoolAttributeInstance', 'heartrate_opt_out - BoolAttributeInstance', 'display_hide_heartrate_option - BoolAttributeInstance', 'elev_high - FloatAttributeInstance', 'elev_low - FloatAttributeInstance', 'pr_count - IntAttributeInstance', 'total_photo_count - IntAttributeInstance', 'has_kudoed - BoolAttributeInstance', 'activities - ObjectInstance', 'resource_state - IntAttributeInstance', 'athlete - ObjectInstance', 'id - IntAttributeInstance', 'resource_state - IntAttributeInstance', 'name - StringAttributeInstance', 'distance - FloatAttributeInstance', 'moving_time - IntAttributeInstance', 'elapsed_time - IntAttributeInstance', 'total_elevation_gain - FloatAttributeInstance', 'type - StringAttributeInstance', 'workout_type - IntAttributeInstance', 'id - IntAttributeInstance', 'external_id - StringAttributeInstance', 'upload_id - IntAttributeInstance', 'start_date - DateTimeAttributeInstance', 'start_date_local - DateTimeAttributeInstance', 'timezone - StringAttributeInstance', 'utc_offset - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'location_country - StringAttributeInstance', 'start_latitude - FloatAttributeInstance', 'start_longitude - FloatAttributeInstance', 'achievement_count - IntAttributeInstance', 'kudos_count - IntAttributeInstance', 'comment_count - IntAttributeInstance', 'athlete_count - IntAttributeInstance', 'photo_count - IntAttributeInstance', 'map - ObjectInstance', 'id - StringAttributeInstance', 'resource_state - IntAttributeInstance', 'trainer - BoolAttributeInstance', 'commute - BoolAttributeInstance', 'manual - BoolAttributeInstance', 'private - BoolAttributeInstance', 'visibility - StringAttributeInstance', 'flagged - BoolAttributeInstance', 'from_accepted_tag - BoolAttributeInstance', 
'average_speed - FloatAttributeInstance', 'max_speed - FloatAttributeInstance', 'has_heartrate - BoolAttributeInstance', 'heartrate_opt_out - BoolAttributeInstance', 'display_hide_heartrate_option - BoolAttributeInstance', 'elev_high - FloatAttributeInstance', 'elev_low - FloatAttributeInstance', 'pr_count - IntAttributeInstance', 'total_photo_count - IntAttributeInstance', 'has_kudoed - BoolAttributeInstance', 'activities - ObjectInstance', 'resource_state - IntAttributeInstance', 'athlete - ObjectInstance', 'id - IntAttributeInstance', 'resource_state - IntAttributeInstance', 'name - StringAttributeInstance', 'distance - FloatAttributeInstance', 'moving_time - IntAttributeInstance', 'elapsed_time - IntAttributeInstance', 'total_elevation_gain - FloatAttributeInstance', 'type - StringAttributeInstance', 'workout_type - IntAttributeInstance', 'id - IntAttributeInstance', 'external_id - StringAttributeInstance', 'upload_id - IntAttributeInstance', 'start_date - DateTimeAttributeInstance', 'start_date_local - DateTimeAttributeInstance', 'timezone - StringAttributeInstance', 'utc_offset - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'location_country - StringAttributeInstance', 'start_latitude - FloatAttributeInstance', 'start_longitude - FloatAttributeInstance', 'achievement_count - IntAttributeInstance', 'kudos_count - IntAttributeInstance', 'comment_count - IntAttributeInstance', 'athlete_count - IntAttributeInstance', 'photo_count - IntAttributeInstance', 'map - ObjectInstance', 'id - StringAttributeInstance', 'summary_polyline - StringAttributeInstance', 'resource_state - IntAttributeInstance', 'trainer - BoolAttributeInstance', 'commute - BoolAttributeInstance', 'manual - BoolAttributeInstance', 'private - BoolAttributeInstance', 'visibility - StringAttributeInstance', 'flagged - BoolAttributeInstance', 'from_accepted_tag - 
BoolAttributeInstance', 'average_speed - FloatAttributeInstance', 'max_speed - FloatAttributeInstance', 'has_heartrate - BoolAttributeInstance', 'heartrate_opt_out - BoolAttributeInstance', 'display_hide_heartrate_option - BoolAttributeInstance', 'elev_high - FloatAttributeInstance', 'elev_low - FloatAttributeInstance', 'pr_count - IntAttributeInstance', 'total_photo_count - IntAttributeInstance', 'has_kudoed - BoolAttributeInstance', 'activities - ObjectInstance', 'resource_state - IntAttributeInstance', 'athlete - ObjectInstance', 'id - IntAttributeInstance', 'resource_state - IntAttributeInstance', 'name - StringAttributeInstance', 'distance - FloatAttributeInstance', 'moving_time - IntAttributeInstance', 'elapsed_time - IntAttributeInstance', 'total_elevation_gain - FloatAttributeInstance', 'type - StringAttributeInstance', 'workout_type - IntAttributeInstance', 'id - IntAttributeInstance', 'external_id - StringAttributeInstance', 'upload_id - IntAttributeInstance', 'start_date - DateTimeAttributeInstance', 'start_date_local - DateTimeAttributeInstance', 'timezone - StringAttributeInstance', 'utc_offset - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'start_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'end_latlng - FloatAttributeInstance', 'location_country - StringAttributeInstance', 'start_latitude - FloatAttributeInstance', 'start_longitude - FloatAttributeInstance', 'achievement_count - IntAttributeInstance', 'kudos_count - IntAttributeInstance', 'comment_count - IntAttributeInstance', 'athlete_count - IntAttributeInstance', 'photo_count - IntAttributeInstance', 'map - ObjectInstance', 'id - StringAttributeInstance', 'summary_polyline - StringAttributeInstance', 'resource_state - IntAttributeInstance', 'trainer - BoolAttributeInstance', 'commute - BoolAttributeInstance', 'manual - BoolAttributeInstance', 'private - BoolAttributeInstance', 'visibility - StringAttributeInstance', 'flagged - 
BoolAttributeInstance', 'from_accepted_tag - BoolAttributeInstance', 'average_speed - FloatAttributeInstance', 'max_speed - FloatAttributeInstance', 'has_heartrate - BoolAttributeInstance', 'heartrate_opt_out - BoolAttributeInstance', 'display_hide_heartrate_option - BoolAttributeInstance', 'elev_high - FloatAttributeInstance', 'elev_low - FloatAttributeInstance', 'pr_count - IntAttributeInstance', 'total_photo_count - IntAttributeInstance', 'has_kudoed - BoolAttributeInstance']

        # order of creation is irrelevant, so compare sorted lists
        labels.sort()
        expected.sort()

        self.assertEqual(labels, expected)
# Example no. 3
# 0
    def create_new_empty_schema(self, schema_label):
        """Create, save and return a new empty Schema named *schema_label*.

        An empty placeholder .ttl file is attached so the schema ends up
        with a file-backed url; "temp" is stored first because the url is
        only known after the file has been saved (and the url field is
        presumably unique -- see the comment below).
        """
        self.schema = Schema()
        self.schema.label = self.standardize_string(schema_label)
        self.schema.description = ""
        self.schema.url = "temp"
        # quick fix for saving without conflicting with unique url

        # create a dummy file
        content = ContentFile("")
        self.schema.rdfs_file.save(schema_label + ".ttl", content)
        self.schema.url = self.schema.rdfs_file.url
        self.schema.save()

        # record the schema so callers can inspect what was touched
        self.touched_meta_items.append(self.schema)

        return self.schema
# Example no. 4
# 0
    def test_identify_json_data_sample(self):
        """Map a remote open_m_health body-temperature sample onto native
        instances and verify the expected instance count.
        """
        from MetaDataApi.metadata.services import (
            SchemaIdentificationV2)

        from MetaDataApi.metadata.models import Schema, Object

        LoadTestData.init_foaf()

        LoadTestData.init_open_m_health_sample(extras=[
            "body-temperature-2.0.json",
            "body-temperature-2.x.json",
        ])

        # sample document hosted in the project repository
        url = "https://raw.githubusercontent.com/Grusinator/MetaDataApi" + \
            "/master/schemas/json/omh/test_data/body-temperature/2.0/" + \
            "shouldPass/valid-temperature.json"

        obj_count = Object.objects.all().count()
        # make sure that the number of objects is larger than
        if obj_count < 10:
            raise AssertionError("database not populated")

        # NOTE(review): presumably urllib's `request` imported at module
        # level -- confirm at the top of the original file
        with request.urlopen(url) as resp:
            text = resp.read().decode()

        service = SchemaIdentificationV2()
        schema = service._try_get_item(Schema(label="open_m_health"))

        input_data = {
            "body-temperature": json.loads(text)
        }

        objs = service.map_data_to_native_instances(input_data, schema)

        # four native instances are expected from this sample document
        self.assertEqual(len(objs), 4)
# Example no. 5
# 0
    def init_strava_data_from_file():
        """Load the bundled strava activities json file and map it onto
        native instances owned by the test user.

        Returns the list of created instance objects.
        """
        # NOTE(review): defined without *self*; presumably decorated as a
        # @staticmethod on the enclosing class -- confirm at the definition
        # site.
        # only the names actually used are imported here
        from MetaDataApi.metadata.services import SchemaIdentificationV2
        from MetaDataApi.metadata.models import Schema

        user = LoadTestData.init_user()

        service = SchemaIdentificationV2()

        # load the file
        testfile = os.path.join(
            settings.BASE_DIR,
            "MetaDataApi/metadata/tests/data/json/strava_activities.json")
        with open(testfile) as f:
            data = json.load(f)

        schema = service._try_get_item(Schema(label="strava"))

        label = "activities"

        objects = service.map_data_to_native_instances(
            data, schema, parrent_label=label, owner=user)

        return objects
    def test_(self):
        """Build two foaf person instances linked by a "knows" relation,
        export them to an rdf file and check the file got a url.
        """
        from MetaDataApi.metadata.services import (RdfSchemaService,
                                                   RdfInstanceService)

        from MetaDataApi.metadata.models import (Object, Schema, Attribute,
                                                 ObjectRelation)

        from MetaDataApi.metadata.models import (RawData, CategoryTypes,
                                                 ObjectInstance,
                                                 ObjectRelationInstance,
                                                 FloatAttributeInstance,
                                                 StringAttributeInstance)

        LoadTestData.init_foaf()

        service = RdfInstanceService()

        schema_label = "friend_of_a_friend"

        schema = service._try_get_item(Schema(label=schema_label))

        # NOTE(review): s is never used afterwards; presumably kept for
        # debugging -- confirm before removing
        foaf_atts = Attribute.objects.filter(object__schema=schema)
        s = list(filter(lambda x: x.label, foaf_atts))

        foaf_person = service.get_foaf_person()
        foaf_name = Attribute.objects.get(label="first_name",
                                          object__schema=schema)

        foaf_knows = ObjectRelation.objects.get(label="knows", schema=schema)

        # two person instances ...
        b1 = ObjectInstance(base=foaf_person)
        b2 = ObjectInstance(base=foaf_person)
        b1.save()
        b2.save()

        # ... each with a first_name attribute ...
        name1 = StringAttributeInstance(base=foaf_name, object=b1, value="B1")
        name2 = StringAttributeInstance(base=foaf_name, object=b2, value="B2")
        name1.save()
        name2.save()

        # ... and a "b1 knows b2" relation between them
        rel1 = ObjectRelationInstance(base=foaf_knows,
                                      from_object=b1,
                                      to_object=b2)

        rel1.save()

        objects = [b1, b2, name1, name2, rel1]

        rdf_file = service.export_instances_to_rdf_file(schema, objects)

        self.assertIsNotNone(rdf_file.url)
    def test_identify_json_data_sample(self):
        """Relate the open_m_health root classes to foaf.

        NOTE(review): this shares its name with another test method in
        this dump; if both end up on one TestCase the later definition
        shadows the earlier one -- confirm at the class level.
        """
        from MetaDataApi.metadata.services.data_cleaning_service import (
            DataCleaningService)

        from MetaDataApi.metadata.models import Schema, Object

        dc_service = DataCleaningService()

        schema_label = "open_m_health"
        schema = dc_service._try_get_item(Schema(label=schema_label))

        dc_service.relate_root_classes_to_foaf(schema)

        # smoke test: reaching this point without an exception is the check
        self.assertEqual(1 + 1, 2)
    def write_to_db(self, input_url, schema_label):
        """Fetch a json schema document from *input_url* and persist it.

        Creates the schema row when it does not exist yet, walks the json
        document, and returns the list of meta items touched during the
        import (accumulated on self.touched_meta_items).
        """
        self.baseurl, filename = self._infer_info_split_url(input_url)
        label = filename.replace(".json", "")

        data = self._read_json_from_url(input_url)

        schema_label = self.standardize_string(schema_label,
                                               remove_version=False)

        self.schema = self._try_get_item(Schema(label=schema_label))
        if not self.schema:
            self.schema = self.create_new_empty_schema(schema_label)

        # called for its side effects only; touched items are collected on
        # self.touched_meta_items (the return value was never used)
        self._iterate_schema(data, label, filename=filename)

        # release the schema reference so the service can be reused
        self.schema = None

        return self.touched_meta_items
# Example no. 9
# 0
class BaseMetaDataService():
    def __init__(self):
        # super(BaseMetaDataService, self).__init__()
        # currently processed schema, its base url, and a cached foaf person
        self.schema = None
        self.baseurl = None
        self.foaf_person = None

        # bookkeeping of what a service run created or touched
        self.added_meta_items = []
        self.touched_meta_items = []
        self.added_instance_items = []
        # (item, exception) pairs collected instead of raising
        self._error_list = []
        # one switch to force overwrite the objects when
        self.overwrite_db_objects = False
        # -- same -- but to disable saving to db
        self.save_to_db = True

        # don't reuse an item even if it already exists
        self.allways_create_new = False

        # maps attribute-instance model classes to the python type they hold
        self.att_inst_to_type_map = {
            #  StringAttributeInstance: str,
            StringAttributeInstance: str,
            DateTimeAttributeInstance: datetime,
            FloatAttributeInstance: float,
            IntAttributeInstance: int,
            BoolAttributeInstance: bool
        }

        # tuple of python types an attribute value may have
        self.att_types = tuple(typ if isinstance(typ, type) else type(typ)
                               for typ in Attribute.data_type_map.keys())

        self.att_instances = tuple(self.att_inst_to_type_map.keys())

        # every instance model class handled by this service
        self.instances = self.att_instances + \
            (ObjectInstance, ObjectRelationInstance)

    def inverse_dict(self, dicti, value):
        """Return the first key in *dicti* that maps to *value*, or None.

        When several keys share the same value, only the first (in dict
        insertion order) is returned.
        """
        try:
            values = list(dicti.values())
            index = values.index(value)
        except (ValueError, AttributeError):
            # ValueError: value not present; AttributeError: dicti is not
            # dict-like.  Narrowed from a blanket Exception so genuine
            # bugs are no longer hidden.
            return None
        return list(dicti.keys())[index]

    def standardize_string(self, string, remove_version=False):
        """Normalize a label string: snake_case, no ".json" suffix, no
        spaces, optionally stripped of version tags.

        Returns the cleaned string.
        """
        # inflection.underscore converts CamelCase to snake_case
        string = inflection.underscore(str(string))
        string = string.replace(".json", "")

        string = string.replace(" ", "_")

        # remove any version numbers
        # NOTE(review): the "_v." alternative uses an unescaped dot, so it
        # matches "_v" followed by ANY character -- confirm this is intended
        if remove_version:
            string = re.sub(r"(|_version|_v|_v.)(|_)\d+\.(\d+|x)(|_)", '',
                            string)

        # drop a literal "vocabulary" token (with optional underscores)
        string = re.sub("(|_)vocabulary(|_)", '', string)

        # remove parenthesis with content
        string = re.sub(r'(|_)\([^)]*\)', '', string)

        # remove trailing and leading whitespace/underscore
        # string = re.sub('/^[\W_]+|[\W_]+$/', '', string)

        return string

    def rest_endpoint_to_label(self, endpoint):
        # TODO the last might not be the most relevant
        endpoint_without_args = endpoint.split("?")[0]
        last_elm = endpoint_without_args.split("/")[-1]
        return self.standardize_string(last_elm)

    def create_new_empty_schema(self, schema_label):
        """Build, persist and return a fresh, empty Schema.

        The url is filled with "temp" first and replaced with the url of
        an empty placeholder .ttl file once that file has been saved, to
        avoid clashing with the unique url constraint.
        """
        self.schema = schema = Schema()
        schema.label = self.standardize_string(schema_label)
        schema.description = ""
        # temporary value until the placeholder file url is known
        schema.url = "temp"

        # attach an empty dummy rdfs file and adopt its url
        placeholder = ContentFile("")
        schema.rdfs_file.save(schema_label + ".ttl", placeholder)
        schema.url = schema.rdfs_file.url
        schema.save()

        # record the schema so callers can inspect what was touched
        self.touched_meta_items.append(schema)

        return schema

    def is_meta_item_in_created_list(self, item, item_list=None):
        item_list = item_list or self.touched_meta_items

        # new __eq__implementation
        return next(filter(item.__eq__, item_list), None)

        # old rubbish
        same_labels = filter(lambda x: item.label == x.label, item_list)

        if isinstance(item, Object):
            for same_label in same_labels:
                if same_label.to_relations.all().count() == 0:
                    return True
                for relation in same_label.to_relations.all():
                    item_to_relation_objects = [
                        rel.to_object for rel in item.from_relations.all()
                    ]
                    if relation.to_object in item_to_relation_objects:
                        return True
        elif isinstance(item, Attribute):
            for same_label in same_labels:
                if same_label.object == item.object:
                    return True
        elif isinstance(item, ObjectRelation):
            for in_list in item_list:
                if in_list.label == item.label:
                    return True
        else:
            raise Exception()

        return False

    def dict_contains_only_attr(self, data):
        """Return True when *data* is a non-empty dict whose keys are a
        subset of {"value", "unit"} -- i.e. it only describes an attribute.
        """
        # anything that is not a dict cannot be an attribute mapping
        if not isinstance(data, dict):
            return False

        # an empty dict carries no attribute information
        if not data:
            return False

        # pop the attribute keys off a copy so the caller's dict is not
        # mutated (the previously unused `attrs` binding is gone)
        remaining = data.copy()
        for name in ("value", "unit"):
            remaining.pop(name, None)

        # only attribute keys were present iff nothing is left
        return len(remaining) == 0
    def identify_data_type(self, element):
        """Coerce *element* to its most specific python type.

        Strings are tried as float, int, datetime and bool (in that
        order) and fall back to str; float/int/bool values pass through
        unchanged.  Returns None for None -- and, implicitly, for any
        other unhandled type such as list or dict.
        """
        if element is None:
            return None

        def test_float(elm):
            # require an explicit decimal separator so plain integers
            # are not misclassified as floats
            assert ("." in elm), "does not contain decimal separator"
            return float(elm)

        def test_bool(elm):
            trues = ("true", "True")
            falses = ("false", "False")

            if elm in trues:
                return True
            elif elm in falses:
                return False
            else:
                raise ValueError("is not either true or false")

        def test_datetime(text):
            # first let dateutil try; fall back to explicit formats
            try:
                return dateutil.parser.parse(text)
            except:

                datetime_formats = (
                    '%Y-%m-%dT%H: %M: %SZ',  # strava
                )

                for fmt in datetime_formats:
                    try:
                        return datetime.strptime(text, fmt)
                    except ValueError as e:
                        pass

                raise ValueError('no valid date format found')

        # even though it is a string,
        # it might really be a int or float
        # so if string verify!!
        if isinstance(element, str):
            conv_functions = {
                float: test_float,
                int: lambda elm: int(elm),
                datetime: test_datetime,
                str: lambda elm: str(elm),
                bool: test_bool
            }

            # most specific first; str acts as the catch-all at the end
            order = [float, int, datetime, bool, str]

            for typ in order:
                try:
                    # try the converting function of that type
                    # if it doesnt fail, thats our type
                    return conv_functions[typ](element)
                except (ValueError, AssertionError) as e:
                    pass

            # if nothing else works, return as string
            return str(element)

        elif isinstance(element, (float, int, bool)):
            # otherwise just return the type of
            return element

    def _try_get_item(self, item, parrent_label=None):
        """Look up the database row matching *item*, or return None.

        The search keys depend on the concrete type: meta items match on
        label plus type-specific relations, instances match on pk.  When
        several rows match, a warning is printed, the error is recorded
        on self._error_list and the first match is returned.
        """
        # standardize label string
        if hasattr(item, "label"):
            item.label = self.standardize_string(item.label)

        search_args = {}

        item_type = type(item)
        # meta vs instances
        if isinstance(item, (Attribute, Object, ObjectRelation, Schema)):
            search_args["label"] = item.label

        # instance only look for primary key
        elif isinstance(item, self.instances):
            # search_args["base__label"] = item.base.label
            search_args["pk"] = item.pk

        # individual metaobjects
        if isinstance(item, Attribute):
            search_args["object__label"] = item.object.label

        elif isinstance(item, Object):
            search_args["schema"] = item.schema
            # adds the option to search for objects dependent
            # on from relations
            if parrent_label and parrent_label == "None":
                search_args["from_relations"] = None
            elif parrent_label:
                search_args["from_relations__from_object__label"]\
                    = parrent_label
            # search_args["from_relations"] = item.from_relations

        elif isinstance(item, ObjectRelation):
            search_args["from_object"] = item.from_object
            search_args["to_object"] = item.to_object

        # # individual instances
        # elif isinstance(item, self.att_instances):
        #     search_args["object__base__label"] = item.object.base.label

        # elif isinstance(item, ObjectInstance):
        #     search_args["base__schema"] = item.base.schema
        #     # adds the option to search for objects dependent
        #     # on from relations
        #     if parrent_label and parrent_label == "None":
        #         search_args["from_relations"] = None
        #     elif parrent_label:
        #         search_args["from_relations__from_object__base__label"]\
        #             = parrent_label

        #     # search_args["from_relations"] = item.from_relations

        # elif isinstance(item, ObjectRelation):
        #     search_args["from_object"] = item.from_object
        #     search_args["to_object"] = item.to_object

        try:
            # this "with transaction.atomic():"
            # is used to make tests run due to some
            # random error, atomic something.
            # it works fine when runs normally

            with transaction.atomic():
                return item_type.objects.get(**search_args)

        except ObjectDoesNotExist as e:
            return None
        except MultipleObjectsReturned as e:
            self._error_list.append((item, e))
            if hasattr(item, "schema"):
                schema_label = item.schema.label
            else:
                schema_label = item.object.schema.label
            print("""Warning, this is most likely wrong wrong, the object
                found:  %s  objects, but the first was chosen.
                -- label: %s schema: %s""" % (
                item_type.objects.filter(**search_args).count(),
                item.label,
                schema_label,
            ))

            return item_type.objects.filter(**search_args).first()

        except Exception as e:
            # NOTE(review): any other db error is silently swallowed and
            # None is returned implicitly -- consider logging or re-raising
            pass

    def _try_create_item(self, item, update=False, parrent_label=None):
        """Fetch-or-create a metadata item in the database.

        If an equivalent item already exists it is returned (and optionally
        replaced when ``update`` or ``self.overwrite_db_objects`` is set);
        otherwise the item is saved (when ``self.save_to_db``).  Returns the
        live database item on success, or ``None`` on any failure (failures
        are recorded in ``self._error_list`` where relevant).
        """
        item_type = type(item)
        # BUG fix: this used `isinstance(item_type, Schema)`, which tests the
        # *class object* against Schema and is always False -- so version
        # suffixes were stripped from Schema labels too, against the intent.
        remove_version = not isinstance(item, Schema)

        item.label = self.standardize_string(item.label,
                                             remove_version=remove_version)
        try:
            # the atomic block works around a sporadic transaction error that
            # only shows up under the test runner; it is a no-op otherwise
            with transaction.atomic():

                # match on all identifying fields, not only the label, so we
                # can tell whether we would be overwriting a different object
                return_item = self._try_get_item(item,
                                                 parrent_label=parrent_label)

                if return_item:
                    # the object already exists
                    if update or self.overwrite_db_objects:
                        if self.save_to_db:
                            return_item.delete()
                            item.save()
                        else:
                            # not persisting: hand back the fetched one
                            item = return_item
                    else:
                        item = return_item
                else:
                    # does not exist, create it!
                    try:
                        if self.save_to_db:
                            item.save()
                            self.added_meta_items.append(item)
                    except Exception as e:
                        # on failure record for debugging and bail out
                        self._error_list.append((item, e))
                        return None

            # on success return the item, either fetched or saved, so the
            # referenced object lives in the database
            self.touched_meta_items.append(item)
            return item

        except transaction.TransactionManagementError:
            return None
        except Exception:
            # best-effort: any other error is treated as "could not create"
            return None

    def is_objects_connected(self, obj_from, obj_to, objects, _visited=None):
        """Return True if ``obj_to`` is reachable from ``obj_from`` by
        following ``from_relations`` edges, restricted to ``objects``.

        ``_visited`` is internal: a set of already-expanded objects that
        guards against infinite recursion when the relation graph contains
        a cycle (the original implementation recursed forever there).
        """
        if _visited is None:
            _visited = set()
        if obj_from in _visited:
            return False
        _visited.add(obj_from)

        relations = obj_from.from_relations.all()
        neighbours = [rel.to_object.get() for rel in relations]
        # only follow edges into the candidate set
        neighbours = [o for o in neighbours if o in objects]

        for neighbour in neighbours:
            if neighbour == obj_to:
                return True
            if self.is_objects_connected(neighbour, obj_to, objects,
                                         _visited=_visited):
                return True

        return False

    def get_foaf_person(self):
        """Lazily fetch and cache the foaf "person" Object."""
        if not self.foaf_person:
            foaf_schema = Schema.objects.get(label="friend_of_a_friend")
            self.foaf_person = Object.objects.get(
                label="person", schema=foaf_schema)
        return self.foaf_person

    def att_to_att_inst(self, attr):
        """Map an attribute's data_type to its attribute-instance class.

        First inverts the data_type mapping to recover the type key, then
        inverts the instance/type map to get the instance class.
        """
        type_key = self.inverse_dict(Attribute.data_type_map, attr.data_type)
        return self.inverse_dict(self.att_inst_to_type_map, type_key)

    def get_connected_attribute_pairs(self, att_1, att_2):
        """Pair up instance values of att_1 and att_2 that hang off the
        same ancestor instance.

        Both attributes are traced back to the foaf person object; the
        first object common to both paths is used as the join point, and
        one (value1, value2) tuple is produced per instance of it.
        """
        _, path1 = BaseMetaDataService.path_to_object(
            att_1, self.get_foaf_person(), childrens=[])

        _, path2 = BaseMetaDataService.path_to_object(
            att_2, self.get_foaf_person(), childrens=[])

        # first object that appears on both paths
        shared = set(path1) & set(path2)
        common_obj = next(node for node in path1 if node in shared)

        # truncate both paths down to (and including) the join point
        path1 = path1[:path1.index(common_obj) + 1]
        path2 = path2[:path2.index(common_obj) + 1]

        pairs = []
        for common_instance in ObjectInstance.objects.filter(base=common_obj):
            value1 = self.get_specific_child(common_instance,
                                             att_1,
                                             path=path1)
            value2 = self.get_specific_child(common_instance,
                                             att_2,
                                             path=path2)
            pairs.append((value1, value2))

        return pairs

    def get_specific_child(self, obj_inst, child, path=None):
        """
        Get a descendant of ``obj_inst`` that instantiates attribute
        ``child``, following ``path`` (object chain from child to the
        instance's base).  Returns the attribute instance, or None when
        absent; when several match, the first is returned with a warning.
        """
        if path is None:
            # BUG fix: path_to_object returns (obj, childrens) -- the path
            # list is the second element.  Pass childrens=[] explicitly to
            # avoid the shared mutable default.
            _, path = self.path_to_object(child, obj_inst.base, childrens=[])

        search_args = self.build_search_args_from_list(path, obj_inst)
        attribute_instance_model = self.att_to_att_inst(child)
        try:
            return attribute_instance_model.objects.get(**search_args)
        except ObjectDoesNotExist:
            return None
        except MultipleObjectsReturned:
            print("WARNING: obj, contains multiple object, first is taken")
            # BUG fix: next() on a QuerySet raises TypeError (QuerySets are
            # iterables, not iterators) -- use .first() instead.
            return attribute_instance_model.objects.filter(
                **search_args).first()

    def build_search_args_from_list(self, path, obj_inst):
        """Translate an object path into Django ORM lookup kwargs.

        Walks the path, growing a relation-lookup prefix for each
        intermediate object, pinning the attribute's owning object label
        and the primary key of the root instance.
        """
        relation_step = "from_relations__from_object__"
        lookups = {}
        prefix = ""
        for node in path:
            if isinstance(node, Attribute):
                prefix += "object__"
                lookups["base__label"] = node.label
            elif node == obj_inst.base:
                # reached the root: constrain on its primary key
                lookups[prefix + "pk"] = obj_inst.pk
            else:
                # intermediate object: just extend the relation prefix
                # (labels are redundant once the pk is constrained)
                prefix += relation_step

        return lookups

    @staticmethod
    def path_to_object(obj, root_obj, childrens=None):
        """Find a path from ``obj`` up to ``root_obj`` through
        ``from_relations``.

        Returns ``(root_obj, path_list)`` on success or
        ``(None, path_list)`` when this branch is exhausted.

        BUG fix: the default for ``childrens`` was the mutable literal
        ``[]``, which is shared between calls in Python, so successive
        calls accumulated each other's paths.
        """
        if childrens is None:
            childrens = []
        if isinstance(obj, Attribute):
            # attributes live on an object; record and step up to it
            childrens.append(obj)
            obj = obj.object
        if obj == root_obj:
            return obj, childrens
        else:
            parrent_rels = obj.from_relations.all()
            childrens.append(obj)
            for parrent_rel in parrent_rels:
                parrent_obj = parrent_rel.from_object

                obj, childrens = BaseMetaDataService.path_to_object(
                    parrent_obj, root_obj, childrens=childrens)

                if obj == root_obj:
                    return obj, list(childrens)

            # this branch has been exhausted, return none
            return None, childrens
 def get_related_schema(self):
     """Return the Schema named after the current data provider, creating
     an empty one when it does not exist yet."""
     # NOTE(review): indentation of this fragment looks mangled by the
     # scrape -- confirm it belongs to the surrounding class.
     schema = self._try_get_item(
         Schema(label=self.dataprovider.provider_name))
     return schema or self.create_new_empty_schema(
         self.dataprovider.provider_name)
Exemplo n.º 11
0
class RdfSchemaService(BaseRdfSchemaService):
    def __init__(self):
        """Initialise via the base rdf schema service."""
        super().__init__()

    def export_schema_from_db(self, schema):
        """Serialize ``schema`` -- its objects, their attributes and the
        schema's object relations -- to turtle, attach the .ttl file to the
        schema and return it.

        Everything exported is appended to ``self.touched_meta_items`` so
        callers can inspect what was written.
        """
        g = Graph()
        # reset objects created (exported)
        self.touched_meta_items = []

        self.schema = schema

        # to know which have been exported
        self.touched_meta_items.append(self.schema)

        objects = Object.objects.filter(schema=self.schema)

        # to know which have been exported
        self.touched_meta_items.extend(objects)

        namespace = self.schema.url.replace(".ttl", "#")

        Ontology = Namespace(namespace)
        g.bind(schema.label, Ontology)

        rdf_schema = URIRef(Ontology)

        # define the ontology
        g.add((rdf_schema, RDF.type, OWL.Ontology))
        g.add((rdf_schema, DC.title, Literal(self.schema.label)))
        g.add((rdf_schema, DC.description, Literal(self.schema.description)))

        for obj in objects:
            # make sure that there is no space in the url, and the object is
            # unique
            obj_name = self.create_uri_ref(obj)

            # type
            g.add((obj_name, RDF.type, RDFS.Class))
            # label and description
            g.add((obj_name, RDFS.label, Literal(obj.label)))
            g.add((obj_name, RDFS.comment, Literal(obj.description)))
            # is defined by what schema
            g.add((obj_name, RDFS.isDefinedBy, rdf_schema))

            attributes = obj.attributes.all()
            # BUG fix: this extend() used to sit inside the attribute loop,
            # adding the whole attribute list once per attribute and filling
            # touched_meta_items with duplicates.
            self.touched_meta_items.extend(attributes)

            for attribute in attributes:

                # make sure that there is no space in the url
                attribute_name = self.create_uri_ref(attribute)

                g.add((attribute_name, RDF.type, RDF.Property))

                # this one relates the attribute to the object or domain
                g.add((attribute_name, RDFS.domain, obj_name))

                rdf_data_type = self.att_type_to_rdfs_uri(attribute.data_type)

                # data_type
                g.add((attribute_name, RDFS.range, rdf_data_type))

                # label and description
                g.add((attribute_name, RDFS.label, Literal(attribute.label)))
                g.add((attribute_name, RDFS.comment,
                       Literal(attribute.description)))
                # defined by
                g.add((attribute_name, RDFS.isDefinedBy, rdf_schema))

        relations = ObjectRelation.objects.filter(schema=self.schema)
        # to know which have been exported
        self.touched_meta_items.extend(relations)

        for relation in relations:

            # the "R_" is to avoid naming conflict with classes
            relation_name = self.create_uri_ref(relation)

            # make sure that there is no space in the url
            from_object_name = self.create_uri_ref(relation.from_object)
            to_object_name = self.create_uri_ref(relation.to_object)

            # a relation is a property with the domain of the from_object
            # and the range of the to_object
            g.add((relation_name, RDF.type, RDF.Property))
            g.add((relation_name, RDFS.domain, from_object_name))
            g.add((relation_name, RDFS.range, to_object_name))

            # label and description
            g.add((relation_name, RDFS.label, Literal(relation.label)))
            g.add((relation_name, RDFS.comment, Literal(relation.description)))
            # defined by
            g.add((relation_name, RDFS.isDefinedBy, rdf_schema))

        ttl_data = g.serialize(format='turtle')

        content = ContentFile(ttl_data)
        self.schema.rdfs_file.save(self.schema.label + ".ttl", content)

        self.schema.save()

        return self.schema

    def write_to_db_baseschema(self):
        """Load every default schema url and persist it: schemas first,
        then objects, then object references, then attributes, so that
        cross-schema lookups in the later passes can succeed."""
        graphs = [self._create_graph_from_url(url)
                  for url in self.default_list]

        # plain loops instead of comprehensions: these calls are executed
        # purely for their side effects
        for g in graphs:
            self._create_schema_from_graph(g)

        for g in graphs:
            self._create_objects_from_graph(g)

        for g in graphs:
            self._create_object_references_from_graphV2(g)

        for g in graphs:
            self._create_attributes_from_graph(g)

    def read_objects_from_rdfs(self, rdf_url):
        """Parse a schema from ``rdf_url`` without persisting anything and
        return the metadata items that were touched."""
        # dry-run: build everything in memory only
        self.save_to_db = False

        graph = self._create_graph_from_url(rdf_url)

        self.schema = self._create_schema_from_graph(graph)
        self._create_objects_from_graph(graph)
        self._create_object_references_from_graphV2(graph)
        self._create_attributes_from_graph(graph)

        return self.touched_meta_items

    def write_to_db(self, rdf_url, overwrite=False):
        """Parse a schema from ``rdf_url`` and persist it, optionally
        overwriting items that already exist in the database."""
        self.overwrite_db_objects = overwrite

        graph = self._create_graph_from_url(rdf_url)

        self.schema = self._create_schema_from_graph(graph)
        self._create_objects_from_graph(graph)
        self._create_object_references_from_graphV2(graph)
        self._create_attributes_from_graph(graph)

    def _create_graph_from_url(self, rdf_url):
        """Build an rdflib Graph from a url, a .ttl/.xml file path, or a
        file-like object.

        Raises on unparsable files; re-raises URLError for unreachable
        urls.  Known self-hosted urls are remapped via self.selfhosted.
        """
        g = Graph()

        # register text/plain so the parser accepts plain ttl responses
        register(
            # 'text/rdf+n3', Parser,
            'text/plain',
            Parser,
            'rdflib.plugins.parsers.notation3',
            'N3Parser')

        is_file_like = hasattr(rdf_url, 'read')

        # file-like object, or a direct path to a ttl/xml file
        if is_file_like or rdf_url[-4:] in [".ttl", ".xml"]:
            try:
                # BUG fix: `".xml" in rdf_url` raised TypeError for
                # file-like objects, so every uploaded file failed.
                # Inspect the file's name (when available) instead.
                name = getattr(rdf_url, "name", "") if is_file_like \
                    else rdf_url
                # default is n3 (ttl); use xml autodetection for .xml
                fmt = None if ".xml" in str(name) else "n3"

                # make sure that the parser reads from the beginning;
                # not every file-like object is seekable
                try:
                    rdf_url.seek(0)
                except Exception:
                    pass

                g.parse(rdf_url, format=fmt)
                return g
            except Exception as e:
                raise Exception(
                    "could not load specified file as a graph.") from e

        # cant load raw github ttl if the format is not set explicitly
        fmt = "n3" if ".ttl" in rdf_url else None

        if rdf_url in self.selfhosted:
            rdf_url = self.selfhosted[rdf_url]
        try:
            g.parse(rdf_url, format=fmt)
        except URLError as e:
            # BUG fix: removed the unreachable `return None` after raise
            print("could not fetch schema from url: " + rdf_url)
            raise e
        return g

    def _validate_dependencies(self, g):
        """Return the namespaces referenced by graph ``g`` for which no
        schema is known locally."""
        missing = []
        for _prefix, ns in g.namespaces():
            ns = str(ns)
            if not self._validate_namespace(ns):
                missing.append(ns)
        return missing

    def _create_schema_from_graph(self, g):
        """Locate the schema subject in graph ``g`` and fetch or create the
        corresponding Schema row.  Returns the Schema, or None when no
        schema subject can be identified.

        TODO: identify rdf_url from the graph instead -- in some cases the
        url is not the same as the uri.
        """
        # try progressively weaker type keys to locate the schema subject
        get_schema_keys = [OWL.Ontology, RDFS.Class, RDF.Property, None]

        schema_subject = None
        for key in get_schema_keys:
            # next(..., None) instead of catching StopIteration
            triple = next(g.triples((None, RDF.type, key)), None)
            if triple is not None:
                schema_subject = triple[0]
                break

        if schema_subject is None:
            return None
        rdf_url = str(schema_subject)

        label = "Not available"
        description = "Not available"

        # first matching label/description predicate wins; the subject of
        # that triple also becomes the url used for the db lookup
        for key in (DC.title, RDFS.label, DCTERMS.title):
            triple = next(g.triples((schema_subject, key, None)), None)
            if triple is not None:
                rdf_url, _, label = triple
                break

        for key in (DC.description, DCTERMS.description):
            triple = next(g.triples((schema_subject, key, None)), None)
            if triple is not None:
                rdf_url, _, description = triple
                break

        # only create if it does not exist already
        try:
            self.schema = Schema.objects.get(url=str(rdf_url))
        except Exception:
            self.schema = Schema(
                label=self.standardize_string(label, remove_version=True),
                url=str(rdf_url),
                description=str(description))
            if self.save_to_db:
                self.schema.save()
        return self.schema

    def _create_objects_from_graph(self, g):
        """Create an Object for every RDFS.Class subject in graph ``g``
        whose defining schema is already in the database."""
        for s, _p, _o in g.triples((None, None, RDFS.Class)):

            # mandatory properties; skip this subject if either is missing
            try:
                # Property label
                label = next(g.triples((s, RDFS.label, None)))[2]
                # Property Class/domain
                schema_url = next(g.triples((s, RDFS.isDefinedBy, None)))[2]
            except StopIteration:
                continue

            # voluntary comment.  BUG fix: `comment` used to be left
            # unbound here (or silently reused from the previous loop
            # iteration) when the subject had no RDFS.comment.
            try:
                comment = next(g.triples((s, RDFS.comment, None)))[2]
            except StopIteration:
                comment = "could not find"

            try:
                self.schema = Schema.objects.get(url=schema_url)
            except Exception:
                # schema unknown locally: nothing to attach the object to
                continue

            self._try_create_item(
                Object(label=str(label),
                       description=str(comment),
                       schema=self.schema))

    def _create_object_references_from_graphV2(self, g):
        """Create ObjectRelations from graph ``g``.

        An object reference is in fact a property whose domain and range
        both point at objects (classes) -- possibly in different schemas.
        """
        for s, p, o in g.triples((None, None, RDF.Property)):

            try:
                # Property label
                label = next(g.triples((s, RDFS.label, None)))[2]
                # Property comment (optional)
                try:
                    comment = next(g.triples((s, RDFS.comment, None)))[2]
                except Exception:
                    comment = "could not find"

                # Property Class/domain
                domain = next(g.triples((s, RDFS.domain, None)))[2]
                # Property range
                o_range = next(g.triples((s, RDFS.range, None)))[2]

                # data properties are attributes, not relations -- skip
                # them here without any database lookups.
                # BUG fix: this check used to be a no-op `pass`.
                if o_range in self.valid_data_types:
                    continue

                from_schema_url, from_obj_label = self._split_rdfs_url(domain)
                to_schema_url, to_obj_label = self._split_rdfs_url(o_range)

                # resolve each side's schema, preferring the schema
                # currently being processed over a database lookup
                if self.schema.url == from_schema_url:
                    from_schema = self.schema
                else:
                    from_schema = Schema.objects.get(url=from_schema_url)
                if self.schema.url == to_schema_url:
                    to_schema = self.schema
                else:
                    to_schema = Schema.objects.get(url=to_schema_url)

                # standardize the labels to match what has been created
                from_obj_label = self.standardize_string(from_obj_label)
                to_obj_label = self.standardize_string(to_obj_label)

                # first try to find the objects in the created list
                from_object = next(
                    filter(lambda x: x.label == from_obj_label,
                           self.touched_meta_items), None)

                to_object = next(
                    filter(lambda x: x.label == to_obj_label,
                           self.touched_meta_items), None)

                # TODO: consider the case where 2 objects share a label
                if not from_object:
                    from_object = Object.objects.filter(
                        label=from_obj_label, schema=from_schema).first()
                if not to_object:
                    to_object = Object.objects.filter(
                        label=to_obj_label, schema=to_schema).first()

            # if no such pair of objects exists
            except Exception:
                continue
            if from_object and to_object:
                self._try_create_item(
                    ObjectRelation(from_object=from_object,
                                   to_object=to_object,
                                   label=label,
                                   schema=self.schema,
                                   description=comment))

    def _create_attributes_from_graph(self, g):
        """Create an Attribute for each data property in graph ``g`` whose
        range is a recognised data type and whose owning object exists."""
        for s, _p, _o in g.triples((None, None, RDF.Property)):

            try:
                # mandatory property facts; a missing one skips the subject
                label = next(g.triples((s, RDFS.label, None)))[2]
                comment = next(g.triples((s, RDFS.comment, None)))[2]
                domain = next(g.triples((s, RDFS.domain, None)))[2]
                rdf_range = next(g.triples((s, RDFS.range, None)))[2]

                _schema_url, obj_label = self._split_rdfs_url(domain)
                obj_label = self.standardize_string(obj_label)

                # prefer an object from the just-created items ...
                owner = next(
                    (item for item in self.touched_meta_items
                     if item.label == obj_label), None)

                # ... and fall back to the database
                if owner is None:
                    owner = Object.objects.filter(label=obj_label).first()
            except Exception:
                continue

            # only attach to a known object, and only for data-typed ranges
            if owner is None or rdf_range not in self.valid_data_types:
                continue

            self._try_create_item(
                Attribute(data_type=self.rdfs_to_att_type(rdf_range),
                          label=label,
                          object=owner))

    def _validate_namespace(self, namespace):
        """Return True when a Schema with this namespace url exists."""
        try:
            # BUG fix: was `Schema.objects.Get(...)` -- the manager method
            # is lowercase `.get()`, so this always raised AttributeError
            # and the method always returned False.
            Schema.objects.get(url=str(namespace))
        except Exception:
            return False
        return True

    def _split_rdfs_url(self, url):
        """Split an rdfs URIRef into (namespace_url, label).

        Tries a '#' split first, then falls back to splitting on the last
        '/'.  Returns None when `url` is not a URIRef or neither strategy
        yields a non-empty label.
        NOTE(review): callers therefore get either a 2-tuple or None --
        tuple-unpacking callers rely on their own try/except for the
        None case.
        """
        if not isinstance(url, (term.URIRef, URIRef)):
            return None

        # strategy 1: namespace#label; strategy 2: .../namespace/label
        methodlist = [
            lambda x: x.split("#"), lambda x:
            ("/".join(x.split("/")[:-1]) + "/", x.split("/")[-1])
        ]

        for method in methodlist:
            try:
                # a '#' split of a no-hash url yields a single element and
                # the unpack raises, moving on to the '/' strategy
                url, label = method(str(url))
                if label == "":
                    continue

                # normalise: namespace urls always end with a slash
                url += "/" if url[-1] != "/" else ""

                return url, label
            except Exception as e:
                pass
        return None