Exemplo n.º 1
0
class PropertySearchTestCase(unittest.TestCase):
    """Checks that PropertySearchDbpediaSparql returns expected property URIs.

    Every test runs one of the search methods for a subject/object pair and
    asserts that a specific property URI is contained in the result.
    """

    def setUp(self):
        self.propertySearch = PropertySearchDbpediaSparql()

    def _assertPropertyFound(self, expectedProperty, foundProperties):
        """Fail unless ``expectedProperty`` is contained in ``foundProperties``."""
        failureMessage = "Property %s should be in %s" % (expectedProperty, foundProperties)
        self.assertTrue(expectedProperty in foundProperties, msg=failureMessage)

    def testUriUri(self):
        # Subject and object are both resource URIs.
        subject = "http://dbpedia.org/resource/Batman_&_Robin_(film)"
        target = "http://dbpedia.org/resource/Akiva_Goldsman"
        found = self.propertySearch.search(subject, target)
        self._assertPropertyFound("http://dbpedia.org/ontology/writer", found)

    def testUriLiteral(self):
        # The same subject is checked against two literal values in turn.
        subject = "http://dbpedia.org/resource/Batman_&_Robin_(film)"
        expectations = (
            ("Batman & Robin", "http://xmlns.com/foaf/0.1/name"),
            ("English", "http://dbpedia.org/property/language"),
        )
        for literal, expectedProperty in expectations:
            found = self.propertySearch.search(subject, literal)
            self._assertPropertyFound(expectedProperty, found)

    def testUriLiteralRegex(self):
        subject = "http://dbpedia.org/resource/Austria"
        literal = "101.4"
        found = self.propertySearch.uriLiteralRegex(subject, literal)
        self._assertPropertyFound(
            "http://dbpedia.org/ontology/PopulatedPlace/populationDensity", found
        )

    def testUriLiteralPathRegex(self):
        subject = "http://dbpedia.org/resource/Austria"
        literal = "2004"
        found = self.propertySearch.uriLiteralPathRegex(subject, literal)
        self._assertPropertyFound("http://dbpedia.org/ontology/leader", found)

    def testLiteralUriReversePathRegex(self):
        subject = "http://dbpedia.org/resource/Austria"
        literal = "2004"
        found = self.propertySearch.literalUriReversePathRegex(subject, literal)
        self._assertPropertyFound("http://dbpedia.org/property/venue", found)
Exemplo n.º 2
0
    def findRelation(self, columnValue1, columnValue2, entities1, entities2):
        """Collect the distinct properties relating two table cells.

        The kind of search depends on which cells have candidate entities:

        * both cells have entities: every entity pair is looked up with
          ``uriUriSearch``;
        * only the first cell has entities: each of them is looked up against
          the literal ``columnValue2`` with ``uriLiteralSearch``;
        * only the second cell has entities: each of them is looked up against
          the literal ``columnValue1`` with ``uriLiteralSearch``;
        * neither has entities: both are literals and nothing is searched.

        Returns a list of unique properties; its order is unspecified because
        duplicates are removed through a set.
        """
        propertySearch = PropertySearchDbpediaSparql()
        properties = set()

        hasEntities1 = len(entities1) > 0
        hasEntities2 = len(entities2) > 0

        # The "both sides" case has to be tested first: it used to sit behind
        # the single-sided checks and could therefore never be reached.
        if hasEntities1 and hasEntities2:
            for entity1 in entities1:
                for entity2 in entities2:
                    properties.update(propertySearch.uriUriSearch(entity1, entity2))
        elif hasEntities1:
            for entity1 in entities1:
                properties.update(propertySearch.uriLiteralSearch(entity1, columnValue2))
        elif hasEntities2:
            for entity2 in entities2:
                properties.update(propertySearch.uriLiteralSearch(entity2, columnValue1))
        # else: both are literals, do nothing

        return list(properties)
Exemplo n.º 3
0
 def setUp(self):
     """Attach a new PropertySearchDbpediaSparql instance as self.propertySearch."""
     searcher = PropertySearchDbpediaSparql()
     self.propertySearch = searcher
Exemplo n.º 4
0
 def __init__(self):
     """Create the logger, entity identifier, SPARQL client and property search."""
     self.logger = Logger().getLogger(__name__)
     self.agdistisIdentifier = AgdistisIdentifier()

     # Configure the SPARQL client for JSON results before storing it.
     sparqlClient = SPARQLWrapper(dbpediaSparqlEndpointUri)
     sparqlClient.setReturnFormat(JSON)
     self.dbpediaSparql = sparqlClient

     self.propertySearch = PropertySearchDbpediaSparql()
Exemplo n.º 5
0
class SimplePropertyMapper(object):
    """Finds properties linking a table's subject column to its other columns.

    The pipeline (see ``getScores`` / ``mapProperties``) loads the cached
    entities of a table, looks up their classes, keeps only subject-column
    entities that carry the most frequent class, searches properties between
    those entities and the remaining cells of the same row, and finally
    aggregates the properties per column.

    Intermediate results are cached as pickle files inside ``cacheFolder``.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistisIdentifier = AgdistisIdentifier()
        self.dbpediaSparql = SPARQLWrapper(dbpediaSparqlEndpointUri)
        self.dbpediaSparql.setReturnFormat(JSON)
        self.propertySearch = PropertySearchDbpediaSparql()

    def _loadCache(self, path):
        """Return the object pickled in the file at ``path``."""
        # NOTE(security): pickle can execute arbitrary code while loading, so
        # the cache folder must only ever contain files written by this code.
        with open(path, "rb") as cacheFile:
            return pickle.load(cacheFile)

    def _storeCache(self, data, path):
        """Pickle ``data`` into the file at ``path``."""
        with open(path, "wb") as cacheFile:
            pickle.dump(data, cacheFile)

    def parseResults(self, results, variableName="property"):
        """Return ``result[variableName]["value"]`` for every result binding.

        TODO: refactor into a separate class.
        """
        return [result[variableName]["value"] for result in results]

    def getClassForEntity(self, entity):
        """Return the distinct classes (``a ?class``) of the entity URI.

        The URI is interpolated into the query text as-is, so it must be a
        trusted, well-formed URI.

        TODO: refactor into a separate class.
        """
        self.dbpediaSparql.setQuery(
            u"""
            SELECT DISTINCT ?class
            WHERE {
                <%s> a ?class .
            }
        """
            % (entity,)
        )
        results = self.dbpediaSparql.query().convert()["results"]["bindings"]
        return self.parseResults(results, variableName="class")

    def getEntities(self, tableId):
        """Load the cached entities structure of the table.

        Raises:
            EntitiesDataStructureNotFound: if the cache file does not exist.
        """
        entitiesCacheFile = os.path.join(cacheFolder, tableId + ".entities.cache")
        if not os.path.exists(entitiesCacheFile):
            raise EntitiesDataStructureNotFound(
                "Entities data structure not available. Did you run subject column identification?"
            )
        return self._loadCache(entitiesCacheFile)

    def getEntitiesWithClasses(self, tableId):
        """Return the table entities with each one replaced by ``(classes, entity)``.

        The result is read from its own cache file when present; otherwise the
        classes are queried per entity and the result is written to that cache.
        The plain entities cache is loaded in both cases, so a missing one
        raises ``EntitiesDataStructureNotFound`` either way.
        """
        entities = self.getEntities(tableId)
        entitiesWithClassesCache = os.path.join(cacheFolder, tableId + ".entities.with.classes.cache")
        if os.path.exists(entitiesWithClassesCache):
            return self._loadCache(entitiesWithClassesCache)

        for entityRow in entities:
            for cellEntities in entityRow:
                for entityIndex, entityUrl in enumerate(cellEntities):
                    cellEntities[entityIndex] = (self.getClassForEntity(entityUrl), entityUrl)
        self._storeCache(entities, entitiesWithClassesCache)
        return entities

    def getClasses(self, entities, numberOfColumns):
        """Group the class lists of all entities by column index.

        Returns a list with ``numberOfColumns`` entries; entry ``i`` holds one
        class list per entity found in column ``i``.
        """
        # One independent list per column. ``[[]] * n`` would repeat a single
        # shared list and mix the classes of all columns together.
        classes = [[] for _ in range(numberOfColumns)]
        for entityRow in entities:
            for columnIndex, cellEntities in enumerate(entityRow):
                for (_class, entityUrl) in cellEntities:
                    try:
                        classes[columnIndex].append(_class)
                    except IndexError as e:
                        # The row has more columns than numberOfColumns:
                        # report it and carry on with the remaining cells.
                        print("%s" % (str(e),))
        return classes

    def getMainClassForSubjectColumn(self, classes, subjectColumn):
        """Return the most frequent class of the subject column, or "" if none."""
        classesSubjectColumn = [item for sublist in classes[subjectColumn] for item in sublist]
        mostCommon = Counter(classesSubjectColumn).most_common(1)
        if not mostCommon:
            self.logger.debug("Main class could not be identified")
            return ""
        return mostCommon[0][0]

    def filterNonMainClassEntities(self, entities, mainClass, subjectColumn):
        """Blank out subject-column entities that do not carry ``mainClass``.

        Such entities are replaced in place by ``(None, None)``; other columns
        are left untouched. The (mutated) ``entities`` structure is returned.
        """
        for entityRow in entities:
            for columnIndex, cellEntities in enumerate(entityRow):
                if columnIndex != subjectColumn:
                    continue
                for entityIndex, (_class, entityUrl) in enumerate(cellEntities):
                    if mainClass not in _class:
                        cellEntities[entityIndex] = (None, None)
        return entities

    def findProperties(self, tableId, tableData, entities, subjectColumn, nonSubjectColumns):
        """Search properties between subject entities and the other cells of a row.

        Returns a mapping ``rowIndex -> columnIndex -> properties``. The result
        is read from the property cache file when present and written to it
        otherwise. When a subject cell has several entities with a URL, the
        search result of the last one is the one kept for that row.
        """
        propertyCache = os.path.join(cacheFolder, tableId + ".property.star.cache")
        if os.path.exists(propertyCache):
            return self._loadCache(propertyCache)

        properties = collections.defaultdict(dict)
        for rowIndex, entityRow in enumerate(entities):
            for columnIndex, cellEntities in enumerate(entityRow):
                if columnIndex != subjectColumn:
                    continue
                for (_class, entityUrl) in cellEntities:
                    if entityUrl is None:
                        # Entity was filtered out (see filterNonMainClassEntities).
                        continue
                    for nonSubjectColumn in nonSubjectColumns:
                        cellValue = tableData[rowIndex][nonSubjectColumn]
                        properties[rowIndex][nonSubjectColumn] = self.propertySearch.uriLiteralSearch(
                            entityUrl, cellValue
                        )
        self._storeCache(properties, propertyCache)
        return properties

    def aggregateProperties(self, properties, nonSubjectColumns):
        """Concatenate, per column, the properties found in every row.

        Returns a mapping ``columnIndex -> flat list of properties`` in row
        iteration order, duplicates included.
        """
        propertiesAggregate = collections.defaultdict(dict)
        for nonSubjectColumn in nonSubjectColumns:
            propertiesAggregate[nonSubjectColumn] = [
                item for row in properties for item in properties[row][nonSubjectColumn]
            ]
        return propertiesAggregate

    def getTopProperties(self, propertiesAggregate, nonSubjectColumns, threshold):
        """Return the most frequent property of each column if supported enough.

        A column contributes ``{"uri": ..., "columnIndex": ...}`` when its most
        frequent property accounts for more than ``threshold`` percent of all
        properties collected for that column.
        """
        topProperties = []
        for nonSubjectColumn in nonSubjectColumns:
            columnProperties = propertiesAggregate[nonSubjectColumn]
            mostCommon = Counter(columnProperties).most_common(1)
            if not mostCommon:
                self.logger.debug("No property identified for column %s" % (nonSubjectColumn))
                continue
            (topProperty, occurrences) = mostCommon[0]
            # In percents
            support = (float(occurrences) / len(columnProperties)) * 100
            if support > threshold:
                topProperties.append({"uri": topProperty, "columnIndex": nonSubjectColumn})
        return topProperties

    def calculateScores(self, propertiesAggregate, nonSubjectColumns):
        """Return a mapping ``columnIndex -> Counter`` of property occurrences."""
        scores = collections.defaultdict(dict)
        for nonSubjectColumn in nonSubjectColumns:
            scores[nonSubjectColumn] = Counter(propertiesAggregate[nonSubjectColumn])
        return scores

    def _collectAggregatedProperties(self, table):
        """Run the shared part of ``getScores`` and ``mapProperties``.

        Returns ``(propertiesAggregate, nonSubjectColumns)``, or ``None`` when
        the table has no subject column or contains no rows.
        """
        subjectColumn = table.subjectColumn
        if subjectColumn is None or subjectColumn == -1:
            return None

        tableData = table.getData()
        if len(tableData) == 0:
            # Without rows there is no column count to work with.
            return None

        tableId = table.id
        numberOfColumns = len(tableData[0])
        # list(...) keeps this working where range() is not a list.
        nonSubjectColumns = list(range(0, numberOfColumns))
        nonSubjectColumns.remove(subjectColumn)

        self.logger.debug("Identifying properties for a table %s" % (tableId))

        entities = self.getEntitiesWithClasses(tableId)
        classes = self.getClasses(entities, numberOfColumns)
        mainClass = self.getMainClassForSubjectColumn(classes, subjectColumn)
        entities = self.filterNonMainClassEntities(entities, mainClass, subjectColumn)
        properties = self.findProperties(tableId, tableData, entities, subjectColumn, nonSubjectColumns)
        propertiesAggregate = self.aggregateProperties(properties, nonSubjectColumns)
        return (propertiesAggregate, nonSubjectColumns)

    def getScores(self, table, rowsToDisambiguate=20, threshold=10, support=0, connectivity=0):
        """Return per-column property occurrence counts for the table.

        Returns ``[]`` when the table has no subject column (``None`` or -1)
        or no rows. ``rowsToDisambiguate``, ``threshold``, ``support`` and
        ``connectivity`` are accepted but not used here.
        """
        aggregated = self._collectAggregatedProperties(table)
        if aggregated is None:
            return []
        (propertiesAggregate, nonSubjectColumns) = aggregated
        return self.calculateScores(propertiesAggregate, nonSubjectColumns)

    def mapProperties(self, table, rowsToDisambiguate=20, threshold=10, support=0, connectivity=0):
        """Return the top property of each non-subject column of the table.

        Only properties whose support exceeds ``threshold`` percent are
        returned (see ``getTopProperties``). Returns ``[]`` when the table has
        no subject column (``None`` or -1) or no rows. ``rowsToDisambiguate``,
        ``support`` and ``connectivity`` are accepted but not used here.
        """
        aggregated = self._collectAggregatedProperties(table)
        if aggregated is None:
            return []
        (propertiesAggregate, nonSubjectColumns) = aggregated
        return self.getTopProperties(propertiesAggregate, nonSubjectColumns, threshold)