示例#1
0
class PropertiesAll(RDFStatInterface):
    """Count property usage: total occurrences plus distinct triples,
    distinct subjects and distinct objects per property."""

    def __init__(self, results):
        super(PropertiesAll, self).__init__(results)
        # per-property counters, shared with the results dict
        self.histogram = self.results['histogram'] = {}
        self.distinct = self.results['distinct'] = {}
        self.distinct_subject = self.results['distinct_subject'] = {}
        self.distinct_object = self.results['distinct_object'] = {}
        # bounded cache of already-seen (s,p) / (p,o) keys
        self.distinct_seen = LimitedSizeDict(size_limit=300000) # FIXME: make limit configurable

    def _first_occurrence(self, key):
        """Return True exactly once per key, remembering it in the bounded
        cache; keys longer than 16 chars are md5-compressed to keep the
        cache small."""
        if len(key) > 16:
            key = hashlib.md5(key).digest()
        if key in self.distinct_seen:
            return False
        self.distinct_seen[key] = 1
        return True

    def count(self, s, p, o, s_blank, o_l, o_blank, statement):
        """Update all per-property counters for one triple (s, p, o)."""
        # total occurrences of the property
        self.histogram[p] = self.histogram.get(p, 0) + 1
        # distinct triples per property (global spo cache lives in dh)
        spo = s + p + o
        if not dh.query_distinct_spo(spo, 0):
            dh.set_distinct_spo(spo, 0)
            self.distinct[p] = self.distinct.get(p, 0) + 1
        # distinct subjects per property
        if self._first_occurrence(s + p):
            self.distinct_subject[p] = self.distinct_subject.get(p, 0) + 1
        # distinct objects per property
        if self._first_occurrence(p + o):
            self.distinct_object[p] = self.distinct_object.get(p, 0) + 1

    def voidify(self, void_model, dataset):
        """Append the collected statistics to void_model as VoID triples."""
        # overall number of distinct properties
        result_node = RDF.Node(literal=str(len(self.histogram)), datatype=ns_xs.integer.uri)
        void_model.append(RDF.Statement(dataset, ns_void.properties, result_node))
        # one property partition per property
        # NOTE: iteritems()/has_key() (Python 2 only) replaced with the
        # items()/'in' forms, which behave identically on Python 2 and 3
        for property_uri, result in self.distinct.items():
            pr_id = RDF.Node()
            void_model.append(RDF.Statement(dataset, ns_void.propertyPartition, pr_id))
            void_model.append(RDF.Statement(pr_id, ns_void.property, RDF.Uri(property_uri)))
            result_node = RDF.Node(literal=str(result), datatype=ns_xs.integer.uri)
            void_model.append(RDF.Statement(pr_id, ns_void.triples, result_node))
            if property_uri in self.distinct_subject:
                s_result = self.distinct_subject[property_uri]
                result_node = RDF.Node(literal=str(s_result), datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(pr_id, ns_void.distinctSubjects, result_node))
            if property_uri in self.distinct_object:
                o_result = self.distinct_object[property_uri]
                result_node = RDF.Node(literal=str(o_result), datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(pr_id, ns_void.distinctObjects, result_node))

    def sparql(self, endpoint):
        # no SPARQL-based implementation for this statistic
        pass
示例#2
0
 def __init__(self, results):
     """Create the shared per-property counter dicts and the seen-cache."""
     super(PropertiesAll, self).__init__(results)
     # expose each counter both as an attribute and under the same key
     # inside self.results (one shared dict per key)
     for key in ('histogram', 'distinct', 'distinct_subject', 'distinct_object'):
         counter = {}
         self.results[key] = counter
         setattr(self, key, counter)
     # bounded cache of already-seen hashes
     self.distinct_seen = LimitedSizeDict(size_limit=300000) # FIXME: make limit configurable
示例#3
0
class PropertiesAll(RDFStatInterface):
    """Count property usage and literal value ranges.

    Tracks, per property: total occurrences, distinct triples, distinct
    subjects, distinct objects, and the minimum/maximum literal value for
    numeric and date-typed objects.  voidify() serialises everything as
    VoID / dstats triples.
    """

    # plain literals of these geo properties are treated as floats even
    # without an explicit numeric datatype
    _NUMERIC_PROPERTIES = ('http://www.w3.org/2003/01/geo/wgs84_pos#long',
                           'http://www.w3.org/2003/01/geo/wgs84_pos#lat',
                           'http://www.w3.org/2003/01/geo/wgs84_pos#alt')

    def __init__(self, results):
        super(PropertiesAll, self).__init__(results)
        # per-property counters, shared with the results dict
        self.histogram = self.results['histogram'] = {}
        self.distinct = self.results['distinct'] = {}
        self.distinct_subject = self.results['distinct_subject'] = {}
        self.distinct_object = self.results['distinct_object'] = {}
        self.min_value = self.results['min_value'] = {}
        self.max_value = self.results['max_value'] = {}
        # bounded cache of already-seen (s,p) / (p,o) keys
        self.distinct_seen = LimitedSizeDict(size_limit=300000) # FIXME: make limit configurable

    def _parse_literal(self, p, o, statement):
        """Return o as a comparable value (float, int or date string), or
        None when the literal's datatype is not one we track.

        statement.object.literal[2] holds the literal's datatype URI
        (Redland literal tuple)."""
        datatype = str(statement.object.literal[2])
        if datatype in (str(ns_xs.decimal), str(ns_xs.float), str(ns_xs.double)) \
           or p in self._NUMERIC_PROPERTIES:
            return float(o)
        if datatype in (str(ns_xs.int), str(ns_xs.integer)):
            return int(o)
        if datatype in (str(ns_xs.dateTime), str(ns_xs.date)):
            # kept as a string; lexicographic min/max is valid for ISO dates
            return o
        return None

    def _track_value(self, p, value):
        """Fold value into the running min/max for property p."""
        if p in self.min_value:
            self.min_value[p] = min(self.min_value[p], value)
        else:
            self.min_value[p] = value
        if p in self.max_value:
            self.max_value[p] = max(self.max_value[p], value)
        else:
            self.max_value[p] = value

    def count(self, s, p, o, s_blank, o_l, o_blank, statement):
        """Update all counters for one triple (s, p, o)."""
        # total occurrences of the property
        self.histogram[p] = self.histogram.get(p, 0) + 1
        # distinct triples per property (global spo cache lives in dh)
        spo = s + p + o
        if not dh.query_distinct_spo(spo, 0):
            dh.set_distinct_spo(spo, 0)
            self.distinct[p] = self.distinct.get(p, 0) + 1
        # distinct subjects per property; long keys are md5-compressed
        sp = s + p
        sp_hash = hashlib.md5(sp).digest() if len(sp) > 16 else sp
        if sp_hash not in self.distinct_seen:
            self.distinct_seen[sp_hash] = 1
            self.distinct_subject[p] = self.distinct_subject.get(p, 0) + 1
            # NOTE(review): min/max are only sampled the first time an
            # (s, p) pair is seen -- preserved from the original logic
            if o_l:
                value = self._parse_literal(p, o, statement)
                if value is not None:
                    self._track_value(p, value)
        # distinct objects per property
        po = p + o
        po_hash = hashlib.md5(po).digest() if len(po) > 16 else po
        if po_hash not in self.distinct_seen:
            self.distinct_seen[po_hash] = 1
            self.distinct_object[p] = self.distinct_object.get(p, 0) + 1

    def _typed_node(self, value):
        """Build an RDF literal node for a tracked min/max value, or None
        for an unexpected type.

        BUGFIX: the original fell through without assigning result_node for
        unexpected types, appending a stale node from a previous iteration.
        Float before int on purpose: the original checked in this order."""
        if isinstance(value, float):
            return RDF.Node(literal=str(value), datatype=ns_xs.decimal.uri)
        if isinstance(value, int):
            return RDF.Node(literal=str(value), datatype=ns_xs.integer.uri)
        if isinstance(value, str):
            return RDF.Node(literal=value, datatype=ns_xs.dateTime.uri)
        return None

    def voidify(self, void_model, dataset):
        """Append the collected statistics to void_model as VoID triples."""
        # overall number of distinct properties
        result_node = RDF.Node(literal=str(len(self.histogram)), datatype=ns_xs.integer.uri)
        void_model.append(RDF.Statement(dataset, ns_void.properties, result_node))
        # one property partition per property
        for property_uri, result in self.distinct.items():
            pr_id = RDF.Node()
            void_model.append(RDF.Statement(dataset, ns_void.propertyPartition, pr_id))
            void_model.append(RDF.Statement(pr_id, ns_void.property, RDF.Uri(property_uri)))
            result_node = RDF.Node(literal=str(result), datatype=ns_xs.integer.uri)
            void_model.append(RDF.Statement(pr_id, ns_void.triples, result_node))
            if property_uri in self.distinct_subject:
                s_result = self.distinct_subject[property_uri]
                result_node = RDF.Node(literal=str(s_result), datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(pr_id, ns_void.distinctSubjects, result_node))
            if property_uri in self.distinct_object:
                o_result = self.distinct_object[property_uri]
                result_node = RDF.Node(literal=str(o_result), datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(pr_id, ns_void.distinctObjects, result_node))
            if property_uri in self.min_value:
                node = self._typed_node(self.min_value[property_uri])
                if node is not None:
                    void_model.append(RDF.Statement(pr_id, ns_dstats.minValue, node))
            if property_uri in self.max_value:
                node = self._typed_node(self.max_value[property_uri])
                if node is not None:
                    void_model.append(RDF.Statement(pr_id, ns_dstats.maxValue, node))

    def sparql(self, endpoint):
        # no SPARQL-based implementation for this statistic
        pass
示例#4
0
LODStats is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with LODStats.  If not, see <http://www.gnu.org/licenses/>.
"""
import bitarray
from hashlib import md5
from LimitedSizeDict import LimitedSizeDict

# FIXME: does it help to build some small "md5-cache" for the last 1, 2, 3 strings?!

# subjects
# Bounded cache keyed by the subject string (md5-hashed when long); each
# value is indexable per counter id -- see the index meaning below.
distinct_subjects = LimitedSizeDict(size_limit=300000) # FIXME: make limit configurable
# counter ids: 0: entities, 1: typed subjects, 2: labeled subjects

def query_distinct_subject(s, num_id):
    """Return the stored flag num_id for subject s, or False if unseen.

    Subjects longer than 16 characters are keyed by their md5 digest to
    keep cache keys small; shorter ones are used verbatim (mirrors
    set_distinct_subject).  Uses 'in' instead of the Python-2-only
    has_key() -- identical behaviour, Python 3 compatible.
    """
    if len(s) > 16:
        s_hash = md5(s).digest()
    else:
        s_hash = s
    if s_hash in distinct_subjects:
        return distinct_subjects[s_hash][num_id]
    return False
        
def set_distinct_subject(s, num_id):
    if len(s) > 16:
        s_hash = md5(s).digest()