Пример #1
0
    def query(self, criteria, limit=20, offset=0):
        """
        Run a general edge query described by a dictionary of criteria.

        If `criteria` has a 'node' key, two JSONB values are built (one with
        node_forward=True, one with node_forward=False) and GIN_QUERY_2WAY is
        executed. Otherwise a single value is built and GIN_QUERY_1WAY is
        executed. `limit` and `offset` are passed through as query parameters.

        Returns a list holding `transform_for_linked_data(data)` for each
        fetched (uri, data, weight) row.
        """
        # Lazily open the database connection on first use.
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
        cursor = self.connection.cursor()

        if 'node' not in criteria:
            one_way = gin_jsonb_value(criteria)
            statement = GIN_QUERY_1WAY
            params = {'query': jsonify(one_way), 'limit': limit, 'offset': offset}
        else:
            forward = gin_jsonb_value(criteria, node_forward=True)
            backward = gin_jsonb_value(criteria, node_forward=False)
            statement = GIN_QUERY_2WAY
            params = {
                'query_forward': jsonify(forward),
                'query_backward': jsonify(backward),
                'limit': limit,
                'offset': offset,
            }
        cursor.execute(statement, params)

        # Only the data column of each row is used in the output.
        edges = []
        for _uri, data, _weight in cursor.fetchall():
            edges.append(transform_for_linked_data(data))
        return edges
Пример #2
0
 def random_edges(self, limit=20):
     """
     Fetch a collection of distinct, randomly-selected edges.

     `limit` is handed to RANDOM_QUERY as its 'limit' parameter. Returns a
     list holding `transform_for_linked_data(data)` for each fetched
     (uri, data, weight) row.
     """
     # Lazily open the database connection on first use.
     if self.connection is None:
         self.connection = get_db_connection(self.dbname)
     cur = self.connection.cursor()
     cur.execute(RANDOM_QUERY, {'limit': limit})

     edges = []
     for _uri, data, _weight in cur.fetchall():
         edges.append(transform_for_linked_data(data))
     return edges
Пример #3
0
 def random_edges(self, limit=20):
     """
     Return distinct, randomly-selected edges.

     Executes RANDOM_QUERY with `limit` bound to its 'limit' parameter and
     passes the data column of every fetched row through
     `transform_for_linked_data`.
     """
     if self.connection is None:
         # No connection yet: open one for this object's database.
         self.connection = get_db_connection(self.dbname)

     params = {'limit': limit}
     cursor = self.connection.cursor()
     cursor.execute(RANDOM_QUERY, params)
     rows = cursor.fetchall()
     return [transform_for_linked_data(data) for _uri, data, _weight in rows]
Пример #4
0
 def lookup_assertion(self, uri):
     """
     Look up a single assertion by its URI (one starting with /a/).

     Returns a list holding `transform_for_linked_data(data)` for every row
     of the `edges` table whose `uri` column equals the sanitized URI.
     """
     # Strip control characters such as \x00 before querying. The postgres
     # driver would drop \x00 itself, but doing it here avoids reporting a
     # server error when that happens.
     clean_uri = remove_control_chars(uri)

     if self.connection is None:
         self.connection = get_db_connection(self.dbname)
     cur = self.connection.cursor()
     cur.execute("SELECT data FROM edges WHERE uri=%(uri)s", {'uri': clean_uri})

     assertions = []
     for (data,) in cur.fetchall():
         assertions.append(transform_for_linked_data(data))
     return assertions
Пример #5
0
 def lookup_assertion(self, uri):
     """
     Get the assertion stored under a URI starting with /a/.

     The URI is sanitized, then matched against the `uri` column of the
     `edges` table. Each matching row's data is passed through
     `transform_for_linked_data`, and the results are returned as a list.
     """
     # Control characters such as \x00 are removed up front: the postgres
     # driver would strip \x00 anyway, but sanitizing here avoids reporting
     # a server error when that happens.
     uri = remove_control_chars(uri)
     if self.connection is None:
         self.connection = get_db_connection(self.dbname)

     statement = "SELECT data FROM edges WHERE uri=%(uri)s"
     cursor = self.connection.cursor()
     cursor.execute(statement, {'uri': uri})
     rows = cursor.fetchall()
     return [transform_for_linked_data(data) for (data,) in rows]
Пример #6
0
 def sample_dataset(self, uri, limit=50, offset=0):
     """
     Fetch a subsample of the edges that belong to a particular dataset.

     The sanitized dataset URI is JSON-encoded and bound to DATASET_QUERY
     together with `limit` and `offset`. Returns a list holding
     `transform_for_linked_data(data)` for each fetched (uri, data) row.
     """
     clean_uri = remove_control_chars(uri)
     if self.connection is None:
         self.connection = get_db_connection(self.dbname)
     cur = self.connection.cursor()

     # The query receives the dataset as a JSON string literal.
     params = {
         'dataset': json.dumps(clean_uri),
         'limit': limit,
         'offset': offset,
     }
     cur.execute(DATASET_QUERY, params)

     edges = []
     for _edge_uri, data in cur.fetchall():
         edges.append(transform_for_linked_data(data))
     return edges
Пример #7
0
 def lookup(self, uri, limit=100, offset=0):
     """
     Return all the edges that include a certain URI.

     The URI's prefix decides how it is looked up:

     - '/c/' or 'http': queried as a 'node'
     - '/r/': queried as a 'rel'
     - '/s/': queried as a 'source'
     - '/d/': queried as a 'dataset'
     - '/a/': delegated to `lookup_assertion` (`limit` and `offset` are
       not used in this case)

     Raises ValueError, naming the offending URI, when no prefix matches.
     """
     if self.connection is None:
         self.connection = get_db_connection(self.dbname)
     if uri.startswith(('/c/', 'http')):
         criteria = {'node': uri}
     elif uri.startswith('/r/'):
         criteria = {'rel': uri}
     elif uri.startswith('/s/'):
         criteria = {'source': uri}
     elif uri.startswith('/d/'):
         criteria = {'dataset': uri}
     elif uri.startswith('/a/'):
         return self.lookup_assertion(uri)
     else:
         # Interpolate the URI so the message does not show a literal "%r".
         raise ValueError(
             "%r isn't a ConceptNet URI that can be looked up" % (uri,)
         )
     return self.query(criteria, limit, offset)
Пример #8
0
 def lookup(self, uri, limit=100, offset=0):
     """
     A query that returns all the edges that include a certain URI.

     URIs starting with '/a/' are handed to `lookup_assertion` (ignoring
     `limit` and `offset`). Otherwise the prefix picks the criterion used
     for `query`: '/c/' or 'http' -> 'node', '/r/' -> 'rel',
     '/s/' -> 'source', '/d/' -> 'dataset'.

     Raises ValueError, naming the offending URI, when no prefix matches.
     """
     if self.connection is None:
         self.connection = get_db_connection(self.dbname)

     # Assertion URIs are fetched directly rather than via a criteria query.
     if uri.startswith('/a/'):
         return self.lookup_assertion(uri)

     # The prefixes are mutually exclusive, so the order here is immaterial.
     prefix_to_criterion = (
         ('/c/', 'node'),
         ('http', 'node'),
         ('/r/', 'rel'),
         ('/s/', 'source'),
         ('/d/', 'dataset'),
     )
     for prefix, criterion in prefix_to_criterion:
         if uri.startswith(prefix):
             return self.query({criterion: uri}, limit, offset)

     # Interpolate the URI so the message does not show a literal "%r".
     raise ValueError(
         "%r isn't a ConceptNet URI that can be looked up" % (uri,)
     )
Пример #9
0
    def lookup_grouped_by_feature(self, uri, limit=20):
        """
        Run the query behind the browseable interface, grouping results by
        the 'feature' of the queried node that they describe.

        A feature is defined by the relation, the queried node, and the
        direction (incoming or outgoing).

        Returns a dict keyed by the first two columns of each row (as a
        tuple), mapping to the list of transformed edge dicts in that group.
        Each edge dict also gets an 'other' key naming the node judged to be
        the one that was not queried.
        """
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)

        def feature_key(row):
            # The first two columns of a row identify its feature.
            return tuple(row[:2])

        def with_other_node(row):
            _direction, _, edge = row

            # Heuristic for picking the 'other' node, i.e. the one that (in
            # most cases) didn't match the URI. When both endpoints start
            # with the given URI, the longer one wins: it is either a more
            # specific sense or a different, longer word.
            start, end = edge['start'], edge['end']
            if len(end) < len(start):
                shorter, longer = end, start
            else:
                shorter, longer = start, end
            edge['other'] = longer if shorter.startswith(uri) else shorter
            return edge

        cursor = self.connection.cursor()
        cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
        # groupby only merges adjacent rows with the same key.
        return {
            feature: [
                transform_for_linked_data(with_other_node(row)) for row in group
            ]
            for feature, group in itertools.groupby(cursor.fetchall(), feature_key)
        }
Пример #10
0
    def lookup_grouped_by_feature(self, uri, limit=20):
        """
        The query used by the browseable interface: edges touching `uri`,
        grouped by the 'feature' of the queried node that they describe.

        A feature is defined by the relation, the queried node, and the
        direction (incoming or outgoing).

        The result is a dict whose keys are tuples of each row's first two
        columns and whose values are lists of transformed edge dicts, each
        given an extra 'other' entry.
        """
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)

        def annotate(row):
            _direction, _, data = row

            # Rough way to decide which endpoint is the 'other' node, the
            # one that (in most cases) didn't match the URI. If both start
            # with our URI, prefer the longer one, which is either a more
            # specific sense or a different, longer word.
            by_length = sorted([data['start'], data['end']], key=len)
            shorter = by_length[0]
            longer = by_length[1]
            if shorter.startswith(uri):
                data['other'] = longer
            else:
                data['other'] = shorter
            return data

        cursor = self.connection.cursor()
        cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
        fetched = cursor.fetchall()

        # Adjacent rows sharing the same first two columns form one group.
        grouped = {}
        for feature, members in itertools.groupby(fetched, lambda row: tuple(row[:2])):
            transformed = []
            for row in members:
                transformed.append(transform_for_linked_data(annotate(row)))
            grouped[feature] = transformed
        return grouped
Пример #11
0
    def query(self, criteria, limit=20, offset=0):
        """
        The most general way to query, driven by a dictionary of criteria.

        Criteria containing 'node' are turned into a forward and a backward
        JSONB value and run through GIN_QUERY_2WAY. All other criteria are
        turned into one value and run through GIN_QUERY_1WAY. `limit` and
        `offset` are bound as query parameters in both cases.

        Returns the list of `transform_for_linked_data(data)` results, one
        per fetched (uri, data, weight) row.
        """
        if self.connection is None:
            # Connect on first use.
            self.connection = get_db_connection(self.dbname)

        cursor = self.connection.cursor()
        paging = {'limit': limit, 'offset': offset}
        if 'node' in criteria:
            forward_value = gin_jsonb_value(criteria, node_forward=True)
            backward_value = gin_jsonb_value(criteria, node_forward=False)
            bindings = {
                'query_forward': jsonify(forward_value),
                'query_backward': jsonify(backward_value),
            }
            bindings.update(paging)
            cursor.execute(GIN_QUERY_2WAY, bindings)
        else:
            single_value = gin_jsonb_value(criteria)
            bindings = {'query': jsonify(single_value)}
            bindings.update(paging)
            cursor.execute(GIN_QUERY_1WAY, bindings)

        rows = cursor.fetchall()
        return [transform_for_linked_data(data) for _uri, data, _weight in rows]
Пример #12
0
 def connection(self):
     """
     Return this object's database connection, opening one when needed.

     A new connection is obtained from `get_db_connection(self.dbname)` when
     none has been stored yet, or when the stored one reports a `closed`
     value greater than zero. The connection is stored in
     `self._connection` and returned.
     """
     # NOTE(review): the body reads `self`, so the method must accept it; the
     # original signature took no parameters. Presumably this is used as a
     # property or method of the enclosing class -- confirm against callers.
     # See https://www.psycopg.org/docs/connection.html#connection.closed
     if self._connection is None or self._connection.closed > 0:
         self._connection = get_db_connection(self.dbname)
     return self._connection
Пример #13
0
def complex_concept_load(N=6):
    """
    Collect well-connected concepts and their edges, and pickle the results.

    Steps:

    1. Select every `start_id` and every `end_id` in `edges` that occurs in
       more than `N` rows, and union the two sets of result rows.
    2. Look up the `uri` in `nodes` for each selected id.
    3. For each selected id, fetch (relation uri, start uri, end uri, weight)
       for all edges with weight >= 1 that start or end at that id.
    4. Dump the id set, the URI list and the edge list to "resset.pickle",
       "concept_uris.pickle.pickle" and "ultimate_edge_list.pickle" in the
       current working directory.

    Progress is printed to stdout.

    Parameters:
        N: an id is kept when it appears in more than `N` edges on the start
           side or on the end side.

    Returns:
        The set of selected ids. Each element is a row tuple as returned by
        the cursor, so the id itself is element 0.

    NOTE(review): values are bound with DB-API 'format'/'pyformat'
    placeholders (%s / %(name)s), as psycopg2 accepts. Confirm that
    `get_db_connection` returns such a connection.
    """
    connection = get_db_connection(None)
    cursor = connection.cursor()

    # Values are passed as bound parameters instead of being concatenated
    # into the SQL text.
    left_edges_query = '''
    select edges.start_id
    from edges
    group by edges.start_id
    having count(*) > %s;'''
    right_edges_query = '''
    select edges.end_id
    from edges
    group by edges.end_id
    having count(*) > %s;'''

    cursor.execute(left_edges_query, (N,))
    left_ids = set(cursor.fetchall())
    cursor.execute(right_edges_query, (N,))
    right_ids = set(cursor.fetchall())
    resset = left_ids.union(right_ids)
    print(len(resset))

    print("Finding concept names")
    conceptname_query = '''
    select uri
    from nodes
    where nodes.id = %s;'''
    concept_uris = []
    for concept in resset:
        cursor.execute(conceptname_query, (concept[0],))
        res = cursor.fetchall()
        concept_uris.append(res[0][0])

    print("Finding edges")
    relation_query = '''
    select relations.uri,s.uri,v.uri,t.weight
    from
    ((select distinct edges.id
    from edges
    where edges.weight>=1 and (edges.start_id=%(concept_id)s OR
    edges.end_id=%(concept_id)s)) uids
    inner join edges
    on edges.id=uids.id) t
    inner join relations on relations.id=t.relation_id
    inner join nodes as s on s.id = t.start_id
    inner join nodes as v on v.id = t.end_id
    ;'''
    the_ultimate_edge_list = []
    for idx, concept in enumerate(resset):
        # Report progress every 10000 concepts.
        if idx % 10000 == 0:
            print(idx)
        cursor.execute(relation_query, {'concept_id': concept[0]})
        the_ultimate_edge_list.extend(cursor.fetchall())

    print("Dumping things.")
    # Context managers make sure each file is flushed and closed.
    with open("resset.pickle", "wb") as out:
        pickle.dump(resset, out)
    # The doubled ".pickle" suffix is kept as-is, since existing readers may
    # rely on this exact file name.
    with open("concept_uris.pickle.pickle", "wb") as out:
        pickle.dump(concept_uris, out)
    with open("ultimate_edge_list.pickle", 'wb') as out:
        pickle.dump(the_ultimate_edge_list, out)
    print("Done")
    return resset