Example #1
def evaluate_subgraph_extraction(nhops,
                                 e_field,
                                 p_field,
                                 limit=None,
                                 show_errors=False):
    '''
    e_field, p_field <str> names of the fields in MongoDB to look up the IDs
    '''
    samples = mongo.get_sample(limit=limit)
    # iterate over the cursor
    accs = []
    for doc in samples:
        # get correct entities and predicates from the GS annotations
        e_ids = doc[e_field]
        p_uris = doc[p_field]

        # extract the subgraph
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(nhops, p_uris, namespace, True)
        entities, _, _ = kg.compute_hops(e_ids)
        kg.remove()

        # check if we hit the answer set
        if 'answers_ids' in doc:
            correct_answers_ids = set(doc['answers_ids'])
            #         print(correct_answers_ids)
            n_hits = len(correct_answers_ids & set(entities))
            # accuracy
            acc = float(n_hits) / len(correct_answers_ids)
            accs.append(acc)
            if show_errors and acc < 1:
                print(doc['question'])
                print(doc['entity_ids'])
                print(doc['predicate_uris'])
    return accs
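
A minimal usage sketch for the function above (the driver loop is hypothetical; it assumes the MongoDB documents carry the 'entity_ids' and 'predicate_uris' fields printed in the error branch):

# sketch: compare answer-set coverage of 1-hop vs. 2-hop subgraphs
import numpy as np

for nhops in [1, 2]:
    accs = evaluate_subgraph_extraction(nhops,
                                        e_field='entity_ids',
                                        p_field='predicate_uris',
                                        limit=100)
    print("%d hop(s): mean coverage %.2f over %d questions"
          % (nhops, np.mean(accs), len(accs)))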
Example #2
File: test.py  Project: svakulenk0/KBQA
def hop(activations,
        constraints,
        predicates_ids,
        verbose=False,
        _bool_answer=False,
        max_triples=500000):
    # extract the subgraph for the selected entities
    top_entities_ids = [_id for e in activations + constraints for _id in e]
    # exclude types predicate
    top_predicates_ids = [
        _id for p in predicates_ids for _id in p if _id != 68655
    ]

    # iteratively call the HDT API to retrieve all subgraph partitions
    activations = defaultdict(int)
    offset = 0
    while True:
        # get the subgraph for selected predicates only
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(1, top_predicates_ids, namespace, True)
        entities, predicate_ids, adjacencies = kg.compute_hops(
            top_entities_ids, max_triples, offset)
        kg.remove()

        if not entities:
            # filter out the answers by min activation scores
            if not _bool_answer and constraints:
                # normalize activations by checking the 'must' constraints: number of constraints * weights
                min_a = len(constraints) * 1
                if predicates_ids != top_predicates_ids:
                    min_a -= 1
            else:
                min_a = 0
            # return HDT ids of the activated entities
            return [
                a_id for a_id, a_score in activations.items()
                if a_score > min_a
            ]

        if verbose:
            print("Subgraph extracted:")
            print("%d entities" % len(entities))
            print("%d predicates" % len(predicate_ids))
            print("Loading adjacencies..")

        offset += max_triples
        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entities)}
        adj_shape = (len(entities), len(entities))
        # generate a list of adjacency matrices per predicate assuming the graph is undirected wo self-loops
        A = generate_adj_sp(adjacencies, adj_shape, include_inverse=True)

        # activations of entities and predicates
        e_ids = [
            entities_dict[entity_id] for entity_id in top_entities_ids
            if entity_id in entities_dict
        ]
        #     assert len(top_entities_ids) == len(e_ids)
        p_ids = [
            predicate_ids.index(p_id) for p_id in top_predicates_ids
            if p_id in predicate_ids
        ]
        #     assert len(top_predicates_ids) == len(p_ids)
        if p_ids:
            # graph activation vectors
            x = np.zeros(len(entities))
            x[e_ids] = 1
            p = np.zeros(len(predicate_ids))
            p[p_ids] = 1

            # slice A by the selected predicates and concatenate edge lists
            y = (x @ sp.hstack(A * p)).reshape(
                [len(predicate_ids), len(entities)]).sum(0)
            # check output size
            assert y.shape[0] == len(entities)

            # harvest activations
            top = np.argwhere(y > 0).T.tolist()[0]
            if len(top) > 0:
                activations1 = np.asarray(entities)[top]
                # store the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
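
The activation step above slices the per-predicate adjacency matrices and spreads entity activations one hop. A toy sketch of the same propagation on a three-node graph (illustration only, not project code):

# sketch: one-hop activation spreading over per-predicate adjacency matrices
import numpy as np
import scipy.sparse as sp

entities = [100, 200, 300]              # global entity ids
adjacencies = [[[0, 1]], [[1, 2]]]      # one edge list per predicate (local ids)
A = []
for edges in adjacencies:
    row, col = np.transpose(edges)
    A.append(sp.csr_matrix((np.ones(len(edges)), (row, col)), shape=(3, 3)))

x = np.array([1., 0., 0.])              # activate entity 100
p = np.array([1., 0.])                  # select the first predicate only

# slice A by the selected predicates and propagate along the edges
y = sum(w * adj for w, adj in zip(p, A)).T @ x
print(y)                                # [0. 1. 0.] -> entity 200 gets activated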
Example #3
    top_properties_ids = list(
        set([
            e_candidate['uri'] for e in top_properties.values()
            for e_candidate in e
        ]))
    top_p_scores = {
        e_candidate['id']: e_candidate['score']
        for e in top_properties.values() for e_candidate in e
    }
    n_e_activations = len(top_entities_ids)
    n_p_activations = len(top_properties_ids)

    # extract the subgraph
    kg = HDTDocument(hdt_path + hdt_file)
    kg.configure_hops(nhops, top_properties_ids, namespace, True)
    entities, predicate_ids, adjacencies = kg.compute_hops(top_entities_ids)
    kg.remove()

    if not max_x:
        max_x = len(entities)
    if not max_p:
        max_p = len(predicate_ids)

    # check if we hit the answer set
    correct_answers_ids = set(doc['answers_ids'])
    n_gs_answers = len(correct_answers_ids)
    n_hits = len(correct_answers_ids & set(entities))
    # accuracy
    acc = float(n_hits) / len(correct_answers_ids)
    accs.append(acc)
    # pick only the samples where we find the correct subgraph
Example #4
class KBQA():
    def __init__(self, dataset_name='lcquad'):
        '''
        Setup models, indices, embeddings and connection to the KG through the HDT API
        '''
        
        # connect to the entity and predicate catalogs
        self.e_index = IndexSearch('dbpedia201604e')
        self.p_index = IndexSearch('dbpedia201604p')

        # load embeddings
        self.word_vectors = load_embeddings(embeddings_path, embeddings_choice)
        self.p_vectors = load_embeddings(embeddings_path, 'fasttext_p_labels')
        
        # load pre-trained question type classification model
        with open(model_path+'qtype_lcquad_%s.pkl'%(embeddings_choice), 'rb') as f:
            self.model_settings = pkl.load(f)
        self.qt_model = build_qt_inference_model(self.model_settings)
        self.qt_model.load_weights(model_path+'_qtype_weights.best.hdf5', by_name=True)

        # load pre-trained question parsing model
        with open(model_path+'lcquad_%s.pkl'%(embeddings_choice), 'rb') as f:
            ep_model_settings = pkl.load(f)
        self.ep_model = build_ep_inference_model(ep_model_settings)
        # load weights
        # ep_model.load_weights('checkpoints/_'+modelname+'_weights.best.hdf5', by_name=True)
        self.ep_model.load_weights(model_path+'2hops-types.h5', by_name=True)

        # connect to the knowledge graph hdt file
        self.kg = HDTDocument(hdt_path+hdt_file)

    # functions for entity linking and relation detection
    def entity_linking(self, e_spans, verbose=False, cutoff=500, threshold=0): 
        guessed_ids = []
        for span in e_spans:
            span_ids = self.e_index.label_scores(span, top=cutoff, threshold=threshold, verbose=verbose, scale=0.3, max_degree=50000)
            guessed_ids.append(span_ids)
        return guessed_ids

    def relation_detection(self, p_spans, verbose=False, cutoff=500, threshold=0.0): 
        guessed_ids = []
        for span in p_spans:
            span_ids = {}
            guessed_labels = []
            if span in self.p_vectors:
                guessed_labels.append([span, 1])
            for p, score in self.p_vectors.most_similar(span, topn=cutoff):
                if score >= threshold:
                    guessed_labels.append([p, score])
            for label, score in guessed_labels:
                for match in self.p_index.look_up_by_label(label):
                    _id = match['_source']['id']
                    span_ids[_id] = score
                    if verbose:
                        uri = match['_source']['uri']
                        print(uri)
                        print(score)
            guessed_ids.append(span_ids)
        return guessed_ids

    # MP functions
    def generate_adj_sp(self, adjacencies, n_entities, include_inverse):
        '''
        Build adjacency matrix
        '''
        adj_shape = (n_entities, n_entities)
        # collect all predicate matrices separately into a list
        sp_adjacencies = []

        for edges in adjacencies:
            # split subject (row) and object (col) node URIs
            n_edges = len(edges)
            row, col = np.transpose(edges)
            
            # duplicate edges in the opposite direction
            if include_inverse:
                _row = np.hstack([row, col])
                col = np.hstack([col, row])
                row = _row
                n_edges *= 2
            
            # create adjacency matrix for this predicate
            data = np.ones(n_edges)
            adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
            sp_adjacencies.append(adj)
        
        return np.asarray(sp_adjacencies)

    def hop(self, entities, constraints, top_predicates, verbose=False, max_triples=500000, bl_p=[68655]):
        '''
        Extract the subgraph for the selected entities
        bl_p  -- the list of predicates to ignore (e.g. type predicate is too expensive to expand)
        ''' 
    #     print(top_predicates)
        n_constraints = len(constraints)
        if entities:
            n_constraints += 1

        top_entities = entities + constraints
        all_entities_ids = [_id for e in top_entities for _id in e]
        top_predicates_ids = [_id for p in top_predicates for _id in p if _id not in bl_p]

        # iteratively call the HDT API to retrieve all subgraph partitions
        activations = defaultdict(int)
        offset = 0

        while True:
            # get the subgraph for selected predicates only
    #         print(top_predicates_ids)
            self.kg.configure_hops(1, top_predicates_ids, namespace, True)
            entities, predicate_ids, adjacencies = self.kg.compute_hops(all_entities_ids, max_triples, offset)
    #         print(adjacencies)
            # show subgraph entities
    #         print([e_index.look_up_by_id(e)[0]['_source']['uri'] for e in entities])
            
            if not entities:
                answers = [{a_id: a_score} for a_id, a_score in activations.items()]
                return answers

            if verbose:
                print("Subgraph extracted:")
                print("%d entities"%len(entities))
                print("%d predicates"%len(predicate_ids))
                print("Loading adjacencies..")

            offset += max_triples
            # index entity ids global -> local
            entities_dict = {k: v for v, k in enumerate(entities)}
            # generate a list of adjacency matrices per predicate assuming the graph is undirected wo self-loops
            A = self.generate_adj_sp(adjacencies, len(entities), include_inverse=True)
    #         print(predicate_ids)
            # activate entities -- build sparse matrix
            row, col, data = [], [], []
            for i, concept_ids in enumerate(top_entities):
                for entity_id, score in concept_ids.items():
                    if entity_id in entities_dict:
    #                     print(e_index.look_up_by_id(entity_id)[0]['_source']['uri'])
    #                     print(score)
                        local_id = entities_dict[entity_id]
                        row.append(i)
                        col.append(local_id)
                        data.append(score)
            x = sp.csr_matrix((data, (row, col)), shape=(len(top_entities), len(entities)))
        
            # iterate over predicates
            ye = sp.csr_matrix((len(top_entities), len(entities)))
            # activate predicates
            if top_predicates_ids:
                yp = sp.csr_matrix((len(top_predicates), len(entities)))
                for i, concept_ids in enumerate(top_predicates):
                    # activate predicates
                    p = np.zeros([len(predicate_ids)])
                    # iterate over synonyms
                    for p_id, score in concept_ids.items():
                        if p_id in predicate_ids:
                            local_id = predicate_ids.index(p_id)
                            p[local_id] = score
                    # slice A by the selected predicates
                    _A = sum(p*A)
                    _y = x @ _A
                    # normalize: cut top to 1
                    _y[_y > 1] = 1
                    yp[i] = _y.sum(0)
                    ye += _y
                y = sp.vstack([ye,yp])
            # fall back to evaluate all predicates
            else:
                y = x @ sum(A)
            sum_a = sum(y)
            sum_a_norm = sum_a.toarray()[0] / (len(top_predicates) + n_constraints) #normalize(sum_a, norm='max', axis=1).toarray()[0]
            # normalize: cut top to 1
            sum_a_norm[sum_a_norm > 1] = 1
            # activations across components
            y_counts = binarize(y, threshold=0.0)
            count_a = sum(y_counts).toarray()[0]
            # final scores
            y = (sum_a_norm + count_a) / (len(top_predicates) + n_constraints + 1)

            # check output size
            assert y.shape[0] == len(entities)

            top = np.argwhere(y > 0).T.tolist()[0]
            if len(top) > 0:
                activations1 = np.asarray(entities)[top]
                # store the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
            # if no such answer is found, fall back to returning the answers satisfying the maximum number of constraints
            else:
                # select answers that satisfy maximum number of constraints
                y_p = np.argmax(y)
                # maximum number of satisfied constraints
                max_cs = y[y_p]
                # at least some activation (evidence from min one constraint)
                if max_cs != 0:
                    # select answers
                    top = np.argwhere(y == max_cs).T.tolist()[0]
                    activations1 = np.asarray(entities)[top]
                    # store the activation values per answer id
                    for i, e in enumerate(entities):
                        if e in activations1:
                            activations[e] += y[i]

    def request(self, question, top_n=3, verbose=False):
        # parse question into words and embed
        x_test_sent = np.zeros((self.model_settings['max_len'], self.model_settings['emb_dim']))
        q_words = text_to_word_sequence(question)
        for i, word in enumerate(q_words):
            x_test_sent[i] = self.word_vectors.query(word)

        # predict question type
        if verbose:
            print(x_test_sent)
        y_p = self.qt_model.predict(np.array([x_test_sent]))
        y_p = np.argmax(y_p, axis=-1)[0]
        p_qt = question_types[y_p]
        ask_question = p_qt == 'ASK'
        print(p_qt)

        # use GS spans + preprocess
        y_p = self.ep_model.predict(np.array([x_test_sent]))
        y_p = np.argmax(y_p, axis=-1)[0]
        e_spans1 = collect_mentions(q_words, y_p, 1)
        p_spans1 = collect_mentions(q_words, y_p, 2)
        p_spans2 = collect_mentions(q_words, y_p, 3)

        #         c_spans1 = doc['c1_spans']
        #         c_spans2 = doc['c2_spans']

        # match predicates
        top_predicates_ids1 = self.relation_detection(p_spans1, threshold=0)
        top_predicates_ids2 = self.relation_detection(p_spans2, threshold=0)

        # use GS classes
        #         classes1 = [{_id: 1} for _id in doc['classes_ids'] if _id in doc['1hop_ids'][0]]
        #         classes2 = [{_id: 1} for _id in doc['classes_ids'] if _id in doc['2hop_ids'][0]]

        top_entities_ids1 = self.entity_linking(e_spans1, threshold=0.7)

        if ask_question:
            a_threshold = 0.0
        else:
            a_threshold = 0.5

        # MP
        answers_ids = []

        # 1st hop
        answers_ids1 = self.hop([], top_entities_ids1, top_predicates_ids1, verbose)
        #         if classes1:
        #             answers_ids1 = filter_answer_by_class(classes1, answers_ids1)
        answers1 = [{a_id: a_score} for activations in answers_ids1 for a_id, a_score in activations.items() if a_score > a_threshold]

        # 2nd hop
        if top_predicates_ids1 and top_predicates_ids2:                
            answers_ids = self.hop(answers1, [], top_predicates_ids2, verbose)
        #             if classes2:
        #                 answers_ids = filter_answer_by_class(classes2, answers_ids)
            answers = [{a_id: a_score} for activations in answers_ids for a_id, a_score in activations.items() if a_score > a_threshold]
        else:
            answers = answers1

        answers_ids = [_id for a in answers for _id in a]


        # show spans
        print(e_spans1)
        print(p_spans1)
        print(p_spans2)

        # show  matches
        print([{self.e_index.look_up_by_id(_id)[0]['_source']['uri']: score} for answer in top_entities_ids1 for _id, score in answer.items() if self.e_index.look_up_by_id(_id)][:top_n])
        print([{self.p_index.look_up_by_id(_id)[0]['_source']['uri']: score} for answer in top_predicates_ids1 for _id, score in answer.items() if self.p_index.look_up_by_id(_id)][:top_n])
        print([{self.p_index.look_up_by_id(_id)[0]['_source']['uri']: score} for answer in top_predicates_ids2 for _id, score in answer.items() if self.p_index.look_up_by_id(_id)][:top_n])


        # show intermediate answers if there was a second hop
        if top_predicates_ids2:
            print([{self.e_index.look_up_by_id(_id)[0]['_source']['uri']: score} for answer in answers1 for _id, score in answer.items() if self.e_index.look_up_by_id(_id)][:top_n])


        if ask_question:
            # make sure the output matches every input basket
            all_entities_baskets = [set(e.keys()) for e in top_entities_ids1]
            answers = all(x & set(answers_ids) for x in all_entities_baskets)
        else:
            # show answers
            answers = [{self.e_index.look_up_by_id(_id)[0]['_source']['uri']: score} for answer in answers for _id, score in answer.items() if self.e_index.look_up_by_id(_id)][:top_n]
        
        if verbose:
            print(answers)

        return answers

    def test_request(self):
        question = "What are some other works of the author of The Phantom of the Opera?"
        self.request(question, verbose=True)
Example #5
File: EL.py  Project: svakulenk0/KBQA
def entity_linking(spans_field,
                   save,
                   show_errors=True,
                   add_neighbours=True,
                   lookup_embeddings=True):
    # iterate over the cursor
    cursor = mongo.get_sample(limit=limit)
    count = 0
    # hold macro-average stats for the model performance over the samples
    ps, rs, fs = [], [], []
    with cursor:
        for doc in cursor:
            # if 'entity_ids_guess' not in doc:
            correct_uris = doc['entity_uris']
            print(set(correct_uris))
            # get entity spans
            e_spans = doc[spans_field]
            #         e_spans = doc[spans_field+'_guess']
            #     print(e_spans)
            # get entity matches TODO save scores
            top_ids = []
            top_entities = {}
            for span in e_spans:
                print("Span: %s" % span)
                print("Index lookup..")
                guessed_labels, guessed_ids, look_up_ids = [], [], []
                for match in e_index.match_label(span, top=string_cutoff):
                    label = match['_source']['label_exact']
                    degree = match['_source']['count']
                    #                 print(degree)
                    _id = match['_source']['id']
                    # avoid expanding heavy hitters
                    if int(degree) < max_degree:
                        look_up_ids.append(_id)
                    guessed_ids.append(_id)
                    if label not in guessed_labels:
                        guessed_labels.append(label)
                    uri = match['_source']['uri']
    #                 print(uri)

                print("%d candidate labels" % len(guessed_labels))
                if add_neighbours:
                    print("KG lookup..")
                    kg = HDTDocument(hdt_path + hdt_file)
                    kg.configure_hops(1, [], namespace, True)
                    # get a sample of the subgraph: the first <max_triples> only
                    entities, predicate_ids, adjacencies = kg.compute_hops(
                        look_up_ids, max_triples, 0)
                    kg.remove()
                    # look up labels
                    for e_id in entities:
                        match = e_index.look_up_by_id(e_id)
                        if match:
                            label = match[0]['_source']['label_exact']
                            if label not in guessed_labels:
                                guessed_labels.append(label)
                    guessed_ids.extend(entities)

                # score with embeddings
                guessed_labels = [
                    label for label in guessed_labels if label in e_vectors
                ]
                print("%d candidate labels" % len(guessed_labels))
                if guessed_labels and lookup_embeddings:
                    print("Embeddings lookup..")
                    dists = e_vectors.distance(span, [
                        label for label in guessed_labels if label in e_vectors
                    ])
                    top = np.argsort(dists)[:semantic_cutoff].tolist()
                    top_labels = [guessed_labels[i] for i in top]
                    print("selected labels: %s" % top_labels)
                    print("Index lookup..")
                    top_entities[span] = []
                    for i, label in enumerate(top_labels):
                        print(label)
                        for match in e_index.look_up_by_label(label):
                            distance = float(dists[top[i]])
                            degree = match['_source']['count']
                            _id = match['_source']['id']
                            uri = match['_source']['uri']
                            print(uri)
                            top_entities[span].append({
                                'rank': i + 1,
                                'distance': distance,
                                'degree': degree,
                                'id': _id,
                                'uri': uri
                            })
                            top_ids.append(_id)
                else:
                    top_labels = guessed_labels
                    top_ids.extend(guessed_ids)

            # evaluate against the correct entity ids
            top_ids = list(set(top_ids))
            correct_ids = set(doc['entity_ids'])
            n_hits = len(correct_ids & set(top_ids))
            try:
                r = float(n_hits) / len(correct_ids)
            except ZeroDivisionError:
                # no gold entity ids for this question
                r = 0
                print(doc['question'])
            try:
                p = float(n_hits) / len(top_ids)
            except ZeroDivisionError:
                p = 0
            try:
                f = 2 * p * r / (p + r)
            except ZeroDivisionError:
                f = 0
            print("P: %.2f R: %.2f F: %.2f" % (p, r, f))

            # add stats
            ps.append(p)
            rs.append(r)
            fs.append(f)

            # save to MongoDB
            if save:
                doc['entity_ids_guess'] = top_ids
                doc['entity_guess'] = top_entities
                mongo.col.update_one({'_id': doc['_id']}, {"$set": doc},
                                     upsert=True)
                count += 1

    print("P: %.2f R: %.2f F: %.2f" % (np.mean(ps), np.mean(rs), np.mean(fs)))
    print("Fin. Results for %d questions" % len(ps))
    if save:
        print("%d documents annotated with entity ids guess" % count)
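
The precision/recall/F-score bookkeeping in the loop above can be factored into a small helper; a sketch with a hypothetical name, guarding against empty sets:

def prf(correct_ids, guessed_ids):
    '''Precision, recall and F1 of the guessed ids against the gold ids.'''
    correct_ids, guessed_ids = set(correct_ids), set(guessed_ids)
    n_hits = len(correct_ids & guessed_ids)
    p = n_hits / len(guessed_ids) if guessed_ids else 0
    r = n_hits / len(correct_ids) if correct_ids else 0
    f = 2 * p * r / (p + r) if p + r else 0
    return p, r, f

# prf([1, 2, 3], [2, 3, 4]) -> (0.67, 0.67, 0.67)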
Example #6
def hop(entities,
        constraints,
        top_predicates,
        verbose=False,
        max_triples=500000):
    '''
    Extract the subgraph for the selected entities
    '''
    #     print(top_predicates)
    n_constraints = len(constraints)
    if entities:
        n_constraints += 1

    top_entities = entities + constraints
    all_entities_ids = [_id for e in top_entities for _id in e]
    top_predicates_ids = [_id for p in top_predicates for _id in p if _id]

    # iteratively call the HDT API to retrieve all subgraph partitions
    activations = defaultdict(int)
    offset = 0

    while True:
        # get the subgraph for selected predicates only
        kg = HDTDocument(hdt_path + hdt_file)
        #         print(top_predicates_ids)
        kg.configure_hops(1, top_predicates_ids, namespace, True)
        entities, predicate_ids, adjacencies = kg.compute_hops(
            all_entities_ids, max_triples, offset)
        kg.remove()
        #         print(adjacencies)
        # show subgraph entities
        #         print([e_index.look_up_by_id(e)[0]['_source']['uri'] for e in entities])

        if not entities:
            answers = [{
                a_id: a_score
            } for a_id, a_score in activations.items()]
            return answers

        # if verbose:
        # print("Subgraph extracted:")
        # print("%d entities"%len(entities))
        # print("%d predicates"%len(predicate_ids))
        # print("Loading adjacencies..")

        offset += max_triples
        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entities)}
        # generate a list of adjacency matrices per predicate assuming the graph is undirected wo self-loops
        A = generate_adj_sp(adjacencies, len(entities), include_inverse=True)
        #         print(predicate_ids)
        # activate entities -- build sparse matrix
        row, col, data = [], [], []
        for i, concept_ids in enumerate(top_entities):
            for entity_id, score in concept_ids.items():
                if entity_id in entities_dict:
                    #                     print(e_index.look_up_by_id(entity_id)[0]['_source']['uri'])
                    #                     print(score)
                    local_id = entities_dict[entity_id]
                    row.append(i)
                    col.append(local_id)
                    data.append(score)
        x = sp.csr_matrix((data, (row, col)),
                          shape=(len(top_entities), len(entities)))

        # iterate over predicates
        ye = sp.csr_matrix((len(top_entities), len(entities)))
        # activate predicates
        if top_predicates_ids:
            yp = sp.csr_matrix((len(top_predicates), len(entities)))
            for i, concept_ids in enumerate(top_predicates):
                # activate predicates
                p = np.zeros([len(predicate_ids)])
                # iterate over synonyms
                for p_id, score in concept_ids.items():
                    if p_id in predicate_ids:
                        local_id = predicate_ids.index(p_id)
                        p[local_id] = score
                # slice A by the selected predicates
                _A = sum(p * A)
                _y = x @ _A
                # normalize: cut top to 1
                _y[_y > 1] = 1
                yp[i] = _y.sum(0)
                ye += _y
            y = sp.vstack([ye, yp])
        # fall back to evaluate all predicates
        else:
            y = x @ sum(A)
        sum_a = sum(y)
        sum_a_norm = sum_a.toarray()[0] / (
            len(top_predicates) + n_constraints
        )  #normalize(sum_a, norm='max', axis=1).toarray()[0]
        # normalize: cut top to 1
        sum_a_norm[sum_a_norm > 1] = 1
        # activations across components
        y_counts = binarize(y, threshold=0.0)
        count_a = sum(y_counts).toarray()[0]
        # final scores
        y = (sum_a_norm + count_a) / (len(top_predicates) + n_constraints + 1)

        # check output size
        assert y.shape[0] == len(entities)

        top = np.argwhere(y > 0).T.tolist()[0]
        if len(top) > 0:
            activations1 = np.asarray(entities)[top]
            # store the activation values per answer id
            for i, e in enumerate(entities):
                if e in activations1:
                    activations[e] += y[i]
        # if no such answer is found, fall back to returning the answers satisfying the maximum number of constraints
        else:
            # select answers that satisfy maximum number of constraints
            y_p = np.argmax(y)
            # maximum number of satisfied constraints
            max_cs = y[y_p]
            # at least some activation (evidence from min one constraint)
            if max_cs != 0:
                # select answers
                top = np.argwhere(y == max_cs).T.tolist()[0]
                activations1 = np.asarray(entities)[top]
                # store the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
Example #7
    def forward(self,
                e_scores,
                entity_ids,
                p_scores,
                answer=None,
                all_predicate_ids=all_predicate_ids):
        '''
        Inputs:
            *e_scores*: entity scores from Transformer
            *entity_ids*: global entity ids to request the KG for adjacencies
            *p_scores*: predicate scores from Transformer
        Outputs:
            *subgraph*: subgraph edges and entities
        '''
        #         with torch.autograd.detect_anomaly():
        # get the top-k (predicates/)entities based on the score vectors
        weights, indices = torch.sort(e_scores.view(-1), descending=True)
        sampled_entities = entity_ids[
            indices[:self.top_e]].tolist()  # choose top-k matching entities
        #         print("Retrieving adjacencies for %d entities"%len(sampled_entities))
        # sample predicates?
        sampled_predicates = []  # predicate_ids.tolist()
        #         weights, indices = torch.sort(p_scores.view(-1), descending=True)
        #         sampled_predicates = predicate_ids[indices[:self.top_p]].tolist()

        with torch.no_grad():

            # initialise connection to the Wikidata KG through the HDT API
            kg = HDTDocument(self.hdt_path)
            # request kg through hdt api for a subgraph given entity and relation subsets
            kg.configure_hops(1, sampled_predicates,
                              'predef-wikidata2018-09-all', True, False)
            s_entity_ids, s_predicate_ids, adjacencies = kg.compute_hops(
                sampled_entities, 5000, 0)
            kg.remove()
            del kg
            #         print("Retrieved new subgraph with %d entities and %d relations" % (len(s_entity_ids), len(s_predicate_ids)))

            # check subgraph exists
            if not s_entity_ids:
                return (), None

            # check we are in the right subgraph
            if answer is not None and answer not in s_entity_ids:
                return (), None

            # build a lookup table for entity & predicate scores
            e_table = build_look_up(entity_ids)
            p_table = build_look_up(all_predicate_ids)
            del all_predicate_ids

        # load subgraph into tensor
        indices, relation_mask = adj(adjacencies, len(s_entity_ids),
                                     len(s_predicate_ids))
        #         print("%d triples" % len(indices))

        # lookup local scores to activate respective entities & predicates
        e_scores = look_up(e_table, s_entity_ids, e_scores)
        p_scores = look_up(p_table, s_predicate_ids, p_scores)
        del p_table, s_predicate_ids, e_table, adjacencies

        # clean up
        gc.collect()
        torch.cuda.empty_cache()

        return (indices, e_scores, p_scores, relation_mask), s_entity_ids
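
The top-k entity sampling at the beginning of forward() can be illustrated in isolation; a minimal sketch with made-up scores and ids:

# sketch: pick the top-k entities by Transformer score before querying the KG
import torch

e_scores = torch.tensor([0.1, 0.9, 0.4, 0.7])    # entity scores from the Transformer
entity_ids = torch.tensor([101, 202, 303, 404])  # corresponding global KG ids
top_e = 2

weights, indices = torch.sort(e_scores.view(-1), descending=True)
sampled_entities = entity_ids[indices[:top_e]].tolist()
print(sampled_entities)                          # [202, 404]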