Exemplo n.º 1
0
 def _knwnEntity_candidate_stats(self, mentions):
     """Sum per-mention candidate statistics over ``mentions``.

     For each mention the cached ``(candidates, cprobs)`` pair is read from
     ``self.trval_cands_dict`` under the key
     ``(utils._getLnrm(m.surface), m.wid)``; the priors are handed to
     ``utils.getCandStatsForMention`` and the four values it returns are
     accumulated.

     Returns:
         ``(num_mentions, recallAt1, recallAt30, noCandsForMention,
         numCands)`` where the last four are the running sums.

     Raises:
         KeyError: if a mention's wid is missing from ``self.knwid2idx`` or
             its key is missing from ``self.trval_cands_dict``.
     """
     num_mentions = len(mentions)
     total_r1, total_r30 = 0, 0
     total_cands, total_nocand = 0, 0
     for mention in mentions:
         # Result unused: the lookup only guards against an unknown true wid.
         self.knwid2idx[mention.wid]
         cache_key = (utils._getLnrm(mention.surface), mention.wid)
         _, priors = self.trval_cands_dict[cache_key]
         stats = utils.getCandStatsForMention(priors)
         (hit1, hit30, n_cands, n_nocand) = stats
         total_r1 += hit1
         total_r30 += hit30
         total_cands += n_cands
         total_nocand += n_nocand
     return (num_mentions, total_r1, total_r30, total_nocand, total_cands)
    def _getCandidatesForSurfaceWid(self, surface, wid):
        """Build the fixed-length candidate list and priors for one mention.

        Position 0 always holds the true entity: ``self.knwid2idx[wid]`` when
        ``wid`` is known, otherwise the index of ``"<unk_wid>"``.  The
        remaining slots are filled, in crosswikis order, with the known
        candidates listed for the normalised surface, then truncated/padded
        to exactly ``self.numc`` entries (padding: ``"<unk_wid>"`` index with
        prior ``0.0``).

        Args:
            surface: raw mention surface; normalised with ``utils._getLnrm``.
            wid: true entity id of the mention (may be outside the known set).

        Returns:
            ``(candidates, cprobs)`` - two lists of length ``self.numc``.
            ``cprobs[0]`` is the crosswikis prior of ``wid`` for this surface,
            or ``0.0`` when crosswikis does not list it.

        Raises:
            KeyError: if ``"<unk_wid>"`` is missing from ``self.knwid2idx``.
        """
        unk_idx = self.knwid2idx["<unk_wid>"]
        # Though all training mentions' true wid is in the known set, falling
        # back to unk keeps this function usable for other mentions.  The
        # resolved index is reused by the sanity checks below so that they do
        # not re-index knwid2idx with a wid that may be unknown.
        true_idx = self.knwid2idx.get(wid, unk_idx)
        candidates = [true_idx]
        cprobs = [0.0]

        # Adding candidates for the complete surface
        surfacelnrm = utils._getLnrm(surface)
        if surfacelnrm in self.crosswikis_dict:
            # Parallel sequences: candidate wids and their priors
            (wids, probs) = self.crosswikis_dict[surfacelnrm]
            for (c, p) in zip(wids, probs):
                if c == wid:
                    # True entity already sits at position 0; record its prior
                    cprobs[0] = p
                elif c in self.knwid2idx:
                    candidates.append(self.knwid2idx[c])
                    cprobs.append(p)

        candidates = candidates[0:self.numc]
        cprobs = cprobs[0:self.numc]
        assert len(candidates) == len(cprobs)
        num_real = len(candidates)
        remain = self.numc - num_real
        candidates.extend([unk_idx] * remain)
        cprobs.extend([0.0] * remain)

        # Internal invariants
        assert len(candidates) == self.numc
        assert len(cprobs) == self.numc
        assert candidates[0] == true_idx
        # Only real candidates must differ from the true entity; padding is
        # unk and legitimately equals position 0 when the true wid is unknown.
        for idx in candidates[1:num_real]:
            assert idx != true_idx
        for idx in candidates[1:]:
            assert idx < len(self.idx2knwid)

        return (candidates, cprobs)
 def _addCandidatesForMentions(self, mentions, cwiki_dict):
     """Populate ``cwiki_dict`` with candidates for every mention.

     Entries are keyed by ``(utils._getLnrm(m.surface), m.wid)`` and hold the
     ``(candidates, cprobs)`` pair produced by
     ``self._getCandidatesForSurfaceWid``.  Keys already present are left
     untouched.  ``cwiki_dict`` is modified in place; nothing is returned.

     Every mention's wid is asserted to be in ``self.knwid2idx``.
     """
     for mention in mentions:
         assert mention.wid in self.knwid2idx, "Wid not in knwid2idx!!!"
         cache_key = (utils._getLnrm(mention.surface), mention.wid)
         if cache_key in cwiki_dict:
             continue
         cand_idxs, priors = self._getCandidatesForSurfaceWid(
             mention.surface, mention.wid)
         cwiki_dict[cache_key] = (cand_idxs, priors)
Exemplo n.º 4
0
    def get_candidates(self, mention):
        """Return up to ``self.num_cands`` candidates for ``mention``.

        The mention surface is normalised with ``utils._getLnrm`` and looked
        up in ``self.crosswikis_dict``; element 0 of each of the first
        ``self.num_cands`` stored entries is returned, in stored order.  An
        empty list is returned when the surface is not in the dictionary.
        """
        key = utils._getLnrm(mention.surface)
        if key not in self.crosswikis_dict:
            return []
        top_entries = self.crosswikis_dict[key][0:self.num_cands]
        return [entry[0] for entry in top_entries]
Exemplo n.º 5
0
 def get_fuzzy_candidates(self, mention):
     """Return at most ``self.num_cands`` candidates, backing off to tokens.

     Candidates for the full normalised surface come first (element 0 of the
     first ``self.num_cands`` entries in ``self.crosswikis_dict``).  If slots
     remain, each space-separated token of the surface is normalised and
     looked up in turn, and the leading entries of its list are appended
     until the remaining slots are filled.

     Compared with collecting the token candidates in a plain ``set``:
       * the result never exceeds ``self.num_cands`` entries,
       * a candidate already taken from the full surface is not repeated,
       * token candidates keep their lookup order, so the output does not
         depend on set iteration order.

     Args:
         mention: object exposing a ``surface`` string.

     Returns:
         List of candidates (element 0 of the crosswikis entries).
     """
     candidates = []
     surfacelnrm = utils._getLnrm(mention.surface)
     if surfacelnrm in self.crosswikis_dict:
         cands = self.crosswikis_dict[surfacelnrm][0:self.num_cands]
         candidates.extend(c[0] for c in cands)

     # Number of slots the token back-off is allowed to fill
     budget = self.num_cands - len(candidates)
     seen = set(candidates)
     extra_cands = []
     for token in mention.surface.split(" "):
         if len(extra_cands) >= budget:
             break
         tokenlnrm = utils._getLnrm(token)
         if tokenlnrm not in self.crosswikis_dict:
             continue
         for c in self.crosswikis_dict[tokenlnrm][0:budget]:
             if len(extra_cands) >= budget:
                 break
             if c[0] in seen:
                 continue
             seen.add(c[0])
             extra_cands.append(c[0])
     candidates.extend(extra_cands)
     return candidates
 def _addCandidatesForAdditionalMentions(self, mentions, cwiki_dict):
     """Populate ``cwiki_dict`` for mentions whose wid may be unknown.

     Entries are keyed by ``(utils._getLnrm(m.surface), m.wid)``; existing
     keys are skipped.  For a wid in ``self.knwid2idx`` the entry comes from
     ``self._getCandidatesForSurfaceWid``; otherwise it is ``self.numc``
     copies of the ``"<unk_wid>"`` index paired with ``self.numc`` zeros.
     ``cwiki_dict`` is modified in place; nothing is returned.
     """
     for mention in mentions:
         cache_key = (utils._getLnrm(mention.surface), mention.wid)
         if cache_key in cwiki_dict:
             continue
         if mention.wid in self.knwid2idx:
             cand_idxs, priors = self._getCandidatesForSurfaceWid(
                 mention.surface, mention.wid)
         else:
             cand_idxs = [self.knwid2idx["<unk_wid>"]] * self.numc
             priors = [0.0] * self.numc
         cwiki_dict[cache_key] = (cand_idxs, priors)
Exemplo n.º 7
0
    def make_candidates_cprobs(self, m):
        """Return padded candidate indices and priors for mention ``m``.

        The normalised surface is looked up in ``self.crosswikis``; candidate
        wids are mapped through ``self.knwid2idx`` and both lists are padded
        (index ``0`` / prior ``0.0``) up to ``self.num_cands`` entries.  A
        surface with no crosswikis entry yields all-padding lists.

        The priors are copied before padding so the list cached inside
        ``self.crosswikis`` is never modified; padding it in place would
        leave the cache out of sync with its wid list and make a second call
        for the same surface fail the length check below.

        Args:
            m: mention exposing a ``surface`` string.

        Returns:
            ``(wid_idxs, wid_cprobs)`` - two freshly built lists.

        Raises:
            KeyError: if a cached wid is missing from ``self.knwid2idx``.
        """
        surface = utils._getLnrm(m.surface)
        if surface in self.crosswikis:
            # Pruned crosswikis has only known wids and 30 cands at max
            # NOTE(review): this slice is applied to the stored pair itself,
            # not to the two lists inside it - confirm that is intended.
            candwids_cprobs = self.crosswikis[surface][0:self.num_cands - 1]
            (wids, cached_cprobs) = candwids_cprobs
            wid_idxs = [self.knwid2idx[wid] for wid in wids]
            wid_cprobs = list(cached_cprobs)
        else:
            # No entry for this surface: start empty, pad fully below
            wid_idxs = []
            wid_cprobs = []

        # All possible candidates added now. Pad with unks
        assert len(wid_idxs) == len(wid_cprobs)
        remain = self.num_cands - len(wid_idxs)
        wid_idxs.extend([0] * remain)
        wid_cprobs.extend([0.0] * remain)

        return (wid_idxs, wid_cprobs)
Exemplo n.º 8
0
    def make_candidates_cprobs(self, m):
        """Return padded candidate indices and priors for mention ``m``.

        The normalised surface is looked up in ``self.crosswikis``; candidate
        wids are mapped through ``self.knwid2idx`` and each list is padded
        (index ``0`` / prior ``0.0``) up to ``self.num_cands`` entries.  A
        surface with no crosswikis entry yields all-padding lists.

        The priors are copied before padding so the list cached inside
        ``self.crosswikis`` is never extended in place.

        Args:
            m: mention exposing a ``surface`` string.

        Returns:
            ``(wid_idxs, wid_cprobs)`` - two freshly built lists.

        Raises:
            KeyError: if a cached wid is missing from ``self.knwid2idx``.
        """
        surface = utils._getLnrm(m.surface)
        if surface in self.crosswikis:
            # Pruned crosswikis has only known wids and 30 cands at max
            # NOTE(review): this slice is applied to the stored pair itself,
            # not to the two lists inside it - confirm that is intended.
            candwids_cprobs = self.crosswikis[surface][0:self.num_cands - 1]
            (wids, cached_cprobs) = candwids_cprobs
            wid_idxs = [self.knwid2idx[wid] for wid in wids]
            wid_cprobs = list(cached_cprobs)
        else:
            wid_idxs = []
            wid_cprobs = []

        wid_idxs.extend([0] * (self.num_cands - len(wid_idxs)))
        wid_cprobs.extend([0.0] * (self.num_cands - len(wid_cprobs)))

        return (wid_idxs, wid_cprobs)
Exemplo n.º 9
0
    def make_candidates_cprobs(self, m):
        """Return ``self.num_cands`` candidate indices and priors for ``m``.

        Slot 0 is the true entity: ``self.knwid2idx[m.wid]`` when the wid is
        known, else the index of ``self.unk_wid``; its prior starts at 0.0
        and is overwritten if crosswikis lists the true wid for the surface.
        Other known crosswikis candidates follow in stored order.  The lists
        are then padded with index ``0`` / prior ``0.0`` and cut to exactly
        ``self.num_cands`` entries.

        Args:
            m: mention exposing ``wid`` and ``surface``.

        Returns:
            ``(wid_idxs, wid_cprobs)`` - two new lists.
        """
        wid2idx = self.knwid2idx
        # True entity first; unknown wids collapse onto the unk entry
        true_key = m.wid if m.wid in wid2idx else self.unk_wid
        wid_idxs = [wid2idx[true_key]]
        wid_cprobs = [0.0]  # replaced below if crosswikis has the true wid

        cwiki = self.crosswikis
        surface = utils._getLnrm(m.surface)
        if surface in cwiki:
            candwids, candwid_cprobs = cwiki[surface][0:self.num_cands - 1]
            for cand_wid, prior in zip(candwids, candwid_cprobs):
                if cand_wid not in wid2idx:
                    continue
                if cand_wid == m.wid:
                    wid_cprobs[0] = prior
                else:
                    wid_idxs.append(wid2idx[cand_wid])
                    wid_cprobs.append(prior)

        # Pad with unks, then clip to the fixed candidate count
        assert len(wid_idxs) == len(wid_cprobs)
        padding = self.num_cands - len(wid_idxs)
        wid_idxs = (wid_idxs + [0] * padding)[0:self.num_cands]
        wid_cprobs = (wid_cprobs + [0.0] * padding)[0:self.num_cands]

        return (wid_idxs, wid_cprobs)
Exemplo n.º 10
0
    def incrementCrossWikis(self, mentions):
        """Copy crosswikis entries for ``mentions`` into the two test dicts.

        For every mention whose normalised surface is in
        ``self.crosswikis_dict``:
          * ``self.test_all_cwiki[surface]`` receives the stored ``(c, p)``
            list as-is (same object, not a copy) if not already present;
          * ``self.test_kwn_cwiki[surface]`` receives a new list restricted
            to candidates in ``self.knwid2idx``, if not already present and
            the restricted list is non-empty.

        Args:
            mentions: iterable of mentions exposing a ``surface`` string.
        """
        for m in mentions:
            surface = utils._getLnrm(m.surface)
            if surface not in self.crosswikis_dict:
                continue
            c_cprobs = self.crosswikis_dict[surface]  # [(c, p)]

            # For all-entity crosswikis: add the c_cprobs as they are
            if surface not in self.test_all_cwiki:
                self.test_all_cwiki[surface] = c_cprobs

            # For known-entity crosswikis: keep only known candidates
            if surface not in self.test_kwn_cwiki:
                kwncands_cprobs = [(c, p) for (c, p) in c_cprobs
                                   if c in self.knwid2idx]
                if kwncands_cprobs:
                    self.test_kwn_cwiki[surface] = kwncands_cprobs
    def make_candidates_cprobs(self, m, useKnownEntitesOnly):
        """Return candidate wids and priors for ``m`` with the true wid first.

        ``candidates[0]`` is ``m.wid`` with prior 0.0, updated if the true
        wid appears among the first ``self.num_cands`` ``(wid, prior)`` pairs
        stored for the normalised surface.  All other pairs in that prefix
        are appended in order.  The lists are not padded or truncated.

        Args:
            m: mention exposing ``wid`` and ``surface``.
            useKnownEntitesOnly: truthy selects ``self.test_knwen_cwikis``,
                otherwise ``self.test_allen_cwikis``.
                NOTE(review): ``incrementCrossWikis`` in this file fills
                ``test_kwn_cwiki`` / ``test_all_cwiki`` - confirm the
                attribute names read here are the intended ones.

        Returns:
            ``(candidates, cprobs)`` - two lists of equal length.
        """
        candidates, cprobs = [m.wid], [0.0]
        cwiki_dict = (self.test_knwen_cwikis if useKnownEntitesOnly
                      else self.test_allen_cwikis)

        surface = utils._getLnrm(m.surface)
        if surface not in cwiki_dict:
            return (candidates, cprobs)

        for cand_wid, prior in cwiki_dict[surface][0:self.num_cands]:
            if cand_wid == m.wid:
                # True entity already at slot 0; just record its prior
                cprobs[0] = prior
                continue
            candidates.append(cand_wid)
            cprobs.append(prior)

        return (candidates, cprobs)
    def _getCandidatesForSurfaceWid(self, surface, wid):
        """Build the fixed-length candidate list and priors for one mention.

        Position 0 always holds the true entity: ``self.knwid2idx[wid]`` when
        ``wid`` is known, otherwise the index of ``"<unk_wid>"``.  The
        remaining slots are filled, in crosswikis order, with the known
        candidates listed for the normalised surface, then truncated/padded
        to exactly ``self.numc`` entries (padding: ``"<unk_wid>"`` index with
        prior ``0.0``).

        Args:
            surface: raw mention surface; normalised with ``utils._getLnrm``.
            wid: true entity id of the mention (may be outside the known set).

        Returns:
            ``(candidates, cprobs)`` - two lists of length ``self.numc``.
            ``cprobs[0]`` is the crosswikis prior of ``wid`` for this surface,
            or ``0.0`` when crosswikis does not list it.

        Raises:
            KeyError: if ``"<unk_wid>"`` is missing from ``self.knwid2idx``.
        """
        unk_idx = self.knwid2idx["<unk_wid>"]
        # Though all training mentions' true wid is in the known set, falling
        # back to unk keeps this function usable for other mentions.  The
        # resolved index is reused by the sanity checks below so that they do
        # not re-index knwid2idx with a wid that may be unknown.
        true_idx = self.knwid2idx.get(wid, unk_idx)
        candidates = [true_idx]
        cprobs = [0.0]

        # Adding candidates for the complete surface
        surfacelnrm = utils._getLnrm(surface)
        if surfacelnrm in self.crosswikis_dict:
            # Parallel sequences: candidate wids and their priors
            (wids, probs) = self.crosswikis_dict[surfacelnrm]
            for (c, p) in zip(wids, probs):
                if c == wid:
                    # True entity already sits at position 0; record its prior
                    cprobs[0] = p
                elif c in self.knwid2idx:
                    candidates.append(self.knwid2idx[c])
                    cprobs.append(p)

        candidates = candidates[0:self.numc]
        cprobs = cprobs[0:self.numc]
        assert len(candidates) == len(cprobs)
        num_real = len(candidates)
        remain = self.numc - num_real
        candidates.extend([unk_idx] * remain)
        cprobs.extend([0.0] * remain)

        # Internal invariants
        assert len(candidates) == self.numc
        assert len(cprobs) == self.numc
        assert candidates[0] == true_idx
        # Only real candidates must differ from the true entity; padding is
        # unk and legitimately equals position 0 when the true wid is unknown.
        for idx in candidates[1:num_real]:
            assert idx != true_idx
        for idx in candidates[1:]:
            assert idx < len(self.idx2knwid)

        return (candidates, cprobs)
Exemplo n.º 13
0
    def _next_batch(self, data_type):
        """Assemble one batch of ``self.batch_size`` known-entity mentions.

        Mentions are read one at a time with ``self._read_mention``.  A
        mention whose ``wid`` is not in ``self.knwid2idx`` is skipped, so
        reading continues until the batch is full.

        Context layout for a sentence ``s1 ... m1 ... mN ... sN``:
            left context  = s1 ... m1 ... mN
            right context = sN ... mN ... m1   (reversed)
        With ``self.strict_context`` the mention tokens are excluded from
        both sides.  ``m.end_token`` is treated as inclusive.

        Args:
            data_type: forwarded unchanged to ``self._read_mention``.

        Returns:
            Tuple ``(left_batch, right_batch, truewid_descvec_batch,
            labels_batch, coherence_batch, wid_idxs_batch,
            wid_cprobs_batch)``:
              * left_batch / right_batch: per-mention lists of word indices
                from ``self.convert_word2idx`` (after word dropout);
              * truewid_descvec_batch: ``self.kwnwid2descvecs[m.wid]`` per
                mention;
              * labels_batch: ``np.zeros([batch_size, num_labels])`` with 1.0
                at every label of the mention found in ``self.label2idx``;
              * coherence_batch: ``(indices, values, shape)`` triple, where
                indices are ``[row, cohidx]`` pairs and values are 1.0;
              * wid_idxs_batch / wid_cprobs_batch: entries read from
                ``self.test_cands_dict``.

        Raises:
            KeyError: if ``(utils._getLnrm(m.surface), m.wid)`` is missing
                from ``self.test_cands_dict``.
        """
        (left_batch, right_batch) = ([], [])

        # One row per mention, one column per label
        labels_batch = np.zeros([self.batch_size, self.num_labels])

        # Sparse [B, CohStrs] matrix: index pairs, their values, dense shape
        coh_indices = []
        coh_values = []
        coh_matshape = [self.batch_size, self.num_cohstr]

        # Description vectors of the true entity of each mention
        truewid_descvec_batch = []

        # Candidate WID idxs and their cprobs, as stored in test_cands_dict
        (wid_idxs_batch, wid_cprobs_batch) = ([], [])

        while len(left_batch) < self.batch_size:
            batch_el = len(left_batch)
            m = self._read_mention(data_type=data_type)

            # Only mentions of known entities enter the batch
            if m.wid not in self.knwid2idx:
                continue

            # Mention Types
            for label in m.types:
                if label in self.label2idx:
                    labelidx = self.label2idx[label]
                    labels_batch[batch_el][labelidx] = 1.0

            # Document Context Batch
            cohidxs = []
            cohvals = []
            cohFound = False
            for cohstr in m.coherence:
                # One random draw per coherence string (coherence dropout)
                r = random.random()
                if cohstr in self.cohG92idx and r < self.cohDropoutKeep:
                    cohidx = self.cohG92idx[cohstr]
                    cohidxs.append([batch_el, cohidx])
                    cohvals.append(1.0)
                    cohFound = True
            if cohFound:
                coh_indices.extend(cohidxs)
                coh_values.extend(cohvals)
            else:
                # Nothing survived: mark the row with the unk entry instead
                cohidx = self.cohG92idx[self.unk_word]
                coh_indices.append([batch_el, cohidx])
                coh_values.append(1.0)

            # Left and Right context
            if self.strict_context:    # Strict Context
                left_tokens = m.sent_tokens[0:m.start_token]
                right_tokens = m.sent_tokens[m.end_token+1:][::-1]
            else:    # Context inclusive of mention surface
                left_tokens = m.sent_tokens[0:m.end_token+1]
                right_tokens = m.sent_tokens[m.start_token:][::-1]

            # Word Dropout
            left_tokens = self.wordDropout(left_tokens, self.wordDropoutKeep)
            right_tokens = self.wordDropout(right_tokens, self.wordDropoutKeep)

            left_idxs = [self.convert_word2idx(word)
                         for word in left_tokens]
            right_idxs = [self.convert_word2idx(word)
                          for word in right_tokens]

            left_batch.append(left_idxs)
            right_batch.append(right_idxs)

            # Entity Description: m.wid is known here (unknown wids were
            # skipped above), so no unk fallback is needed.
            truewid_descvec_batch.append(self.kwnwid2descvecs[m.wid])

            # Candidate WID_Idxs and Prior Probabilities
            cands_dict_key = (utils._getLnrm(m.surface),
                              m.wid)
            (wid_idxs, wid_cprobs) = self.test_cands_dict[cands_dict_key]

            wid_idxs_batch.append(wid_idxs)
            wid_cprobs_batch.append(wid_cprobs)

        coherence_batch = (coh_indices, coh_values,
                           coh_matshape)

        return (left_batch, right_batch, truewid_descvec_batch, labels_batch,
                coherence_batch, wid_idxs_batch, wid_cprobs_batch)
Exemplo n.º 14
0
    def _next_batch(self, data_type):
        """Assemble one batch of ``self.batch_size`` mentions.

        Mentions are read one at a time with ``self._read_mention``; every
        mention read is added to the batch.  The true wid of each mention is
        returned explicitly in ``wids_batch``.

        Context layout for a sentence ``s1 ... m1 ... mN ... sN``:
            left context  = s1 ... m1 ... mN
            right context = sN ... mN ... m1   (reversed)
        With ``self.strict_context`` the mention tokens are excluded from
        both sides.  ``m.end_token`` is treated as inclusive.

        Args:
            data_type: forwarded unchanged to ``self._read_mention``.

        Returns:
            Tuple ``(left_batch, right_batch, wids_batch, labels_batch,
            coherence_batch, wid_idxs_batch, wid_cprobs_batch)``:
              * left_batch / right_batch: per-mention token lists after word
                dropout, mapped through ``self.convert_word2idx`` unless
                ``self.pretrain_wordembed`` is set (then left as tokens);
              * wids_batch: ``m.wid`` per mention;
              * labels_batch: ``np.zeros([batch_size, num_labels])`` with 1.0
                at every label of the mention found in ``self.label2idx``;
              * coherence_batch: ``(indices, values, shape)``; indices and
                values stay empty and shape is ``[]`` when
                ``self.coherence`` is falsy;
              * wid_idxs_batch / wid_cprobs_batch: entries read from
                ``self.trval_cands_dict``, or ``self.num_cands`` copies of
                the ``self.unk_wid`` index / 0.0 when the key is missing.
        """
        (left_batch, right_batch) = ([], [])

        # One row per mention, one column per label
        labels_batch = np.zeros([self.batch_size, self.num_labels])

        # Sparse [B, CohStrs] matrix: index pairs, their values, dense shape
        coh_indices = []
        coh_values = []
        if self.coherence:
            coh_matshape = [self.batch_size, self.num_cohstr]
        else:
            coh_matshape = []

        # Candidate WID idxs and their cprobs
        (wid_idxs_batch, wid_cprobs_batch) = ([], [])

        # As these are cold entities, the first element in wid_idxs_batch will
        # correspond to unk_wid. We need to explicitly pass the correct wid
        # for description and types.
        wids_batch = []

        while len(left_batch) < self.batch_size:
            batch_el = len(left_batch)
            m = self._read_mention(data_type=data_type)

            wids_batch.append(m.wid)

            # Labels
            for label in m.types:
                if label in self.label2idx:
                    labelidx = self.label2idx[label]
                    labels_batch[batch_el][labelidx] = 1.0

            # Coherence Batch
            if self.coherence:
                cohFound = False  # If nothing is kept, the unk entry is added
                cohidxs = []  # Indexes in the [B, NumCoh] matrix
                cohvals = []  # 1.0 to indicate presence
                for cohstr in m.coherence:
                    # One random draw per coherence string (dropout)
                    r = random.random()
                    if cohstr in self.cohG92idx and r < self.cohDropoutKeep:
                        cohidx = self.cohG92idx[cohstr]
                        cohidxs.append([batch_el, cohidx])
                        cohvals.append(1.0)
                        cohFound = True
                if cohFound:
                    coh_indices.extend(cohidxs)
                    coh_values.extend(cohvals)
                else:
                    cohidx = self.cohG92idx[self.unk_word]
                    coh_indices.append([batch_el, cohidx])
                    coh_values.append(1.0)

            # Left and Right context
            if self.strict_context:  # Strict Context
                left_tokens = m.sent_tokens[0:m.start_token]
                right_tokens = m.sent_tokens[m.end_token + 1:][::-1]
            else:  # Context inclusive of mention surface
                left_tokens = m.sent_tokens[0:m.end_token + 1]
                right_tokens = m.sent_tokens[m.start_token:][::-1]

            # Word Dropout
            left_tokens = self.wordDropout(left_tokens, self.wordDropoutKeep)
            right_tokens = self.wordDropout(right_tokens, self.wordDropoutKeep)

            if not self.pretrain_wordembed:
                left_idxs = [
                    self.convert_word2idx(word) for word in left_tokens
                ]
                right_idxs = [
                    self.convert_word2idx(word) for word in right_tokens
                ]
            else:
                # Tokens are passed through unconverted in this mode
                left_idxs = left_tokens
                right_idxs = right_tokens

            left_batch.append(left_idxs)
            right_batch.append(right_idxs)

            # Candidate WID_Idxs and Prior Probabilities
            cands_dict_key = (utils._getLnrm(m.surface), m.wid)
            try:
                (wid_idxs, wid_cprobs) = self.trval_cands_dict[cands_dict_key]
            except KeyError:
                # No cached candidates for this (surface, wid): all-unk entry.
                # Only the missing-key case is handled so that unrelated
                # errors (and KeyboardInterrupt) are no longer swallowed.
                wid_idxs = [self.knwid2idx[self.unk_wid]] * self.num_cands
                wid_cprobs = [0.0] * self.num_cands

            wid_idxs_batch.append(wid_idxs)
            wid_cprobs_batch.append(wid_cprobs)

        coherence_batch = (coh_indices, coh_values, coh_matshape)

        return (left_batch, right_batch, wids_batch, labels_batch,
                coherence_batch, wid_idxs_batch, wid_cprobs_batch)