def _knwnEntity_candidate_stats(self, mentions):
    """Sum candidate statistics over ``mentions`` using the cached candidates.

    For each mention the ``(candidates, cprobs)`` pair is read from
    ``self.trval_cands_dict`` under the key ``(normalized surface, wid)``.
    Only ``cprobs`` is used: it is handed to
    ``utils.getCandStatsForMention`` and the four values it returns are
    accumulated. Per the original note, the cached lists are laid out as
    ``[true_cand_idx, cand1_idx, cand2_idx, ..., unk_wid_idx]``.

    Args:
        mentions: sized iterable of mention objects exposing ``wid`` and
            ``surface``.

    Returns:
        Tuple ``(num_mentions, recallAt1, recallAt30, noCandsForMention,
        numCands)``; the last four are sums of the per-mention values.

    Raises:
        KeyError: if a mention's wid is missing from ``self.knwid2idx`` or
            its key is missing from ``self.trval_cands_dict``.
    """
    num_mentions = len(mentions)
    total_r1 = 0
    total_r30 = 0
    total_cands = 0
    total_without_cands = 0

    for mention in mentions:
        # The index itself is not used; the lookup is retained because it
        # raises KeyError for a wid outside the known set.
        self.knwid2idx[mention.wid]

        cache_key = (utils._getLnrm(mention.surface), mention.wid)
        _, cprobs = self.trval_cands_dict[cache_key]

        r1, r30, n_cands, n_without = utils.getCandStatsForMention(cprobs)
        total_r1 += r1
        total_r30 += r30
        total_cands += n_cands
        total_without_cands += n_without

    return (num_mentions, total_r1, total_r30, total_without_cands,
            total_cands)
def _getCandidatesForSurfaceWid(self, surface, wid):
    """Build the fixed-length candidate list and priors for ``(surface, wid)``.

    Slot 0 always holds the true entity: its index in ``self.knwid2idx`` when
    ``wid`` is known, otherwise the ``"<unk_wid>"`` index. Remaining slots are
    filled, in crosswikis order, with the known-entity candidates listed for
    the normalized surface. Both lists are truncated or padded (with the
    ``"<unk_wid>"`` index and prior ``0.0``) to exactly ``self.numc`` entries.

    Unlike the earlier version, an unknown ``wid`` no longer raises
    ``KeyError`` in the sanity checks; the function returns a list headed by
    the unknown-entity index, as the fallback at the top always intended.

    NOTE: a per-token fallback (adding candidates of the individual surface
    tokens when fewer than ``self.numc`` were found) used to be sketched here
    as disabled code; it was never executed and has been removed.

    Args:
        surface: raw mention surface string.
        wid: true entity id of the mention.

    Returns:
        Tuple ``(candidates, cprobs)``, two lists of length ``self.numc``.
    """
    unk_idx = self.knwid2idx["<unk_wid>"]
    wid_known = wid in self.knwid2idx
    true_idx = self.knwid2idx[wid] if wid_known else unk_idx

    # First candidate is the true entity; its prior is filled in below if
    # crosswikis lists it for this surface.
    candidates = [true_idx]
    cprobs = [0.0]

    surfacelnrm = utils._getLnrm(surface)
    if surfacelnrm in self.crosswikis_dict:
        (wids, probs) = self.crosswikis_dict[surfacelnrm]
        for (c, p) in zip(wids, probs):
            if c == wid:
                cprobs[0] = p
            elif c in self.knwid2idx:
                candidates.append(self.knwid2idx[c])
                cprobs.append(p)

    candidates = candidates[0:self.numc]
    cprobs = cprobs[0:self.numc]
    assert len(candidates) == len(cprobs)

    remain = self.numc - len(candidates)
    candidates.extend([unk_idx] * remain)
    cprobs.extend([0.0] * remain)

    assert len(candidates) == self.numc
    assert len(cprobs) == self.numc
    assert candidates[0] == true_idx
    for cand_idx in candidates[1:]:
        # For an unknown wid the true slot is the unk index, which padding
        # legitimately repeats, so uniqueness is only checked for known wids.
        if wid_known:
            assert cand_idx != true_idx
        assert cand_idx < len(self.idx2knwid)

    return (candidates, cprobs)
def _addCandidatesForMentions(self, mentions, cwiki_dict):
    """Cache candidates for every mention whose key is not yet in ``cwiki_dict``.

    The cache key is ``(normalized surface, wid)`` and the stored value is the
    ``(candidates, cprobs)`` pair produced by
    ``self._getCandidatesForSurfaceWid``. ``cwiki_dict`` is updated in place;
    nothing is returned.

    Args:
        mentions: iterable of mention objects exposing ``wid`` and ``surface``.
        cwiki_dict: dict to fill.

    Raises:
        AssertionError: if a mention's wid is not in ``self.knwid2idx``.
    """
    for mention in mentions:
        assert mention.wid in self.knwid2idx, "Wid not in knwid2idx!!!"

        cache_key = (utils._getLnrm(mention.surface), mention.wid)
        if cache_key in cwiki_dict:
            # Already computed for this (surface, wid) pair.
            continue

        cand_idxs, cand_probs = self._getCandidatesForSurfaceWid(
            mention.surface, mention.wid)
        cwiki_dict[cache_key] = (cand_idxs, cand_probs)
def get_candidates(self, mention):
    """Return the leading crosswikis candidates for a mention's surface.

    The mention surface is normalized with ``utils._getLnrm`` and looked up in
    ``self.crosswikis_dict``. The first ``self.num_cands`` entries are taken
    and element ``[0]`` of each entry is returned (entries are presumably
    ``(wid, prob)`` pairs - confirm against the dict builder).

    Args:
        mention: object exposing ``surface``.

    Returns:
        List of at most ``self.num_cands`` candidates; empty when the surface
        is not in the dictionary.
    """
    surface = utils._getLnrm(mention.surface)
    if surface not in self.crosswikis_dict:
        return []

    top_entries = self.crosswikis_dict[surface][0:self.num_cands]
    return [entry[0] for entry in top_entries]
def get_fuzzy_candidates(self, mention):
    """Return candidates for the full surface, topped up from its tokens.

    Candidates of the whole normalized surface come first (up to
    ``self.num_cands``). If slots remain, each space-separated token of the
    raw surface is normalized and looked up in turn, and its leading
    candidates are appended until the list is full.

    Compared with the earlier version, which collected token candidates in a
    ``set``:
      * the result never exceeds ``self.num_cands`` entries,
      * token candidates already in the list are not added again,
      * the order is deterministic (token order, then crosswikis order).

    Args:
        mention: object exposing ``surface``.

    Returns:
        List of at most ``self.num_cands`` candidates (element ``[0]`` of each
        crosswikis entry).
    """
    candidates = []
    surfacelnrm = utils._getLnrm(mention.surface)
    if surfacelnrm in self.crosswikis_dict:
        exact = self.crosswikis_dict[surfacelnrm][0:self.num_cands]
        candidates = [entry[0] for entry in exact]

    # Per-token look-ups consider only as many leading entries as were free
    # after the exact match, mirroring the original slice bound.
    free_slots = self.num_cands - len(candidates)
    if free_slots <= 0:
        return candidates

    seen = set(candidates)
    for token in mention.surface.split(" "):
        if len(candidates) >= self.num_cands:
            break
        tokenlnrm = utils._getLnrm(token)
        if tokenlnrm not in self.crosswikis_dict:
            continue
        for entry in self.crosswikis_dict[tokenlnrm][0:free_slots]:
            if len(candidates) >= self.num_cands:
                break
            cand = entry[0]
            if cand not in seen:
                seen.add(cand)
                candidates.append(cand)

    return candidates
def _addCandidatesForAdditionalMentions(self, mentions, cwiki_dict):
    """Cache candidates for mentions whose true wid may be unknown.

    For each mention whose ``(normalized surface, wid)`` key is missing from
    ``cwiki_dict``:
      * known wid: store the result of ``self._getCandidatesForSurfaceWid``;
      * unknown wid: store ``self.numc`` copies of the ``"<unk_wid>"`` index
        with priors of ``0.0``.

    ``cwiki_dict`` is updated in place; nothing is returned.

    Args:
        mentions: iterable of mention objects exposing ``wid`` and ``surface``.
        cwiki_dict: dict to fill.
    """
    for mention in mentions:
        cache_key = (utils._getLnrm(mention.surface), mention.wid)
        if cache_key in cwiki_dict:
            continue

        if mention.wid in self.knwid2idx:
            cand_idxs, cand_probs = self._getCandidatesForSurfaceWid(
                mention.surface, mention.wid)
        else:
            # No usable candidates for an entity outside the known set.
            cand_idxs = [self.knwid2idx["<unk_wid>"]] * self.numc
            cand_probs = [0.0] * self.numc

        cwiki_dict[cache_key] = (cand_idxs, cand_probs)
def make_candidates_cprobs(self, m):
    """Return fixed-length candidate indices and priors for mention ``m``.

    The normalized surface is looked up in ``self.crosswikis``, whose values
    are unpacked as a ``(wids, cprobs)`` pair. Per the original note, the
    pruned crosswikis holds only known wids. The wids are mapped through
    ``self.knwid2idx`` and both lists are cut or padded to
    ``self.num_cands``.

    Fixes over the earlier version:
      * a surface missing from crosswikis yields all-padding lists instead of
        failing on unbound locals;
      * the prior list is copied, so padding no longer mutates the list
        stored in ``self.crosswikis`` (which broke the length check on the
        next call for the same surface);
      * the ineffective slice of the ``(wids, cprobs)`` pair is dropped and
        the lists themselves are capped at ``self.num_cands``.

    Args:
        m: mention object exposing ``surface``.

    Returns:
        Tuple ``(wid_idxs, wid_cprobs)``, two lists of length
        ``self.num_cands``.

    Raises:
        KeyError: if crosswikis lists a wid that is not in
            ``self.knwid2idx``.
    """
    wid_idxs = []
    wid_cprobs = []

    surface = utils._getLnrm(m.surface)
    if surface in self.crosswikis:
        (wids, cached_cprobs) = self.crosswikis[surface]
        wid_idxs = [self.knwid2idx[wid] for wid in wids][0:self.num_cands]
        # Copy before padding so the cached priors stay untouched.
        wid_cprobs = list(cached_cprobs)[0:self.num_cands]

    assert len(wid_idxs) == len(wid_cprobs)

    # Pad with index 0 / prior 0.0 (index 0 is presumably the unknown
    # entity - confirm against the vocabulary builder).
    remain = self.num_cands - len(wid_idxs)
    wid_idxs.extend([0] * remain)
    wid_cprobs.extend([0.0] * remain)
    return (wid_idxs, wid_cprobs)
def make_candidates_cprobs(self, m):
    """Return fixed-length candidate indices and priors for mention ``m``.

    The normalized surface is looked up in ``self.crosswikis``, whose values
    are unpacked as a ``(wids, cprobs)`` pair. Per the original note, the
    pruned crosswikis holds only known wids. A surface that is not present
    yields all-padding lists.

    Fixes over the earlier version:
      * the prior list is copied before padding, so the list stored in
        ``self.crosswikis`` is no longer extended in place;
      * the ineffective slice of the ``(wids, cprobs)`` pair is dropped and
        both lists are capped at ``self.num_cands``.

    Args:
        m: mention object exposing ``surface``.

    Returns:
        Tuple ``(wid_idxs, wid_cprobs)``, two lists of length
        ``self.num_cands``.

    Raises:
        KeyError: if crosswikis lists a wid that is not in
            ``self.knwid2idx``.
    """
    surface = utils._getLnrm(m.surface)
    if surface in self.crosswikis:
        (wids, cached_cprobs) = self.crosswikis[surface]
        wid_idxs = [self.knwid2idx[wid] for wid in wids]
        # Work on a copy: padding below must not leak into the cache.
        wid_cprobs = list(cached_cprobs)
    else:
        wid_idxs = []
        wid_cprobs = []

    wid_idxs = wid_idxs[0:self.num_cands]
    wid_cprobs = wid_cprobs[0:self.num_cands]

    # Pad each list independently up to num_cands (index 0 is presumably the
    # unknown entity - confirm against the vocabulary builder).
    wid_idxs.extend([0] * (self.num_cands - len(wid_idxs)))
    wid_cprobs.extend([0.0] * (self.num_cands - len(wid_cprobs)))
    return (wid_idxs, wid_cprobs)
def make_candidates_cprobs(self, m):
    """Return fixed-length candidate indices and priors, true entity first.

    Slot 0 holds the index of ``m.wid`` in ``self.knwid2idx``, or the index
    of ``self.unk_wid`` when the wid is not known. The normalized surface is
    then looked up in ``self.crosswikis`` (unpacked as a ``(wids, cprobs)``
    pair): the prior of the true wid, if listed, is written to slot 0, and
    every other known candidate is appended. Both lists are padded with
    ``0`` / ``0.0`` and cut to ``self.num_cands``.

    The earlier version sliced the ``(wids, cprobs)`` pair with
    ``[0:num_cands - 1]`` before unpacking. That slice was a no-op for
    ``num_cands >= 3`` and made the unpack raise ``ValueError`` for smaller
    values, so it has been removed; the final truncation alone bounds the
    length.

    Args:
        m: mention object exposing ``wid`` and ``surface``.

    Returns:
        Tuple ``(wid_idxs, wid_cprobs)``, two lists of length
        ``self.num_cands``.
    """
    wid2idx = self.knwid2idx

    # First wid_idx is the true entity (or unk when it is not known).
    if m.wid in wid2idx:
        wid_idxs = [wid2idx[m.wid]]
    else:
        wid_idxs = [wid2idx[self.unk_wid]]
    # Prior of the true entity; updated below if crosswikis lists it.
    wid_cprobs = [0.0]

    surface = utils._getLnrm(m.surface)
    if surface in self.crosswikis:
        (candwids, candwid_cprobs) = self.crosswikis[surface]
        for (c, p) in zip(candwids, candwid_cprobs):
            if c not in wid2idx:
                continue
            if c == m.wid:
                wid_cprobs[0] = p
            else:
                wid_idxs.append(wid2idx[c])
                wid_cprobs.append(p)

    assert len(wid_idxs) == len(wid_cprobs)

    # Pad with index 0 (presumably the unknown entity - confirm that it
    # matches wid2idx[self.unk_wid]) and then enforce the fixed length.
    remain = self.num_cands - len(wid_idxs)
    wid_idxs.extend([0] * remain)
    wid_cprobs.extend([0.0] * remain)

    wid_idxs = wid_idxs[0:self.num_cands]
    wid_cprobs = wid_cprobs[0:self.num_cands]
    return (wid_idxs, wid_cprobs)
def incrementCrossWikis(self, mentions):
    """Copy crosswikis entries for the mentions' surfaces into the test dicts.

    For every mention whose normalized surface is in
    ``self.crosswikis_dict``:
      * ``self.test_all_cwiki[surface]`` receives the full ``(c, p)`` list
        (if the surface is not already there);
      * ``self.test_kwn_cwiki[surface]`` receives the sub-list whose
        candidates are in ``self.knwid2idx`` (if the surface is not already
        there and the sub-list is non-empty).

    The dictionaries are updated in place; nothing is returned. The unused
    ``wid`` / ``known`` locals of the earlier version were dropped.

    Args:
        mentions: iterable of mention objects exposing ``surface``.
    """
    for m in mentions:
        surface = utils._getLnrm(m.surface)
        if surface not in self.crosswikis_dict:
            continue

        c_cprobs = self.crosswikis_dict[surface]  # [(c, p)]

        # All-entity crosswikis: keep the candidate list as is.
        if surface not in self.test_all_cwiki:
            self.test_all_cwiki[surface] = c_cprobs

        # Known-entity crosswikis: keep only candidates in the known set.
        if surface not in self.test_kwn_cwiki:
            kwncands_cprobs = [
                (c, p) for (c, p) in c_cprobs if c in self.knwid2idx
            ]
            if kwncands_cprobs:
                self.test_kwn_cwiki[surface] = kwncands_cprobs
def make_candidates_cprobs(self, m, useKnownEntitesOnly):
    """Return raw candidate wids and priors for ``m``, true wid first.

    The crosswikis dictionary is ``self.test_knwen_cwikis`` when
    ``useKnownEntitesOnly`` is truthy and ``self.test_allen_cwikis``
    otherwise. The first ``self.num_cands`` ``(wid, prob)`` entries of the
    normalized surface are scanned: the prior of the true wid, if present, is
    written to slot 0; every other entry is appended. The result holds wids
    (not indices) and is neither padded nor truncated.

    Args:
        m: mention object exposing ``wid`` and ``surface``.
        useKnownEntitesOnly: selects which crosswikis dictionary to read.

    Returns:
        Tuple ``(candidates, cprobs)`` of equal-length lists.
    """
    # Slot 0 is reserved for the true entity.
    candidates = [m.wid]
    cprobs = [0.0]

    if useKnownEntitesOnly:
        source = self.test_knwen_cwikis
    else:
        source = self.test_allen_cwikis

    surface = utils._getLnrm(m.surface)
    if surface not in source:
        return (candidates, cprobs)

    for cand_wid, prior in source[surface][0:self.num_cands]:
        if cand_wid == m.wid:
            cprobs[0] = prior
            continue
        candidates.append(cand_wid)
        cprobs.append(prior)

    return (candidates, cprobs)
def _getCandidatesForSurfaceWid(self, surface, wid):
    """Build the fixed-length candidate list and priors for ``(surface, wid)``.

    Slot 0 always holds the true entity: its index in ``self.knwid2idx`` when
    ``wid`` is known, otherwise the ``"<unk_wid>"`` index. Remaining slots are
    filled, in crosswikis order, with the known-entity candidates listed for
    the normalized surface. Both lists are truncated or padded (with the
    ``"<unk_wid>"`` index and prior ``0.0``) to exactly ``self.numc`` entries.

    The earlier version fell back to the unknown index for an unknown
    ``wid`` but then indexed ``self.knwid2idx[wid]`` in its sanity checks,
    raising ``KeyError`` for exactly that case. The checks now compare
    against the index actually placed in slot 0.

    Args:
        surface: raw mention surface string.
        wid: true entity id of the mention.

    Returns:
        Tuple ``(candidates, cprobs)``, two lists of length ``self.numc``.
    """
    unk_idx = self.knwid2idx["<unk_wid>"]
    wid_known = wid in self.knwid2idx
    true_idx = self.knwid2idx[wid] if wid_known else unk_idx

    # First candidate is the true entity; its prior is filled in below if
    # crosswikis lists it for this surface.
    candidates = [true_idx]
    cprobs = [0.0]

    surfacelnrm = utils._getLnrm(surface)
    if surfacelnrm in self.crosswikis_dict:
        (wids, probs) = self.crosswikis_dict[surfacelnrm]
        for (c, p) in zip(wids, probs):
            if c == wid:
                cprobs[0] = p
            elif c in self.knwid2idx:
                candidates.append(self.knwid2idx[c])
                cprobs.append(p)

    candidates = candidates[0:self.numc]
    cprobs = cprobs[0:self.numc]
    assert len(candidates) == len(cprobs)

    remain = self.numc - len(candidates)
    candidates.extend([unk_idx] * remain)
    cprobs.extend([0.0] * remain)

    assert len(candidates) == self.numc
    assert len(cprobs) == self.numc
    assert candidates[0] == true_idx
    for cand_idx in candidates[1:]:
        # For an unknown wid the true slot is the unk index, which padding
        # legitimately repeats, so uniqueness is only checked for known wids.
        if wid_known:
            assert cand_idx != true_idx
        assert cand_idx < len(self.idx2knwid)

    return (candidates, cprobs)
def _next_batch(self, data_type):
    """Assemble one batch of ``self.batch_size`` known-entity mentions.

    Mentions are read with ``self._read_mention(data_type=...)``; those whose
    wid is not in ``self.knwid2idx`` are skipped. Per the original note the
    underlying data is tab-separated
    ``wikititle, mid, wid, start, end, tokens, labels`` with inclusive
    start/end.

    For a sentence ``s1 ... m1 ... mN ... sN`` the left context is
    ``s1 ... m1 ... mN`` and the right context is ``sN ... mN ... m1``
    (reversed); with ``self.strict_context`` the mention tokens are excluded
    from both.

    Because unknown wids are skipped up front, the entity description is
    always read from ``self.kwnwid2descvecs[m.wid]``; the earlier unknown-wid
    fallback branch could never run and was removed.

    Args:
        data_type: forwarded unchanged to ``self._read_mention``.

    Returns:
        Tuple ``(left_batch, right_batch, truewid_descvec_batch,
        labels_batch, coherence_batch, wid_idxs_batch, wid_cprobs_batch)``
        where ``labels_batch`` is a ``[batch_size, num_labels]`` 0/1 array
        and ``coherence_batch`` is ``(indices, values, shape)`` with
        ``[row, col]`` index pairs, values of ``1.0`` and shape
        ``[batch_size, num_cohstr]``.

    Raises:
        KeyError: if the mention's ``(normalized surface, wid)`` key is
            missing from ``self.test_cands_dict``.
    """
    left_batch = []
    right_batch = []
    # One row per mention, 1.0 in the column of every known type label.
    labels_batch = np.zeros([self.batch_size, self.num_labels])
    # Sparse [B, num_cohstr] matrix given as index pairs plus values.
    coh_indices = []
    coh_values = []
    coh_matshape = [self.batch_size, self.num_cohstr]
    truewid_descvec_batch = []
    # Candidate wid indices and priors; first element is the true wid.
    wid_idxs_batch = []
    wid_cprobs_batch = []

    while len(left_batch) < self.batch_size:
        batch_el = len(left_batch)
        m = self._read_mention(data_type=data_type)
        if m.wid not in self.knwid2idx:
            continue

        # Mention types
        for label in m.types:
            if label in self.label2idx:
                labels_batch[batch_el][self.label2idx[label]] = 1.0

        # Document context. One random draw per coherence string, taken
        # before the vocabulary test, keeps the RNG stream as before.
        cohidxs = []
        for cohstr in m.coherence:
            r = random.random()
            if cohstr in self.cohG92idx and r < self.cohDropoutKeep:
                cohidxs.append([batch_el, self.cohG92idx[cohstr]])
        if not cohidxs:
            # Nothing survived: mark the unknown coherence string instead.
            cohidxs.append([batch_el, self.cohG92idx[self.unk_word]])
        coh_indices.extend(cohidxs)
        coh_values.extend([1.0] * len(cohidxs))

        # Left and right context
        if self.strict_context:
            left_tokens = m.sent_tokens[0:m.start_token]
            right_tokens = m.sent_tokens[m.end_token + 1:][::-1]
        else:
            # Context inclusive of the mention surface
            left_tokens = m.sent_tokens[0:m.end_token + 1]
            right_tokens = m.sent_tokens[m.start_token:][::-1]

        # Word dropout
        left_tokens = self.wordDropout(left_tokens, self.wordDropoutKeep)
        right_tokens = self.wordDropout(right_tokens, self.wordDropoutKeep)

        left_batch.append(
            [self.convert_word2idx(word) for word in left_tokens])
        right_batch.append(
            [self.convert_word2idx(word) for word in right_tokens])

        # Entity description (wid is known to be in the known set here)
        truewid_descvec_batch.append(self.kwnwid2descvecs[m.wid])

        # Candidate wid indices and prior probabilities
        cands_dict_key = (utils._getLnrm(m.surface), m.wid)
        (wid_idxs, wid_cprobs) = self.test_cands_dict[cands_dict_key]
        wid_idxs_batch.append(wid_idxs)
        wid_cprobs_batch.append(wid_cprobs)

    coherence_batch = (coh_indices, coh_values, coh_matshape)
    return (left_batch, right_batch, truewid_descvec_batch, labels_batch,
            coherence_batch, wid_idxs_batch, wid_cprobs_batch)
def _next_batch(self, data_type):
    """Assemble one batch of ``self.batch_size`` mentions (wids may be cold).

    Mentions are read with ``self._read_mention(data_type=...)``. Per the
    original note the underlying data is tab-separated
    ``wikititle, mid, wid, start, end, tokens, labels`` with inclusive
    start/end.

    For a sentence ``s1 ... m1 ... mN ... sN`` the left context is
    ``s1 ... m1 ... mN`` and the right context is ``sN ... mN ... m1``
    (reversed); with ``self.strict_context`` the mention tokens are excluded
    from both. Context tokens are converted with ``self.convert_word2idx``
    unless ``self.pretrain_wordembed`` is set, in which case the raw tokens
    are kept.

    Because the first candidate of a cold entity corresponds to the unknown
    wid, the true wid of every mention is also returned in ``wids_batch`` so
    that callers can look up its description and types.

    Changes over the earlier version: the cache lookup now catches only
    ``KeyError`` instead of using a bare ``except``; unused locals and
    disabled code were removed.

    Args:
        data_type: forwarded unchanged to ``self._read_mention``.

    Returns:
        Tuple ``(left_batch, right_batch, wids_batch, labels_batch,
        coherence_batch, wid_idxs_batch, wid_cprobs_batch)`` where
        ``labels_batch`` is a ``[batch_size, num_labels]`` 0/1 array and
        ``coherence_batch`` is ``(indices, values, shape)``; the shape is
        ``[batch_size, num_cohstr]`` when ``self.coherence`` is set and
        ``[]`` otherwise.
    """
    left_batch = []
    right_batch = []
    # One row per mention, 1.0 in the column of every known type label.
    labels_batch = np.zeros([self.batch_size, self.num_labels])
    # Sparse [B, num_cohstr] matrix given as index pairs plus values.
    coh_indices = []
    coh_values = []
    if self.coherence:
        coh_matshape = [self.batch_size, self.num_cohstr]
    else:
        coh_matshape = []
    # Candidate wid indices and priors; first element is the true wid.
    wid_idxs_batch = []
    wid_cprobs_batch = []
    # True wid of each mention, passed explicitly for cold entities.
    wids_batch = []

    while len(left_batch) < self.batch_size:
        batch_el = len(left_batch)
        m = self._read_mention(data_type=data_type)
        wids_batch.append(m.wid)

        # Labels
        for label in m.types:
            if label in self.label2idx:
                labels_batch[batch_el][self.label2idx[label]] = 1.0

        # Coherence. One random draw per coherence string, taken before
        # the vocabulary test, keeps the RNG stream as before.
        if self.coherence:
            cohidxs = []  # indexes in the [B, NumCoh] matrix
            for cohstr in m.coherence:
                r = random.random()
                if cohstr in self.cohG92idx and r < self.cohDropoutKeep:
                    cohidxs.append([batch_el, self.cohG92idx[cohstr]])
            if not cohidxs:
                # No coherence string survived: add unk instead.
                cohidxs.append([batch_el, self.cohG92idx[self.unk_word]])
            coh_indices.extend(cohidxs)
            coh_values.extend([1.0] * len(cohidxs))

        # Left and right context
        if self.strict_context:
            left_tokens = m.sent_tokens[0:m.start_token]
            right_tokens = m.sent_tokens[m.end_token + 1:][::-1]
        else:
            # Context inclusive of the mention surface
            left_tokens = m.sent_tokens[0:m.end_token + 1]
            right_tokens = m.sent_tokens[m.start_token:][::-1]

        # Word dropout
        left_tokens = self.wordDropout(left_tokens, self.wordDropoutKeep)
        right_tokens = self.wordDropout(right_tokens, self.wordDropoutKeep)

        if not self.pretrain_wordembed:
            left_idxs = [self.convert_word2idx(word) for word in left_tokens]
            right_idxs = [
                self.convert_word2idx(word) for word in right_tokens
            ]
        else:
            left_idxs = left_tokens
            right_idxs = right_tokens
        left_batch.append(left_idxs)
        right_batch.append(right_idxs)

        # Candidate wid indices and prior probabilities
        cands_dict_key = (utils._getLnrm(m.surface), m.wid)
        try:
            (wid_idxs, wid_cprobs) = self.trval_cands_dict[cands_dict_key]
        except KeyError:
            # Mention not in the candidate cache: all-unknown candidates.
            wid_idxs = [self.knwid2idx[self.unk_wid]] * self.num_cands
            wid_cprobs = [0.0] * self.num_cands
        wid_idxs_batch.append(wid_idxs)
        wid_cprobs_batch.append(wid_cprobs)

    coherence_batch = (coh_indices, coh_values, coh_matshape)
    return (left_batch, right_batch, wids_batch, labels_batch,
            coherence_batch, wid_idxs_batch, wid_cprobs_batch)