def to_list_repr_both(self, entity, document):
    covered_tokens = document.cover_index[entity.uid]
    if len(covered_tokens) > self.maxlen:
        # mention is longer than the window: truncate it and drop the context
        covered_tokens = covered_tokens[:self.maxlen]
        context_left = []
        context_right = []
    else:
        # split the remaining budget between left and right context
        cl_len = (self.maxlen - len(covered_tokens)) // 2
        cr_len = self.maxlen - len(covered_tokens) - cl_len
        context_left = document.tokens[max(0, covered_tokens[0].tid - cl_len):covered_tokens[0].tid]
        context_right = document.tokens[covered_tokens[-1].tid + 1:min(
            len(document.tokens), covered_tokens[-1].tid + 1 + cr_len)]
        # pad
        context_left = [reader.Token(-1, -1, "#BEGIN_OF_TEXT#")
                        ] * (cl_len - len(context_left)) + context_left
        context_right = context_right + [
            reader.Token(-1, -1, "#END_OF_TEXT#")
        ] * (cr_len - len(context_right))

    left = [t.string for t in context_left]
    right = [t.string for t in context_right]
    covered = [t.string for t in covered_tokens]
    words = left + covered + right

    words_indexes = []
    for w in words:
        if w.lower() in self.word_index:
            words_indexes.append(self.word_index[w.lower()])
        else:
            words_indexes.append(-1)  # out-of-vocabulary index
    if len(words_indexes) != self.maxlen:
        raise ValueError("Only %d words, but should be %d" %
                         (len(words_indexes), self.maxlen))
    return words_indexes
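
# The context-budget split above can be checked in isolation. The following
# standalone sketch reproduces the cl_len/cr_len arithmetic and the padding
# lengths without any of the reader/document machinery; maxlen, the mention
# length and the available context sizes are made-up values for illustration.
maxlen = 10
n_covered = 3          # tokens covered by the mention
n_left_avail = 2       # tokens actually available to the left
n_right_avail = 50     # tokens actually available to the right

cl_len = (maxlen - n_covered) // 2          # left budget: 3
cr_len = maxlen - n_covered - cl_len        # right budget: 4

left = min(cl_len, n_left_avail)            # 2 real tokens, 1 #BEGIN_OF_TEXT# pad
right = min(cr_len, n_right_avail)          # 4 real tokens, no pad

total = (cl_len - left) + left + n_covered + right + (cr_len - right)
assert total == maxlen                      # the representation is always maxlen long
print(cl_len, cr_len, total)                # -> 3 4 10
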
def segment_text(text):
    sentence_id = 0
    token_id = 0
    tail = text
    accumulator = 0
    sentences = [sentence for sentence in SentenceSplitter().split(text)]
    sentence_object_array = []
    for sentence in sentences:
        escaped_sentence = re.escape(sentence)
        sentence_occurrence = re.search(escaped_sentence, tail)
        s_start, s_end = sentence_occurrence.span()
        sentence_start = accumulator + s_start
        sentence_end = accumulator + s_end

        tokens = [word for word in word_tokenize(sentence)]
        token_object_array = []
        tail_for_token_search = sentence
        token_accumulator = 0
        for token in tokens:
            escaped_token = re.escape(token)
            token_occurrence = re.search(escaped_token, tail_for_token_search)
            t_start, t_end = token_occurrence.span()
            # global offsets
            token_start = sentence_start + token_accumulator + t_start
            token_end = sentence_start + token_accumulator + t_end
            token_accumulator += t_end
            token_object = reader.Token(token_start, token_end,
                                        utf8ify(token), token_id)
            token_object_array.append(token_object)
            # keep searching in the rest
            tail_for_token_search = tail_for_token_search[t_end:]
            token_id += 1

        sentence_object = reader.Sentence(sentence_start, sentence_end,
                                          token_object_array,
                                          utf8ify(sentence), sentence_id)
        sentence_object_array.append(sentence_object)
        for tok in sentence_object.token_array:
            tok.sentence = sentence_object
        accumulator += s_end
        # keep rest of text for searching
        tail = tail[s_end:]
        sentence_id += 1
    return sentence_object_array
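
# segment_text recovers character offsets by re-searching each sentence and
# token in a tail that shrinks as text is consumed, so repeated tokens never
# match an earlier occurrence. Below is a minimal, self-contained sketch of
# that technique only; it uses a plain whitespace split as a stand-in for
# SentenceSplitter/word_tokenize, which are assumed to come from elsewhere
# in the module.
import re

def toy_offsets(text):
    """Return (token, start, end) triples with global character offsets,
    using the same shrinking-tail search as segment_text."""
    tail = text
    consumed = 0
    spans = []
    for token in text.split():             # stand-in for word_tokenize
        m = re.search(re.escape(token), tail)
        start, end = m.span()
        spans.append((token, consumed + start, consumed + end))
        consumed += end                     # total characters consumed so far
        tail = tail[end:]                   # never match an earlier occurrence again
    return spans

print(toy_offsets("the cat saw the cat"))
# [('the', 0, 3), ('cat', 4, 7), ('saw', 8, 11), ('the', 12, 15), ('cat', 16, 19)]
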
def to_list_repr(self, entity, document):
    covered_tokens = document.cover_index[entity.uid]
    # take maxlen tokens starting at the first covered token, then pad on the right
    words = document.tokens[covered_tokens[0].tid:][:self.maxlen]
    words = words + [reader.Token(-1, -1, "#END_OF_TEXT#")
                     ] * (self.maxlen - len(words))
    words = [t.string for t in words]

    words_indexes = []
    for w in words:
        if w.lower() in self.word_index:
            words_indexes.append(self.word_index[w.lower()])
        else:
            words_indexes.append(-1)  # out-of-vocabulary index
    if len(words_indexes) != self.maxlen:
        raise ValueError("Only %d words, but should be %d" %
                         (len(words_indexes), self.maxlen))
    return words_indexes
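
# The lookup used above can be exercised on plain strings. This is a
# self-contained sketch of the lowercase lookup, the -1 out-of-vocabulary
# fallback and the #END_OF_TEXT# padding; word_index and maxlen here are
# made-up stand-ins for self.word_index and self.maxlen.
word_index = {"the": 0, "european": 1, "union": 2, "#end_of_text#": 3}
maxlen = 6

def to_indexes(tokens):
    # right-pad with the end-of-text marker, then map each word (lowercased)
    # to its index, falling back to -1 for out-of-vocabulary words
    padded = tokens[:maxlen] + ["#END_OF_TEXT#"] * (maxlen - len(tokens))
    return [word_index.get(w.lower(), -1) for w in padded]

print(to_indexes(["The", "European", "Union", "headquarters"]))
# -> [0, 1, 2, -1, 3, 3]
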
def to_repr(self, entity, document):
    covered_tokens = document.cover_index[entity.uid]
    domain = document.domain
    if self.sentence_boundaries:
        span = covered_tokens[0].sentence
        first_token = span.token_array[0].tid
        last_token = span.token_array[-1].tid
    else:
        span = reader.Token(document.tokens[0].start,
                            document.tokens[-1].end, "")
        first_token = 0
        last_token = len(document.tokens) - 1

    left_min_index = max(first_token, covered_tokens[0].tid - self.window)
    left_max_index = covered_tokens[0].tid
    if left_max_index <= left_min_index:
        context_left = []
    else:
        context_left = document.tokens[left_min_index:left_max_index]

    right_min_index = covered_tokens[-1].tid + 1
    right_max_index = min(last_token, covered_tokens[-1].tid + self.window + 1)
    if right_min_index >= right_max_index:
        context_right = []
    else:
        context_right = document.tokens[right_min_index:right_max_index]

    cl = len(context_left)
    cr = len(context_right)
    K = self.vsm.dim
    context_left = [
        reader.Token(span.start - 1, span.start - 1, "#BEGIN_OF_SENTENCE#")
    ] * (self.window - cl) + context_left
    context_right = context_right + [
        reader.Token(span.end + 1, span.end + 1, "#END_OF_SENTENCE#")
    ] * (self.window - cr)

    # take average embedding as representation
    covered_emb = np.mean(
        [self.vsm.get(t.string, domain) for t in covered_tokens], axis=0)

    # take concatenated embedding as representation
    # keep only the first m tokens: improve upon this
    m = 4
    if len(covered_tokens) > m:
        # simple heuristic: kick out short words
        # iterate over a copy so removals do not skip elements
        for t in list(covered_tokens):
            if len(t.string) <= 3:
                covered_tokens.remove(t)
            if len(covered_tokens) <= m:
                break
        # covered_tokens = filter(lambda t: len(t.string) > 3, covered_tokens)
    my_center = np.concatenate(
        [self.vsm.get(t.string, domain) for t in covered_tokens])
    covered_emb = sequence.pad_sequences([my_center], m * K,
                                         truncating="post",
                                         dtype="float32")[0]

    context_left_emb = np.concatenate(
        [self.vsm.get(t.string, domain) for t in context_left])
    context_right_emb = np.concatenate(
        [self.vsm.get(t.string, domain) for t in context_right])

    # check if it is alright
    print([t.string for t in context_left],
          [t.string for t in covered_tokens],
          [t.string for t in context_right])

    return np.concatenate(
        (context_left_emb, covered_emb, context_right_emb), axis=0)
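
# The pad_sequences call above forces the concatenated mention embedding to
# exactly m*K floats: truncating="post" keeps the first m*K values, and the
# default padding="pre" zero-pads at the front. The numpy-only sketch below
# restates that behaviour; K, m and the random "embeddings" are made-up
# values for illustration, not taken from the source.
import numpy as np

K, m = 5, 4
rng = np.random.default_rng(0)
mention_embs = [rng.standard_normal(K) for _ in range(6)]  # a 6-token mention

def fixed_center(embs, m, K):
    """Concatenate token embeddings and force the result to exactly m*K floats:
    truncate at the end, zero-pad at the front."""
    flat = np.concatenate(embs)[:m * K]            # truncate "post"
    out = np.zeros(m * K, dtype="float32")
    out[m * K - len(flat):] = flat                 # pad "pre" with zeros
    return out

print(fixed_center(mention_embs, m, K).shape)      # (20,)
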
def to_repr(self, entity, document):
    covered_tokens = document.cover_index[entity.uid]
    if self.sentence_boundaries:
        span = covered_tokens[0].sentence
        first_token = span.token_array[0].tid
        last_token = span.token_array[-1].tid
    else:
        span = reader.Token(document.tokens[0].start,
                            document.tokens[-1].end, "")
        first_token = 0
        last_token = len(document.tokens) - 1

    left_min_index = max(first_token, covered_tokens[0].tid - self.window)
    left_max_index = covered_tokens[0].tid
    if left_max_index <= left_min_index:
        context_left = []
    else:
        context_left = document.tokens[left_min_index:left_max_index]

    right_min_index = covered_tokens[-1].tid + 1
    right_max_index = min(last_token, covered_tokens[-1].tid + self.window + 1)
    if right_min_index >= right_max_index:
        context_right = []
    else:
        context_right = document.tokens[right_min_index:right_max_index]

    cl = len(context_left)
    cr = len(context_right)
    K = 100  # unused here
    context_left = [reader.Token(span.start - 1, span.start - 1, "")
                    ] * (self.window - cl) + context_left
    context_right = context_right + [
        reader.Token(span.end + 1, span.end + 1, "")
    ] * (self.window - cr)

    token_representation_covered = " ".join(t.string for t in covered_tokens)
    token_representation_left = " ".join(t.string for t in context_left)
    token_representation_right = " ".join(t.string for t in context_right)

    # TODO: build the character-to-index mapping offline instead of growing it here
    my_repr = []
    for x in token_representation_covered:
        if x not in self.globalHash:
            self.globalHash[x] = self.curVal
            self.curVal += 1
        my_repr.append(self.globalHash[x])

    my_repr_left = []
    for x in token_representation_left:
        if x not in self.globalHash:
            self.globalHash[x] = self.curVal
            self.curVal += 1
        my_repr_left.append(self.globalHash[x])

    my_repr_right = []
    for x in token_representation_right:
        if x not in self.globalHash:
            self.globalHash[x] = self.curVal
            self.curVal += 1
        my_repr_right.append(self.globalHash[x])

    # print("%s\t%s\t%s" % (token_representation_left,
    #                       token_representation_covered,
    #                       token_representation_right))
    my_repr = list(sequence.pad_sequences([my_repr], self.M)[0])
    my_repr_left = list(sequence.pad_sequences([my_repr_left], self.L)[0])
    my_repr_right = list(sequence.pad_sequences([my_repr_right], self.R)[0])
    return my_repr_left + my_repr + my_repr_right
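
# A self-contained sketch of the on-the-fly character-to-index hashing used
# above; global_hash, cur_val and the fixed length are made-up stand-ins for
# self.globalHash, self.curVal and self.M. Starting cur_val at 1 (so 0 stays
# free as the padding value) is an assumption, not confirmed by the source.
# The padding emulates pad_sequences defaults: left-pad with zeros and keep
# the last `length` characters.
global_hash = {}
cur_val = 1

def chars_to_indexes(text, length):
    global cur_val
    ids = []
    for ch in text:
        if ch not in global_hash:
            global_hash[ch] = cur_val
            cur_val += 1
        ids.append(global_hash[ch])
    ids = ids[-length:]                    # truncate "pre": keep the end
    return [0] * (length - len(ids)) + ids # pad "pre" with zeros

print(chars_to_indexes("Bonn", 6))         # -> [0, 0, 1, 2, 3, 3]
print(chars_to_indexes("Barcelona", 6))
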