Example #1
    def to_list_repr_both(self, entity, document):
        covered_tokens = document.cover_index[entity.uid]

        if len(covered_tokens) > self.maxlen:
            covered_tokens = covered_tokens[:self.maxlen]
            context_left = []
            context_right = []
        else:
            cl_len = int(math.floor((self.maxlen - len(covered_tokens)) / 2))
            cr_len = self.maxlen - len(covered_tokens) - cl_len
            context_left = document.tokens[max(0, covered_tokens[0].tid -
                                               cl_len):covered_tokens[0].tid]
            context_right = document.tokens[covered_tokens[-1].tid + 1:min(
                len(document.tokens), covered_tokens[-1].tid + 1 + cr_len)]
            # pad
            context_left = [reader.Token(-1, -1, "#BEGIN_OF_TEXT#")
                            ] * (cl_len - len(context_left)) + context_left
            context_right = context_right + [
                reader.Token(-1, -1, "#END_OF_TEXT#")
            ] * (cr_len - len(context_right))

        left = [t.string for t in context_left]
        right = [t.string for t in context_right]
        covered = [t.string for t in covered_tokens]
        words = left + covered + right

        words_indexes = []
        for w in words:
            if w.lower() in self.word_index:
                words_indexes.append(self.word_index[w.lower()])
            else:
                # unknown word: use -1 as the out-of-vocabulary index
                words_indexes.append(-1)
        if len(words_indexes) != self.maxlen:
            raise ValueError("Only %d words, but should be %d" %
                             (len(words_indexes), self.maxlen))
        return words_indexes
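
The arithmetic above splits whatever budget is left after the mention evenly between the left and right context, giving the extra slot to the right. A minimal standalone sketch of that split (hypothetical function name and toy numbers, not part of the original module):

import math

def split_context_budget(num_covered, maxlen):
    # floor half of the leftover budget goes to the left context,
    # the remainder to the right context
    leftover = maxlen - num_covered
    cl_len = int(math.floor(leftover / 2))
    cr_len = leftover - cl_len
    return cl_len, cr_len

# e.g. a 3-token mention with maxlen=10 gets 3 left and 4 right context slots
assert split_context_budget(3, 10) == (3, 4)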
Example #2
def segment_text(text):
    sentence_id = 0
    token_id = 0
    tail = text
    accumulator = 0
    sentences = list(SentenceSplitter().split(text))
    sentence_object_array = []
    for sentence in sentences:
        escaped_sentence = re.escape(sentence)
        sentence_occurrence = re.search(escaped_sentence, tail)
        s_start, s_end = sentence_occurrence.span()
        sentence_start = accumulator + s_start
        sentence_end = accumulator + s_end

        tokens = word_tokenize(sentence)
        token_object_array = []
        tail_for_token_search = sentence
        token_accumulator = 0
        for token in tokens:
            escaped_token = re.escape(token)
            token_occurrence = re.search(escaped_token, tail_for_token_search)
            t_start, t_end = token_occurrence.span()
            # global offsets
            token_start = sentence_start + token_accumulator + t_start
            token_end = sentence_start + token_accumulator + t_end
            token_accumulator += t_end

            token_object = reader.Token(token_start, token_end, utf8ify(token),
                                        token_id)
            token_object_array.append(token_object)
            # keep searching in the rest
            tail_for_token_search = tail_for_token_search[t_end:]
            token_id += 1

        sentence_object = reader.Sentence(sentence_start,
                                          sentence_end, token_object_array,
                                          utf8ify(sentence), sentence_id)
        sentence_object_array.append(sentence_object)
        for tok in sentence_object.token_array:
            tok.sentence = sentence_object

        accumulator += s_end
        # keep rest of text for searching
        tail = tail[s_end:]
        sentence_id += 1

    return sentence_object_array
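
A hedged usage sketch of the offset bookkeeping above. Token here is a hypothetical stand-in for reader.Token, and a plain whitespace split replaces SentenceSplitter / word_tokenize / utf8ify so the example stays self-contained; the point is only that the accumulator-based global offsets slice each token back out of the original text.

import re
from dataclasses import dataclass

@dataclass
class Token:
    # hypothetical stand-in for reader.Token: global character offsets,
    # the surface string, and a running token id
    start: int
    end: int
    string: str
    tid: int = -1

def toy_segment(text):
    # same accumulator idea as segment_text above
    tokens, offset, tail = [], 0, text
    for tid, word in enumerate(text.split()):
        s, e = re.search(re.escape(word), tail).span()
        tokens.append(Token(offset + s, offset + e, word, tid))
        offset += e
        tail = tail[e:]
    return tokens

text = "Offsets are global. They survive slicing."
for tok in toy_segment(text):
    assert text[tok.start:tok.end] == tok.string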
Example #3
    def to_list_repr(self, entity, document):
        covered_tokens = document.cover_index[entity.uid]
        words = document.tokens[covered_tokens[0].tid:][:self.maxlen]
        words = words + [reader.Token(-1, -1, "#END_OF_TEXT#")
                         ] * (self.maxlen - len(words))

        words = [t.string for t in words]

        words_indexes = []
        for w in words:
            if w.lower() in self.word_index:
                words_indexes.append(self.word_index[w.lower()])
            else:
                # unknown word: use -1 as the out-of-vocabulary index
                words_indexes.append(-1)
        if len(words_indexes) != self.maxlen:
            raise ValueError("Only %d words, but should be %d" %
                             (len(words_indexes), self.maxlen))
        return words_indexes
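
Compared to Example #1, this variant only pads on the right. A minimal sketch (hypothetical helper name) of the fixed-length behaviour: take at most maxlen items starting at the mention, then right-pad with an end-of-text marker.

def fixed_length_window(items, maxlen, pad="#END_OF_TEXT#"):
    window = items[:maxlen]
    return window + [pad] * (maxlen - len(window))

assert fixed_length_window(["a", "b"], 4) == ["a", "b", "#END_OF_TEXT#", "#END_OF_TEXT#"]
assert fixed_length_window(["a", "b", "c", "d", "e"], 4) == ["a", "b", "c", "d"]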
Example #4
    def to_repr(self, entity, document):
        covered_tokens = document.cover_index[entity.uid]
        domain = document.domain

        if self.sentence_boundaries:
            span = covered_tokens[0].sentence
            first_token = (span.token_array[0]).tid
            last_token = (span.token_array[-1]).tid
        else:
            span = reader.Token(document.tokens[0].start,
                                document.tokens[-1].end, "")
            first_token = 0
            last_token = len(document.tokens) - 1

        left_min_index = max(first_token, covered_tokens[0].tid - self.window)
        left_max_index = covered_tokens[0].tid
        if left_max_index <= left_min_index:
            context_left = []
        else:
            context_left = document.tokens[left_min_index:left_max_index]

        right_min_index = covered_tokens[-1].tid + 1
        right_max_index = min(last_token,
                              covered_tokens[-1].tid + self.window + 1)
        if right_min_index >= right_max_index:
            context_right = []
        else:
            context_right = document.tokens[right_min_index:right_max_index]

        cl = len(context_left)
        cr = len(context_right)
        K = self.vsm.dim
        context_left = [
            reader.Token(span.start - 1, span.start - 1, "#BEGIN_OF_SENTENCE#")
        ] * (self.window - cl) + context_left
        context_right = context_right + [
            reader.Token(span.end + 1, span.end + 1, "#END_OF_SENTENCE#")
        ] * (self.window - cr)

        # average embedding as representation (note: this value is
        # overwritten by the concatenated representation built below)
        covered_emb = np.mean(
            [self.vsm.get(t.string, domain) for t in covered_tokens], axis=0)
        # take concatenated embedding as representation
        # keep only the first m tokens: improve upon this
        m = 4
        if len(covered_tokens) > m:
            # simple heuristic: drop short words until at most m tokens remain
            # (iterate over a copy so that removing items does not skip any)
            for t in list(covered_tokens):
                if len(t.string) <= 3:
                    covered_tokens.remove(t)
                    if len(covered_tokens) <= m:
                        break
        #  covered_tokens = [t for t in covered_tokens if len(t.string) > 3]
        my_center = np.concatenate(
            [self.vsm.get(t.string, domain) for t in covered_tokens])
        covered_emb = sequence.pad_sequences([my_center],
                                             m * K,
                                             truncating="post",
                                             dtype="float32")[0]
        context_left_emb = np.concatenate(
            [self.vsm.get(t.string, domain) for t in context_left])
        context_right_emb = np.concatenate(
            [self.vsm.get(t.string, domain) for t in context_right])
        # debug output: inspect the assembled left / covered / right windows
        print([t.string for t in context_left],
              [t.string for t in covered_tokens],
              [t.string for t in context_right])

        return np.concatenate(
            (context_left_emb, covered_emb, context_right_emb), axis=0)
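
A sketch of the vector layout that the concatenation above produces, assuming a toy window, embedding dimension, and a random stand-in for self.vsm.get (all hypothetical): window * K left dims, then m * K mention dims padded/truncated to a fixed size, then window * K right dims.

import numpy as np

K, window, m = 8, 3, 4
rng = np.random.default_rng(0)

def lookup(word):
    # stand-in for self.vsm.get(word, domain): one fixed-size vector per word
    return rng.standard_normal(K)

left = ["the", "angry", "red"]
mention = ["dragon"]
right = ["breathed", "fire", "."]

mention_emb = np.concatenate([lookup(w) for w in mention])
# pad at the front to m * K values (pad_sequences pads 'pre' by default)
mention_emb = np.pad(mention_emb, (m * K - mention_emb.size, 0))
vec = np.concatenate([np.concatenate([lookup(w) for w in left]),
                      mention_emb,
                      np.concatenate([lookup(w) for w in right])])
assert vec.shape == ((2 * window + m) * K,)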
Example #5
    def to_repr(self, entity, document):
        covered_tokens = document.cover_index[entity.uid]
        if self.sentence_boundaries:
            span = covered_tokens[0].sentence
            first_token = (span.token_array[0]).tid
            last_token = (span.token_array[-1]).tid
        else:
            span = reader.Token(document.tokens[0].start,
                                document.tokens[-1].end, "")
            first_token = 0
            last_token = len(document.tokens) - 1

        left_min_index = max(first_token, covered_tokens[0].tid - self.window)
        left_max_index = covered_tokens[0].tid
        if left_max_index <= left_min_index:
            context_left = []
        else:
            context_left = document.tokens[left_min_index:left_max_index]

        right_min_index = covered_tokens[-1].tid + 1
        right_max_index = min(last_token,
                              covered_tokens[-1].tid + self.window + 1)
        if right_min_index >= right_max_index:
            context_right = []
        else:
            context_right = document.tokens[right_min_index:right_max_index]

        cl = len(context_left)
        cr = len(context_right)
        K = 100  # note: not used in this character-level variant
        context_left = [reader.Token(span.start - 1, span.start - 1, "")
                        ] * (self.window - cl) + context_left
        context_right = context_right + [
            reader.Token(span.end + 1, span.end + 1, "")
        ] * (self.window - cr)

        token_representation_covered = " ".join(
            [t.string for t in covered_tokens])
        token_representation_left = " ".join([t.string for t in context_left])
        token_representation_right = " ".join(
            [t.string for t in context_right])
        my_repr = []
        # note: the character-to-index mapping is built on the fly here;
        # for the final submission it should be precomputed offline
        for x in token_representation_covered:
            if x not in self.globalHash:
                self.globalHash[x] = self.curVal
                self.curVal += 1
            my_repr.append(self.globalHash[x])
        my_repr_left = []
        for x in token_representation_left:
            if x not in self.globalHash:
                self.globalHash[x] = self.curVal
                self.curVal += 1
            my_repr_left.append(self.globalHash[x])
        my_repr_right = []
        for x in token_representation_right:
            if x not in self.globalHash:
                self.globalHash[x] = self.curVal
                self.curVal += 1
            my_repr_right.append(self.globalHash[x])
        #print("%s\t%s\t%s"%(token_representation_left, token_representation_covered, token_representation_right))
        my_repr = list(sequence.pad_sequences([my_repr], self.M)[0])
        my_repr_left = list(sequence.pad_sequences([my_repr_left], self.L)[0])
        my_repr_right = list(
            sequence.pad_sequences([my_repr_right], self.R)[0])

        return my_repr_left + my_repr + my_repr_right
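
A minimal sketch (hypothetical names, not the module's API) of the on-the-fly character indexing and fixed-length padding used above: unseen characters get the next free id, and each of the three segments is padded to its own fixed size before concatenation.

def chars_to_ids(text, char_index):
    # unseen characters get the next free id; ids start at 1 so that the
    # padding value 0 stays distinct
    ids = []
    for ch in text:
        if ch not in char_index:
            char_index[ch] = len(char_index) + 1
        ids.append(char_index[ch])
    return ids

def pad_front(ids, length):
    # mimic pad_sequences' defaults: truncate and pad on the left
    ids = ids[-length:]
    return [0] * (length - len(ids)) + ids

char_index = {}
left = pad_front(chars_to_ids("red ", char_index), 8)
covered = pad_front(chars_to_ids("dragon", char_index), 10)
right = pad_front(chars_to_ids(" fire", char_index), 8)
features = left + covered + right  # analogous to my_repr_left + my_repr + my_repr_right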