Example #1
    def tokenize(self, text: Text) -> List[Token]:
        import mitie

        # MITIE operates on UTF-8 bytes, so encode before tokenizing.
        encoded_sentence = text.encode('utf-8')
        # Each entry in `tokenized` is a (byte_token, byte_offset) pair.
        tokenized = mitie.tokenize_with_offsets(encoded_sentence)
        tokens = [self._token_from_offset(token, offset, encoded_sentence)
                  for token, offset in tokenized]
        return tokens
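
The method delegates to a `_token_from_offset` helper that the excerpt does not include. A minimal sketch of what it plausibly does, decoding MITIE's byte tokens and converting byte offsets to character offsets (the body is an assumption, not part of the excerpt):

    def _token_from_offset(self, text: bytes, offset: int,
                           encoded_sentence: bytes) -> Token:
        # MITIE returns byte-string tokens with byte offsets; Token needs
        # the decoded text plus a character offset.
        return Token(text.decode('utf-8'),
                     self._byte_to_char_offset(encoded_sentence, offset))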
Example #2
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import mitie

        _text = text.encode('utf-8')
        tokenized = mitie.tokenize_with_offsets(_text)
        tokens = [Token(token.decode('utf-8'), self._byte_to_char_offset(_text, offset)) for token, offset in tokenized]
        return tokens
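
The `_byte_to_char_offset` helper is not shown either. The usual implementation decodes the byte prefix and counts its characters; a one-line sketch (an assumption), using the same Python 2 style type comments as the excerpt:

    @staticmethod
    def _byte_to_char_offset(text, byte_offset):
        # type: (bytes, int) -> int
        # Characters before the token = length of the decoded byte prefix.
        return len(text[:byte_offset].decode('utf-8'))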
Example #3
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import mitie

        _text = text.encode('utf-8')
        tokenized = mitie.tokenize_with_offsets(_text)
        tokens = [
            Token(token.decode('utf-8'),
                  self._byte_to_char_offset(_text, offset))
            for token, offset in tokenized
        ]
        return tokens
Example #4
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import mitie

        text = message.get(attribute)

        encoded_sentence = text.encode(DEFAULT_ENCODING)
        tokenized = mitie.tokenize_with_offsets(encoded_sentence)
        tokens = [
            self._token_from_offset(token, offset, encoded_sentence)
            for token, offset in tokenized
        ]

        return self._apply_token_pattern(tokens)
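
This variant reads the text from a Rasa `Message` and post-filters the result with `_apply_token_pattern`, which splits tokens further by a configured regex pattern. A hedged usage sketch; the import paths and the `TEXT` constant follow recent Rasa versions and may differ in yours:

    from rasa.shared.nlu.constants import TEXT
    from rasa.shared.nlu.training_data.message import Message

    # `tokenizer` stands in for a configured MitieTokenizer instance.
    message = Message(data={TEXT: "show me restaurants in Berlin"})
    tokens = tokenizer.tokenize(message, TEXT)
    print([t.text for t in tokens])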
Example #5
    def tokenize(self,
                 text: Text,
                 attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:
        import mitie

        encoded_sentence = text.encode(DEFAULT_ENCODING)
        tokenized = mitie.tokenize_with_offsets(encoded_sentence)
        tokens = [
            self._token_from_offset(token, offset, encoded_sentence)
            for token, offset in tokenized
        ]

        self.add_cls_token(tokens, attribute)

        return tokens
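
Here the tokenizer also appends a `__CLS__` placeholder token, which older Rasa versions used to anchor sentence-level features. A sketch of what `add_cls_token` plausibly does (an assumption modeled on Rasa 1.x; the Token constructor matches the earlier examples):

    def add_cls_token(self, tokens: List[Token],
                      attribute: Text) -> List[Token]:
        # Append a trailing __CLS__ token after the last real token so
        # sentence-level features have a position to attach to.
        if attribute == TEXT_ATTRIBUTE and tokens:
            cls_start = tokens[-1].offset + len(tokens[-1].text)
            tokens.append(Token("__CLS__", cls_start))
        return tokens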
Example #6
    def entities(extracted_text, lang):
        # MITIE gets raw ASCII bytes here: strip non-ASCII characters and
        # replace the 11-character "[:newline:]" marker with 11 spaces so
        # byte offsets stay stable.
        extracted_text = re.sub(r'[^\x00-\x7F]', ' ', extracted_text)
        extracted_text = extracted_text.replace("[:newline:]", "           ")
        extracted_text = extracted_text.encode("ascii")
        tokens = tokenize_with_offsets(extracted_text)

        entities_markup = ner_models[lang].extract_entities(tokens)
        # Each markup entry is (token_range, tag, score). Build
        # (tag, entity_text, offsets, score) tuples, decoding the byte
        # tokens and keeping the score numeric so the threshold
        # comparisons below work.
        results = [(tag,
                    " ".join(tokens[i][0].decode("ascii") for i in rng),
                    ",".join(str(tokens[i][1]) for i in rng),
                    score)
                   for rng, tag, score in entities_markup]

        entity_doc = {
            "entity_all": [],
            "entity_location": [],
            "entity_organization": [],
            "entity_person": [],
            "entity_misc": [],
        }

        for tag, entity, rng, score in results:
            # Skip implausibly long matches.
            if len(entity) > 30:
                continue

            entity_doc["entity_all"].append(entity)

            # Per-tag confidence thresholds.
            if tag == 'LOCATION' and score > 0.3:
                entity_doc["entity_location"].append(entity)
            elif tag == 'ORGANIZATION' and score > 0.5:
                entity_doc["entity_organization"].append(entity)
            elif tag == 'PERSON' and score > 0.3:
                entity_doc["entity_person"].append(entity)
            elif score > 0.5:
                entity_doc["entity_misc"].append(entity)
        return entity_doc
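
For this function to run it needs `re`, MITIE's `tokenize_with_offsets`, and a `ner_models` dict mapping language codes to loaded NER models. A setup and usage sketch (the model path is illustrative):

    import re
    from mitie import named_entity_extractor, tokenize_with_offsets

    # One MITIE NER model per supported language.
    ner_models = {
        "en": named_entity_extractor("MITIE-models/english/ner_model.dat"),
    }

    doc = entities("Angela Merkel met Barack Obama in Berlin.", "en")
    print(doc["entity_person"], doc["entity_location"])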