Example #1
    def segment_text(self, text):
        """Segment the provided text into sentences."""
        self.tokenizer.setText(text)
        is_another = True
        sentences = []
        while is_another:
            u_sentence = Sentence()
            is_another = self.tokenizer.nextSentence(u_sentence)
            if is_another:
                sentences.append(u_sentence.getText())
        return sentences
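A minimal, self-contained sketch of the same segmentation loop. The model path below is a placeholder, not part of the example; the tokenizer attribute used above would typically be created this way:

from ufal.udpipe import Model, ProcessingError, Sentence

model = Model.load("path/to/model.udpipe")  # hypothetical path to a UDPipe model file
if model is None:
    raise RuntimeError("Could not load the UDPipe model")
tokenizer = model.newTokenizer(Model.DEFAULT)
error = ProcessingError()

tokenizer.setText("First sentence. Second sentence.")
sentences = []
sentence = Sentence()
while tokenizer.nextSentence(sentence, error):
    sentences.append(sentence.getText())
    sentence = Sentence()
# each element of sentences now holds the raw text of one segmented sentence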
Example #2
    def _runApp(self, dto, opNotDone):
        text = dto.getText()

        tokenizer = self._model.newTokenizer(self._model.DEFAULT)
        tokenizer.setText(text)
        error = ProcessingError()
        sentence = Sentence()
        sid = 0

        while tokenizer.nextSentence(sentence, error):
            self._model.tag(sentence, self._model.DEFAULT)
            self._model.parse(sentence, self._model.DEFAULT)
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = sentence.getText()

            for w in sentence.words:
                if w.id == 0:
                    continue

                tt = TeproTok()

                tt.setId(w.id)
                tt.setWordForm(w.form)
                tt.setCTAG(w.upostag)
                tt.setMSD(w.xpostag)
                tt.setLemma(w.lemma)
                tt.setHead(w.head)
                tt.setDepRel(w.deprel)

                ttsent.append(tt)
            # end for w

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only TTL
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            sentence = Sentence()
            sid += 1
        # end all split sentences.

        return dto
Example #3
def parse(text, sentence_id):
    """Takes a sentence in raw text and produces
	its CoNLL-U annotation by invoking udpipe

	Paratemeters: text - the sentence to be parsed
				  sentence_id - the ID of the sentence

	Output: a UD graph
	"""
    model = Model.load('./models/udpipe/english-ewt-ud-2.3-181115.udpipe')

    tokenizer = model.newTokenizer(model.TOKENIZER_PRESEGMENTED)
    # tokenizer = model.TOKENIZER_PRESEGMENTED(model.DEFAULT)

    conlluOutput = OutputFormat.newOutputFormat("conllu")

    sentence = Sentence()

    error = ProcessingError()

    tokenizer.setText(text)

    tokenizer.nextSentence(sentence, error)

    model.tag(sentence, model.DEFAULT)

    model.parse(sentence, model.DEFAULT)

    return conlluOutput.writeSentence(sentence).replace(
        '# sent_id = 1', '# sent_id = ' + sentence_id)
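A possible call to the function above; the sentence text and id are purely illustrative (the model path is hard-coded inside parse()):

conllu = parse("The quick brown fox jumps over the lazy dog.", "42")
print(conllu)  # one CoNLL-U block whose sent_id comment has been rewritten to 42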
Example #4

    def transform(self, string: str):
        self.tokenizer.setText(string)

        sentences = []
        sentence = Sentence()
        while self.tokenizer.nextSentence(sentence, self.error):
            self.model.tag(sentence, self.model.DEFAULT)
            sentences.append(sentence)
            sentence = Sentence()

        if self.error.occurred():
            raise Exception(self.error.message)

        words = functools.reduce(lambda x, y: x + y, [[(w.lemma, w.upostag)
                                                       for w in s.words]
                                                      for s in sentences], [])
        return [word for word in words if word[1] not in ['<root>']]
Example #5
    def __call__(self, text):
        """Convert input text to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        udpipe_sents = self.model(text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self.get_tokens_with_heads(udpipe_sents)
        if not tokens:
            return Doc(self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upostag or ""))
            # CoNLL xpostags, custom for each UD treebank
            #tags.append(self.vocab.strings.add(token.xpostag or ""))
            tags.append(self.vocab.strings.add(token.feats or ""))
            deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        # Overwrite lemmas separately to prevent overwriting by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc
Example #6
    def tokenize_tag_parse_tree(self, root):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`."""
        if root.children:
            raise ValueError(
                'Tree already contained nodes before tokenization')

        # tokenization (I cannot turn off the segmenter, so I need to join the segments)
        self.tokenizer.setText(root.text)
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        u_words = u_sentence.words
        n_words = u_words.size() - 1
        if is_another:
            u_sent_cont = Sentence()
            while self.tokenizer.nextSentence(u_sent_cont):
                n_cont = u_sent_cont.words.size() - 1
                for i in range(1, n_cont + 1):
                    u_w = u_sent_cont.words[i]
                    n_words += 1
                    u_w.id = n_words
                    u_words.append(u_w)

        # tagging and parsing
        self.tool.tag(u_sentence, Model.DEFAULT)
        self.tool.parse(u_sentence, Model.DEFAULT)

        # converting UDPipe nodes to Udapi nodes
        heads, nodes = [], [root]
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = root.create_child(
                form=u_w.form,
                lemma=u_w.lemma,
                upos=u_w.upostag,
                xpos=u_w.xpostag,
                feats=u_w.feats,
                deprel=u_w.deprel,
            )
            node.misc = u_w.misc
            heads.append(u_w.head)
            nodes.append(node)
        for node in nodes[1:]:
            head = heads.pop(0)
            node.parent = nodes[head]
Example #7
    def _read(self, text: str, input_format: str) -> List[Sentence]:
        """Convert the text to an UDPipe representation.

        text: Input text.
        input_format: Desired input format.
        RETURNS: Processed sentences.
        """
        input_format.setText(text)
        error = ProcessingError()
        sentences = []

        sentence = Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences
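The input_format argument is any ufal.udpipe InputFormat, for instance a tokenizer created from a loaded model as in the sketch after Example #1. The reader variable below stands for whatever object owns _read and is an assumption:

tokenizer = model.newTokenizer(Model.DEFAULT)  # model loaded as in the earlier sketch
sentences = reader._read(text="Some raw text. A second sentence.", input_format=tokenizer)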
Example #8
    def _read(self, text, input_format):
        """Convert the text to a UDPipe representation.

        text (unicode): Input text.
        input_format (unicode): Desired input format.
        RETURNS (list): Processed ufal.udpipe.Sentence-s.
        """
        input_format.setText(text)
        error = ProcessingError()
        sentences = []

        sentence = Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences
Example #9
    def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
        """Tokenize each line from stored lines and store tokens in sentence.

        sentence: UDPipe container for storing tokens.
        """
        try:
            line = next(self.lines)
        except StopIteration:
            return False
        tokens = line.split("\t")
        prev_word = Word()
        for token in tokens:
            word = sentence.addWord(token)
            if re.match(r"\W", token):
                # leave no space after previous token iff current token
                # is non-alphanumeric (i.e. punctuation)
                prev_word.misc = NO_SPACE
            prev_word = word
        return True
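A minimal sketch of the same idea outside the class: a pretokenized Sentence is built with addWord(), and punctuation is glued to the preceding token via the MISC field. NO_SPACE is assumed here to be the CoNLL-U value "SpaceAfter=No":

import re
from ufal.udpipe import Sentence

NO_SPACE = "SpaceAfter=No"  # assumed value of the constant used above

line = "Hello\t,\tworld\t!"
sentence = Sentence()
prev_word = None
for token in line.split("\t"):
    word = sentence.addWord(token)
    if prev_word is not None and re.match(r"\W", token):
        prev_word.misc = NO_SPACE  # no space between the previous token and this punctuation
    prev_word = word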
Example #10
        def preproc_item(text):
            if pd.isna(text):
                text = ''
            tokenizer.resetDocument()
            try:
                tokenizer.setText(text)
            except TypeError:
                print(row, text)
                raise
            
            sentence = Sentence()
            error = ProcessingError()
            
            text = ''
            while (tokenizer.nextSentence(sentence, error)):
    
                udpipe_model.tag(sentence, Pipeline.DEFAULT, error)
                #udpipe_model.parse(sentence, Pipeline.DEFAULT, error)

                text += OutputFormat.newConlluOutputFormat().writeSentence(sentence)
                
            return text
Example #11
    def tokenize_tag_parse_tree(self,
                                root,
                                resegment=False,
                                tag=True,
                                parse=True):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

        If resegment=True, the returned list of Udapi trees may contain multiple trees.
        """
        if root.children:
            raise ValueError(
                'Tree already contained nodes before tokenization')

        # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
        self.tokenizer.setText(root.text)
        is_another = True
        u_sentences = []
        while is_another:
            u_sentence = Sentence()
            is_another = self.tokenizer.nextSentence(u_sentence)
            if is_another:
                u_sentences.append(u_sentence)

        # If resegmentation was not required, we need to join the segments.
        if not resegment and len(u_sentences) > 1:
            first_sent = u_sentences[0]
            n_words = first_sent.words.size() - 1
            for other_sent in u_sentences[1:]:
                other_words = other_sent.words.size() - 1
                for i in range(1, other_words + 1):
                    u_w = other_sent.words[i]
                    n_words += 1
                    u_w.id = n_words
                    first_sent.words.append(u_w)
            u_sentences = [first_sent]

        # tagging and parsing
        if tag:
            for u_sentence in u_sentences:
                self.tool.tag(u_sentence, Model.DEFAULT)
                if parse:
                    self.tool.parse(u_sentence, Model.DEFAULT)
        elif parse:
            raise ValueError(
                'Combination parse=True tag=False is not allowed.')

        # converting UDPipe nodes to Udapi nodes
        new_root = root
        trees = []
        for u_sentence in u_sentences:
            if not new_root:
                new_root = Root()
            heads, nodes = [], [new_root]
            u_words = u_sentence.words
            for i in range(1, u_words.size()):
                u_w = u_words[i]
                node = new_root.create_child(
                    form=u_w.form,
                    lemma=u_w.lemma,
                    upos=u_w.upostag,
                    xpos=u_w.xpostag,
                    feats=u_w.feats,
                    deprel=u_w.deprel,
                    misc=u_w.misc,
                )
                if parse:
                    heads.append(u_w.head)
                    nodes.append(node)
            if parse:
                for node in nodes[1:]:
                    head = heads.pop(0)
                    node.parent = nodes[head]
            trees.append(new_root)
            new_root = None
        return trees
Example #12
File: jobs.py  Project: lang-uk/lang.org.ua
    def execute(job, task):
        assert settings.UDPIPE_MODEL_FILE, "You must set UDPIPE_MODEL_FILE setting to begin"

        from .mongodb import get_db
        from .udpipe_model import Model as UDPipeModel
        from ufal.udpipe import Sentence  # type: ignore

        db = get_db()

        model = UDPipeModel(settings.UDPIPE_MODEL_FILE)
        total_docs = TagWithUDPipeJob.get_total_count(db, job, task)

        feat_categories = Counter()
        poses = Counter()
        feat_values = defaultdict(Counter)

        for i, (corpus,
                article) in enumerate(TagWithUDPipeJob.get_iter(db, job,
                                                                task)):
            update_clause = defaultdict(list)
            article_lemmas = []
            article_postags = []
            article_features = []
            if "nlp" in article:
                for f in ["title", "text"]:
                    if f not in article["nlp"]:
                        task.log(
                            logging.WARNING,
                            f"Cannot find field {f} in the document {article['_id']}"
                        )
                        continue

                    if "tokens" not in article["nlp"][f]:
                        task.log(
                            logging.WARNING,
                            f"Cannot find tokenized version of field {f} in the document {article['_id']}",
                        )
                        continue

                    for s in article["nlp"][f]["tokens"].split("\n"):
                        # ignoring the default model tokenizer in order to use the whitespace tokenizer

                        tok_sent = Sentence()
                        for w in s.split(" "):
                            tok_sent.addWord(w)

                        sent_lemmas = []
                        sent_postags = []
                        sent_features = []

                        model.tag(tok_sent)

                        for w in tok_sent.words[1:]:
                            poses.update([w.upostag])
                            sent_lemmas.append(w.lemma)
                            # Again, not moving this to a separate function to
                            # reduce the number of unnecessary calls
                            try:
                                sent_postags.append(
                                    COMPRESS_UPOS_MAPPING[w.upostag])
                            except KeyError:
                                task.log(
                                    logging.WARNING,
                                    f"Cannot find {w.upostag} in the COMPRESS_UPOS_MAPPING, skipping for now",
                                )
                                sent_postags.append("Z")

                            sent_features.append(compress_features(w.feats))

                            for pair in w.feats.split("|"):
                                if not pair:
                                    continue
                                cat, val = pair.split("=")
                                feat_categories.update([cat])
                                feat_values[cat].update([val])

                        update_clause[f"nlp.{f}.ud_lemmas"].append(
                            " ".join(sent_lemmas))

                        # We don't need a separator for the postags, as there is always exactly one
                        # POS tag (a single character) per word
                        update_clause[f"nlp.{f}.ud_postags"].append(
                            "".join(sent_postags))
                        update_clause[f"nlp.{f}.ud_features"].append(
                            " ".join(sent_features))

                for k, v in update_clause.items():
                    update_clause[k] = "\n".join(v)

                if update_clause:
                    try:
                        db[corpus].update_one(
                            {"_id": article["_id"]},
                            {
                                "$set": update_clause,
                                "$addToSet": {
                                    "processing_status": "udpipe_tagged"
                                },
                            },
                        )
                    except pymongo.errors.WriteError:
                        task.log(
                            logging.WARNING,
                            f"Cannot store results back to the document {article['_id']}"
                        )
                        continue
                else:
                    task.log(
                        logging.WARNING,
                        f"Cannot find any text in the document {article['_id']}"
                    )

            task.set_progress((i + 1) * 100 // total_docs, step=1)
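The UDPipeModel wrapper used above exposes a one-argument tag(); with the raw ufal.udpipe bindings, the whitespace-tokenized tagging step would look roughly like this (the model path is hypothetical):

from ufal.udpipe import Model, Sentence

model = Model.load("path/to/model.udpipe")  # hypothetical path
tok_sent = Sentence()
for w in "A whitespace tokenized sentence .".split(" "):
    tok_sent.addWord(w)

model.tag(tok_sent, Model.DEFAULT)

for w in tok_sent.words[1:]:  # words[0] is the technical root token
    print(w.form, w.lemma, w.upostag, w.feats)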
Example #13
    def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
        """Convert input text to a spaCy Doc.

        text: The text to process. It can be presegmented or pretokenized:
            str             : raw text,
            List[str]       : presegmented text,
            List[List[str]] : pretokenized text.
        RETURNS: The spaCy Doc object.
        """
        if not text:
            return Doc(vocab=self.vocab)

        udpipe_sents = self.model(text=text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)

        words = []
        spaces = []
        pos = []
        tags = []
        morphs = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self._check_aligned(text=text, tokens=tokens)
        if not is_aligned:
            text = ""
            for token in tokens:
                text += token.form
                if NO_SPACE not in token.misc:
                    text += " "
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Collect the UD annotations for this token
            pos.append(token.upostag or "")
            morphs.append(token.feats or "")
            # CoNLL xpostags, custom for each UD treebank
            tags.append(token.xpostag or "")
            deps.append(_spacy_dep(token.deprel) or "")
            lemmas.append(token.lemma or "")
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or NO_SPACE in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
        doc = Doc(
            vocab=self.vocab,
            words=words,
            spaces=spaces,
            pos=pos,
            tags=tags,
            morphs=morphs,
            lemmas=lemmas,
            deps=deps,
            heads=[head + i for i, head in enumerate(heads)],
        )
        return doc
Example #14
    def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
        """Convert input text to a spaCy Doc.

        text: The text to process. It can be presegmented or pretokenized:
            str             : raw text,
            List[str]       : presegmented text,
            List[List[str]] : pretokenized text.
        RETURNS: The spaCy Doc object.
        """
        udpipe_sents = self.model(text=text) if text else [Sentence()]
        text = " ".join(s.getText() for s in udpipe_sents)
        tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)
        if not tokens:
            return Doc(vocab=self.vocab)

        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self._check_aligned(text=text, tokens=tokens)
        if not is_aligned:
            text = ""
            for token in tokens:
                text += token.form
                if NO_SPACE not in token.misc:
                    text += " "
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not span:
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.form)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upostag or ""))
            # CoNLL xpostags, custom for each UD treebank
            tags.append(self.vocab.strings.add(token.xpostag or ""))
            deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.form)
            span = text[offset:]
            if i == len(tokens) - 1 or NO_SPACE in token.misc:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.form))
        try:
            attrs = [POS, TAG, DEP, HEAD]
            array = numpy.array(list(zip(pos, tags, deps, heads)),
                                dtype="uint64")
            doc = Doc(self.vocab, words=words,
                      spaces=spaces).from_array(attrs, array)
        except ValueError as e:
            if '[E167]' in str(e):
                raise ValueError(
                    "Could not properly assign morphology features. "
                    f"Please update the tag map for '{self.model._lang}'"
                    " language. See "
                    "https://spacy.io/usage/adding-languages#tag-map "
                    "for details. A quick workaround is to use the keyword "
                    "argument ignore_tag_map=True when loading UDPipeLanguage."
                )
            else:
                raise e
        # Overwrite lemmas separately to prevent overwriting by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array(attrs=[LEMMA], array=lemma_array)
        doc.is_tagged = bool(any(pos) and any(tags))
        doc.is_parsed = bool(any(deps))
        return doc