Example #1
File: __init__.py Project: silkecreten/pie
        def lemmatization_stream() -> Iterator[str]:
            lower = request.args.get("lower", False)
            if lower:
                lower = True

            if request.method == "GET":
                data = request.args.get("data")
            else:
                data = request.form.get("data")

            if not data:
                yield ""
            else:
                header = False
                for chunk in chunks(data_iterator(data, lower=lower), size=BATCH):
                    sents, lengths = zip(*chunk)

                    tagged, tasks = tagger.tag(sents=sents, lengths=lengths)
                    formatter = formatter_class(tasks)
                    sep = "\t"
                    for sent in tagged:
                        if not header:
                            yield sep.join(formatter.format_headers()) + '\r\n'
                            header = True
                        for token, tags in sent:
                            yield sep.join(formatter.format_line(token, tags)) + '\r\n'
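The generator above only produces the tab-separated lines; it still has to be wrapped in a streaming HTTP response. A minimal sketch of that wiring with standard Flask (Response and stream_with_context are regular Flask APIs; the route name and the stand-in generator are assumptions, not part of the project above):

from flask import Flask, Response, stream_with_context

app = Flask(__name__)

@app.route("/lemmatize", methods=["GET", "POST"])  # hypothetical route name
def lemmatize():
    def stream():
        # In the real project this would be lemmatization_stream() from
        # Example #1; a stand-in generator is used so the sketch runs alone.
        yield "token\tlemma\r\n"
        yield "exemplum\texemplum\r\n"

    return Response(stream_with_context(stream()), mimetype="text/plain")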
Example #2
    def iter_tag_token(self, data: str, iterator: DataIterator, processor: ProcessorPrototype,
                       no_tokenizer: bool = False) -> Generator[Dict[str, str], None, None]:
        # Reset at each document
        processor.reset()
        iterator.tokenizer.reset()
        # Iterate!
        for chunk in utils.chunks(
                iterator(data, lower=self.lower, no_tokenizer=no_tokenizer),
                size=self.batch_size):

            # Unzip the batch into the sentences, their sizes and the dictionaries
            # of things that need to be reinserted
            sents, lengths, needs_reinsertion = zip(*chunk)
            is_empty = [not bool(sent) for sent in sents]

            tagged, tasks = self.tag(
                sents=[sent for sent in sents if sent],
                lengths=[l for l in lengths if l != 0]
            )

            if not processor.task_init:
                processor.set_tasks(tasks)

            # We keep a real sentence index
            for sents_index, sent_is_empty in enumerate(is_empty):
                if sent_is_empty:
                    sent = []
                else:
                    sent = tagged.pop(0)

                # Get the things that need to be reinserted
                sent_reinsertion = needs_reinsertion[sents_index]

                # If we have a disambiguator, run the results through it
                if self.disambiguation and sent:
                    sent = self.disambiguation(sent, tasks)

                reinsertion_index = 0

                for index, (token, tags) in enumerate(sent):
                    # Before current index
                    while reinsertion_index + index in sent_reinsertion:
                        yield processor.reinsert(sent_reinsertion[reinsertion_index+index])
                        del sent_reinsertion[reinsertion_index + index]
                        reinsertion_index += 1

                    yield from processor.get_dict(token, tags)

                for reinsertion in sorted(list(sent_reinsertion.keys())):
                    yield processor.reinsert(sent_reinsertion[reinsertion])
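A hedged sketch of consuming iter_tag_token from the outside: each yielded item is a dict, so writing them as TSV only needs the keys of the first token. The commented call at the bottom is hypothetical wiring; the tagger, iterator and processor objects come from the surrounding project.

import csv
import sys

def write_tsv(token_dicts, out=sys.stdout):
    # Write the dicts yielded by iter_tag_token as tab-separated rows.
    writer = None
    for token in token_dicts:
        if writer is None:
            # Use the keys of the first token as the header row.
            writer = csv.DictWriter(out, fieldnames=list(token.keys()),
                                    delimiter="\t")
            writer.writeheader()
        writer.writerow(token)

# Hypothetical wiring, assuming configured objects as in the examples:
# write_tsv(tagger.iter_tag_token(text, iterator, processor))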
Example #3
    def prepare_buffer(self, buf, **kwargs):
        "Transform buffer into batch generator"

        def key(data):
            inp, tasks = data
            return len(inp)

        if self.minimize_pad:
            buf = sorted(buf, key=key, reverse=True)
        batches = list(utils.chunks(buf, self.batch_size))

        if self.shuffle:
            random.shuffle(batches)

        for batch in batches:
            yield self.pack_batch(batch, **kwargs)
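prepare_buffer first sorts the buffer by input length (when minimize_pad is set) so that each batch groups similarly sized sentences and needs little padding, then shuffles whole batches rather than individual items. A self-contained toy illustration of that ordering, independent of the class above:

import random

def toy_batches(buf, batch_size, minimize_pad=True, shuffle=True, seed=0):
    # Mirror the ordering logic of prepare_buffer on plain (tokens, tasks) pairs.
    if minimize_pad:
        # Longest inputs first: each batch then holds similarly sized items.
        buf = sorted(buf, key=lambda item: len(item[0]), reverse=True)
    batches = [buf[i:i + batch_size] for i in range(0, len(buf), batch_size)]
    if shuffle:
        # Randomize the order of the batches, not their contents.
        random.Random(seed).shuffle(batches)
    return batches

buf = [(["tok"] * n, {}) for n in (3, 9, 1, 7, 2, 8)]
for batch in toy_batches(buf, batch_size=2):
    print([len(inp) for inp, _ in batch])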
Example #4
def lemmatise(path, model_spec):
    """
    lemmatises raw text input, with the given model(s), using Pie.
    :param path: path to folder containing the texts
    :param model_spec: specification of the model(s), in Pie syntax
    :return: a dictionary mapping each witness to a list of sentences, each sentence being a list of token dicts.
    """
    tagger = Tagger()

    for model, tasks in utils.model_spec(model_spec):
        tagger.add_model(model, *tasks)
        print(" - model: {}".format(model))
        tasks = tasks or tagger.models[-1][0].label_encoder.tasks
        print(" - tasks: {}".format(", ".join(tasks)))

    # Get files content
    files = glob.glob(path + '/*.txt')
    content = {}
    for f in files:
        wit = os.path.splitext(os.path.split(f)[-1])[0]
        content[wit] = []
        tokenId = 1

        for chunk in utils.chunks(lines_from_file(f), 200):
            sents, lengths = zip(*chunk)
            tagged, tasks = tagger.tag(sents, lengths)
            for sent in tagged:
                new_sent = []
                for t in sent:
                    token_dict = {
                        "form": t[0],
                        "id": "w_" + str(tokenId),
                        "order_id": str(tokenId)
                    }
                    # and now add the different annotations from lemmatiser
                    for i, task in enumerate(tasks):
                        token_dict[task] = t[1][i]

                    new_sent.append(token_dict)
                    tokenId += 1

                content[wit].append(new_sent)

    return content
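A hedged usage sketch for lemmatise: the folder name and the model specification below are placeholders (the spec string is a dummy in the format expected by utils.model_spec), and the returned dictionary is simply dumped to JSON.

import json

# Placeholder folder and model specification; adapt to your own setup.
witnesses = lemmatise(path="texts", model_spec="<your-model.tar,lemma,pos>")

with open("lemmatized.json", "w", encoding="utf-8") as out:
    json.dump(witnesses, out, ensure_ascii=False, indent=2)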
Example #5
    def tag_file(self, fpath, sep='\t'):
        _, ext = os.path.splitext(fpath)
        header = False

        with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f:

            for chunk in utils.chunks(lines_from_file(fpath, self.lower),
                                      self.batch_size):
                sents, lengths = zip(*chunk)

                tagged, tasks = self.tag(sents, lengths)

                for sent in tagged:
                    if not header:
                        f.write(sep.join(['token'] + tasks) + '\n')
                        header = True
                    for token, tags in sent:
                        f.write(sep.join([token] + list(tags)) + '\n')

                    f.write('\n')
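The file written by tag_file starts with a header row ('token' plus the task names), has one tab-separated row per token, and a blank line after each sentence. A small sketch of reading such a file back into sentences; the path passed in is a placeholder for whatever name utils.ensure_ext produced.

import csv

def read_tagged(path):
    # Rebuild sentences from a file produced by tag_file.
    sentences, current = [], []
    with open(path, newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        header = next(reader)          # ['token', <task names>...]
        for row in reader:
            if not row:                # blank line marks a sentence boundary
                if current:
                    sentences.append(current)
                current = []
            else:
                current.append(dict(zip(header, row)))
    if current:
        sentences.append(current)
    return sentences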
Example #6
    def iter_tag_token(
        self,
        data: str,
        iterator: DataIterator,
        processor: ProcessorPrototype,
        no_tokenizer: bool = False,
        empty_token_on_sent_break: bool = False
    ) -> Generator[Optional[Dict[str, str]], None, None]:
        """ Reads the string in [DATA] with [ITERATOR] and [PROCESSOR], then returns each token as a dict

        :param data: Textual data to process
        :param iterator: Iterator used to read data
        :param processor: Processor used to post-process data
        :param no_tokenizer: Disable the tokenizer inside the iterator
        :param empty_token_on_sent_break: Returns a None token when going into a new sequence.
        :yield: Token in the form of a dict or, if [empty_token...] is True, a None value when changing "sentences"
        """
        # Reset at each document
        processor.reset()
        iterator.tokenizer.reset()
        # Iterate!
        for chunk in utils.chunks(iterator(data,
                                           lower=self.lower,
                                           no_tokenizer=no_tokenizer),
                                  size=self.batch_size):

            # Unzip the batch into the sentences, their sizes and the dictionaries
            # of things that need to be reinserted
            sents, lengths, needs_reinsertion = zip(*chunk)
            is_empty = [not bool(sent) for sent in sents]

            tagged, tasks = self.tag(sents=[sent for sent in sents if sent],
                                     lengths=[l for l in lengths if l != 0])

            if not processor.task_init:
                processor.set_tasks(tasks)

            # We keep a real sentence index
            for sents_index, sent_is_empty in enumerate(is_empty):
                if sent_is_empty:
                    sent = []
                else:
                    sent = tagged.pop(0)

                # Get the things that need to be reinserted
                sent_reinsertion = needs_reinsertion[sents_index]

                # If we have a disambiguator, run the results through it
                if self.disambiguation and sent:
                    sent = self.disambiguation(sent, tasks)

                reinsertion_index = 0

                for index, (token, tags) in enumerate(sent):
                    # Before current index
                    while reinsertion_index + index in sent_reinsertion:
                        yield processor.reinsert(
                            sent_reinsertion[reinsertion_index + index])
                        del sent_reinsertion[reinsertion_index + index]
                        reinsertion_index += 1

                    yield from processor.get_dict(token, tags)

                for reinsertion in sorted(list(sent_reinsertion.keys())):
                    yield processor.reinsert(sent_reinsertion[reinsertion])
                if empty_token_on_sent_break:
                    yield None
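This variant differs from Example #2 only in empty_token_on_sent_break, which makes the generator yield None between sentences. A hedged sketch of a consumer that uses those markers to regroup the flat token stream into sentences; the commented call at the bottom assumes configured tagger, iterator and processor objects.

from typing import Dict, Iterable, List, Optional

def group_sentences(
    stream: Iterable[Optional[Dict[str, str]]]
) -> List[List[Dict[str, str]]]:
    # Regroup tokens into sentences, using the None markers as boundaries.
    sentences, current = [], []
    for token in stream:
        if token is None:              # sentence break from iter_tag_token
            if current:
                sentences.append(current)
            current = []
        else:
            current.append(token)
    if current:
        sentences.append(current)
    return sentences

# Hypothetical wiring:
# sentences = group_sentences(
#     tagger.iter_tag_token(text, iterator, processor,
#                           empty_token_on_sent_break=True))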