Exemplo n.º 1
0
    def predict(self, doc: Document, **kwargs) -> Document:
        """Run the wrapped component on ``doc`` and optionally store its output.

        The component input is selected by ``self.input_key``:

        - falsy: the whole ``doc`` is passed;
        - a single key: ``doc[key]`` is passed;
        - a tuple/list of keys: for a ``LambdaComponent`` the looked-up fields
          are passed as one list and ``_elit_unpack=True`` is added to the
          keyword arguments; for any other component the fields are zipped
          item by item, and each item is turned into a list of per-position
          tuples.

        Args:
            doc: The document (or any object indexable by ``self.input_key``).
            **kwargs: Extra keyword arguments forwarded to the component.
                Entries of ``self.kwargs`` override entries given here.

        Returns:
            The raw component output when ``self.output_key`` is falsy
            (generators are materialised into a list). Otherwise ``doc`` with
            the output stored under ``self.output_key``; a new ``Document`` is
            created when ``doc`` is not already one.
        """
        unpack = False
        if not self.input_key:
            component_input = doc
        elif isinstance(self.input_key, (tuple, list)):
            fields = [doc[key] for key in self.input_key]
            if isinstance(self.component, LambdaComponent):  # assume functions take multiple arguments
                component_input = fields
                unpack = True
            else:
                component_input = [list(zip(*item)) for item in zip(*fields)]
        else:
            component_input = doc[self.input_key]

        if self.kwargs:
            kwargs.update(self.kwargs)
        if unpack:
            kwargs['_elit_unpack'] = True
        output = self.component(component_input, **kwargs)
        if isinstance(output, types.GeneratorType):
            output = list(output)
        if not self.output_key:
            return output

        if not isinstance(doc, Document):
            doc = Document()
        # Accept lists as well as tuples, mirroring how input_key is handled:
        # a list cannot be used as a single key, so it must mean "one key per output".
        if isinstance(self.output_key, (tuple, list)):
            for key, value in zip(self.output_key, output):
                doc[key] = value
        else:
            doc[self.output_key] = output
        return doc
Exemplo n.º 2
0
 def parse(
     self,
     text: Union[str, List[str]] = None,
     tokens: List[List[str]] = None,
     models=("lem", "pos", "ner", "con", "dep", "srl", "amr", "dcr", "ocr"),
     speaker_ids: Union[int, List[int]] = None,
     genre: str = None,
     coref_context: dict = None,
     return_coref_prob: bool = False,
     language='en',
     verbose=True,
 ) -> Document:
     assert text or tokens, 'At least one of text or tokens has to be specified.'
     response = self._send_post_json(
         self.url + '/parse', {
             'text': text,
             'tokens': tokens,
             'models': models,
             'speaker_ids': speaker_ids,
             'genre': genre,
             'coref_context': coref_context,
             'return_coref_prob': return_coref_prob,
             'language': language,
             'verbose': verbose
         })
     return Document(response)
Exemplo n.º 3
0
 def finalize_document(self, doc: Document, task_name: str):
     """Replace placeholder ``'_'`` labels in this task's trees with POS tags.

     The POS field is looked up in ``doc`` under the key returned by
     ``prefix_match('pos', doc)``. If that field is missing or empty, nothing
     is changed. Otherwise each tree in ``doc[task_name]`` is paired with one
     list of tags; the subtrees of height 2 are visited in order, and the
     n-th one labelled ``'_'`` receives the n-th tag.

     Args:
         doc: Document holding the trees and, possibly, the POS tags.
         task_name: Key in ``doc`` under which the trees are stored.
     """
     pos_key = prefix_match('pos', doc)
     tags_per_tree: List[List[str]] = doc.get(pos_key, None)
     if not tags_per_tree:
         return
     for tree, tags in zip(doc[task_name], tags_per_tree):
         # Height-2 subtrees are counted whether or not they are relabelled,
         # so the position stays aligned with the tag list.
         height_two = tree.subtrees(lambda t: t.height() == 2)
         for position, node in enumerate(height_two):
             if node.label() == '_':
                 node.set_label(tags[position])
Exemplo n.º 4
0
    def parse(self, inputs: List[Input]) -> List[Document]:
        """Annotate a batch of inputs, batching together those that request the same models.

        Inputs are grouped by their sorted ``models``. The sentences of each
        group are concatenated and annotated with one ``self.parse_sents``
        call, and the annotations are then handed back to each input in order,
        ``len(input.tokens)`` sentences at a time.

        Before distribution, ``ner``, ``srl`` and ``dep`` annotations are
        rewritten in place: the first element of every ``ner``/``srl`` item is
        moved to the end, and the first element of every ``dep`` pair is
        decremented by one. For inputs whose ``verbose`` is falsy, the last
        element of every ``ner``/``srl`` item is then dropped.

        Args:
            inputs: Requests exposing ``models``, ``tokens`` and ``verbose``.

        Returns:
            One ``Document`` per input, in the same order as ``inputs``.
        """
        self.service_tokenizer.tokenize_inputs(inputs)  # no effects (read-only) in server pipeline

        # We shall group by models
        inputs_by_tasks = defaultdict(list)
        for index, item in enumerate(inputs):
            inputs_by_tasks[tuple(sorted(item.models))].append(index)

        results = [Document() for _ in inputs]
        for tasks, input_ids in inputs_by_tasks.items():
            group_inputs = [inputs[i] for i in input_ids]
            # Flatten in linear time (sum(lists, []) re-copies the accumulator each step).
            group_tokens = [sent for item in group_inputs for sent in item.tokens]
            annotations = self.parse_sents(group_tokens, tasks)
            for k, v in annotations.items():
                # fit ELIT standard
                if k == 'ner':
                    for j, s in enumerate(v):
                        v[j] = [x[1:] + x[:1] for x in s]
                elif k == 'srl':
                    for _v in v:
                        for j, s in enumerate(_v):
                            _v[j] = [x[1:] + x[:1] for x in s]
                elif k == 'dep':
                    for j, s in enumerate(v):
                        v[j] = [(x[0] - 1, x[1]) for x in s]
            for i, item in zip(input_ids, group_inputs):
                num_sents = len(item.tokens)
                for k, v in annotations.items():
                    # Take this input's sentences off the front of the group annotation.
                    own = v[:num_sents]
                    del v[:num_sents]
                    if not item.verbose:
                        # Strip only this input's sentences. Touching the rest of ``v``
                        # would also strip (or double-strip) sentences that belong to
                        # later inputs of the same group.
                        if k == 'ner':
                            own = [[x[:-1] for x in s] for s in own]
                        elif k == 'srl':
                            own = [[[x[:-1] for x in s] for s in sent] for sent in own]
                    results[i][k] = own

        return results
Exemplo n.º 5
0
    def predict(self,
                data: Union[str, List[str]],
                batch_size: int = None,
                tasks: Optional[Union[str, List[str]]] = None,
                resolve_dependencies=True,
                **kwargs) -> Document:
        """Run the selected tasks on ``data`` and collect their outputs in a ``Document``.

        The first task of the first group builds the samples and the
        dataloader. For every batch it is executed first, then the remaining
        tasks run group by group, all sharing the same ``batch`` and
        ``output_dict``.

        Args:
            data: Input to annotate. The first task's ``input_is_flat`` decides
                whether it is a single item (then wrapped in a list) or a batch.
            batch_size: Not referenced in this method body.
            tasks: Task name(s) to run. With ``resolve_dependencies`` they are
                resolved through ``self._resolve_task_name``; an empty
                resolution selects all tasks in topological order.
            resolve_dependencies: When true, every requested task present in
                the computation graph is expanded with ``topological_sort``
                and the resulting tasks are grouped by their topological
                order. When false, the given tasks form one group as-is.
            **kwargs: Not referenced in this method body.

        Returns:
            A ``Document`` whose fields are filled in ``self.config.task_names``
            order. It is empty when ``data`` is falsy or no task is selected.
            A ``'tok'`` field holding ``data`` is added when no key starts with
            ``'tok'``. For flat input every field is replaced by its first
            element.
        """
        doc = Document()
        if not data:
            return doc

        if resolve_dependencies:
            # Now we decide which tasks to perform and their orders
            tasks_in_topological_order = self._tasks_in_topological_order
            task_topological_order = self._task_topological_order
            computation_graph = self._computation_graph
            target_tasks = self._resolve_task_name(tasks)
            if not target_tasks:
                target_tasks = tasks_in_topological_order
            else:
                # Map topological order -> set of tasks to run at that order
                target_topological_order = defaultdict(set)
                for task_name in target_tasks:
                    if task_name not in computation_graph:
                        continue
                    for dependency in topological_sort(computation_graph,
                                                       task_name):
                        target_topological_order[
                            task_topological_order[dependency]].add(dependency)
                target_tasks = [
                    item[1]
                    for item in sorted(target_topological_order.items())
                ]
        else:
            # No dependency resolution: run exactly the given task(s) as a single group
            target_tasks = [set(tasks)] if isinstance(tasks,
                                                      list) else [{tasks}]
        if not target_tasks:
            return Document()
        # Sort target tasks within the same group in a defined order
        target_tasks = [
            sorted(x, key=lambda _x: self.config.task_names.index(_x))
            for x in target_tasks
        ]
        flatten_target_tasks = [
            self.tasks[t] for group in target_tasks for t in group
        ]
        # Each flag is enabled as soon as any selected task sets it
        cls_is_bos = any([x.cls_is_bos for x in flatten_target_tasks])
        sep_is_eos = any([x.sep_is_eos for x in flatten_target_tasks])
        # Now build the dataloaders and execute tasks
        first_task_name: str = list(target_tasks[0])[0]
        first_task: Task = self.tasks[first_task_name]
        encoder_transform, transform = self.build_transform(first_task)
        # Override the tokenizer config of the 1st task
        encoder_transform.sep_is_eos = sep_is_eos
        encoder_transform.cls_is_bos = cls_is_bos
        average_subwords = self.model.encoder.average_subwords
        flat = first_task.input_is_flat(data)
        if flat:
            # Wrap a single item so the code below always sees a batch; unwrapped again before returning
            data = [data]
        device = self.device
        samples = first_task.build_samples(data,
                                           cls_is_bos=cls_is_bos,
                                           sep_is_eos=sep_is_eos)
        dataloader = first_task.build_dataloader(samples,
                                                 transform=transform,
                                                 device=device)
        results = defaultdict(list)
        # Sample indices in the order the dataloader yields them; passed to ``reorder`` below
        # (presumably to restore the original input order -- confirm against ``reorder``)
        order = []
        for batch in dataloader:
            order.extend(batch[IDX])
            # Run the first task, let it make the initial batch for the successors
            output_dict = self.predict_task(first_task,
                                            first_task_name,
                                            batch,
                                            results,
                                            run_transform=True,
                                            cls_is_bos=cls_is_bos,
                                            sep_is_eos=sep_is_eos)
            # Run each task group in order
            for group_id, group in enumerate(target_tasks):
                # We could parallelize this in the future
                for task_name in group:
                    if task_name == first_task_name:
                        continue
                    output_dict = self.predict_task(self.tasks[task_name],
                                                    task_name,
                                                    batch,
                                                    results,
                                                    output_dict,
                                                    run_transform=True,
                                                    cls_is_bos=cls_is_bos,
                                                    sep_is_eos=sep_is_eos)
                if group_id == 0:
                    # We are kind of hard coding here. If the first task is a tokenizer,
                    # we need to convert the hidden and mask to token level
                    if 'token_token_span' not in batch:
                        spans = []
                        tokens = []
                        for span_per_sent, token_per_sent in zip(
                                output_dict[first_task_name]['prediction'],
                                results[first_task_name]):
                            if cls_is_bos:
                                # Prepend a span that maps to position 0 after the +1 shift below
                                span_per_sent = [(-1, 0)] + span_per_sent
                                token_per_sent = [BOS] + token_per_sent
                            if sep_is_eos:
                                # Append a span derived from the last one, shifted by one on both ends
                                span_per_sent = span_per_sent + [
                                    (span_per_sent[-1][0] + 1,
                                     span_per_sent[-1][1] + 1)
                                ]
                                token_per_sent = token_per_sent + [EOS]
                            # The offsets start with 0 while [CLS] is zero
                            if average_subwords:
                                # Keep every shifted position of the span
                                span_per_sent = [
                                    list(range(x[0] + 1, x[1] + 1))
                                    for x in span_per_sent
                                ]
                            else:
                                # Keep only the shifted start position of the span
                                span_per_sent = [
                                    x[0] + 1 for x in span_per_sent
                                ]
                            spans.append(span_per_sent)
                            tokens.append(token_per_sent)
                        spans = PadSequenceDataLoader.pad_data(spans,
                                                               0,
                                                               torch.long,
                                                               device=device)
                        output_dict['hidden'] = pick_tensor_for_each_token(
                            output_dict['hidden'], spans, average_subwords)
                        # Publish the token-level fields on the batch for the tasks in later groups
                        batch['token_token_span'] = spans
                        batch['token'] = tokens
                        # noinspection PyTypeChecker
                        batch['token_length'] = torch.tensor(
                            [len(x) for x in tokens],
                            dtype=torch.long,
                            device=device)
                        # Drop 'mask' if present (presumably it no longer matches the
                        # token-level hidden states -- confirm)
                        batch.pop('mask', None)
        # Put results into doc in the order of tasks
        for k in self.config.task_names:
            v = results.get(k, None)
            if v is None:
                continue
            doc[k] = reorder(v, order)
        # Allow task to perform finalization on document
        for group in target_tasks:
            for task_name in group:
                task = self.tasks[task_name]
                task.finalize_document(doc, task_name)
        # If no tok in doc, use raw input as tok
        if not any(k.startswith('tok') for k in doc):
            doc['tok'] = data
        if flat:
            # Undo the wrapping done for flat input: keep only the single item's result
            for k, v in list(doc.items()):
                doc[k] = v[0]
        # If there is only one field, don't bother to wrap it
        # if len(doc) == 1:
        #     return list(doc.values())[0]
        return doc