Example #1
 def write_spans(self, label_type: str, tag_type: str, instance: NLPInstance):
     if self.is_tag():
         span_type = tag_type
     else:
         span_type = label_type
     instance.add_span(self.get_from(), self.get_to(), self.label, span_type)
     for tree in self.children:
         tree.write_spans(label_type, tag_type, instance)
Example #2
    def load(self, file_name: str, from_sent_nr: int, to_sent_nr: int):
        with open(file_name, encoding='UTF-8') as reader:
            token_preds = self._extract_predicates_from_string(self.tokens)
            dep_preds = self._extract_predicates_from_string(self.deps)
            span_preds = self._extract_predicates_from_string(self.spans)

            instance_nr = 0
            instance = NLPInstance()
            as_token = None
            as_dep = None
            as_span = None
            result = []  # [NLPInstance]
            rows = {}  # {str: [[str]]}

            self._init_rows(rows, token_preds, span_preds, dep_preds)

            while instance_nr < to_sent_nr:
                try:
                    line = check_eof(reader.readline()).strip()
                    if line.startswith('>>'):
                        # monitor.progressed(instanceNr)
                        instance_nr += 1
                        if instance_nr > from_sent_nr and instance_nr > 1:
                            self._add_edges(instance, rows, token_preds, dep_preds, span_preds)

                            result.append(instance)
                            instance = NLPInstance()
                            rows.clear()
                            self._init_rows(rows, token_preds, span_preds, dep_preds)

                    elif line.startswith('>') and instance_nr > from_sent_nr:
                        pred = line[1:]
                        as_token = token_preds.get(pred)
                        as_dep = dep_preds.get(pred)
                        as_span = span_preds.get(pred)
                    else:
                        if line != '' and instance_nr > from_sent_nr:
                            row = line.split('\t')
                            if as_token is not None:
                                rows[as_token].append(row)
                            if as_dep is not None:
                                rows[as_dep].append(row)
                            if as_span is not None:
                                rows[as_span].append(row)

                except EOFError:
                    break

            self._add_edges(instance, rows, token_preds, dep_preds, span_preds)

            result.append(instance)
            return result
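
A hypothetical input fragment in the format this loader expects, reconstructed from the branches above ('>>' starts a new instance, '>name' selects a predicate table, rows are tab-separated; the predicate names here are assumptions):

    >>
    >word
    0	John
    1	runs
    >dep
    1	0	SUB
    >>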
Example #3
    def create(self, rows):
        instance = NLPInstance()
        instance.add_token().add_property('Word', '-Root-')
        for row in rows:
            instance.add_token().\
                add_property(name='Word', value=row[1]).\
                add_property(name='Index', value=row[0]).\
                add_property(name='Lemma', value=row[2]).\
                add_property(name='CPos', value=row[3]).\
                add_property(name='Pos', value=row[4]).\
                add_property(name='Feats', value=row[5])

        for row in rows:
            # dependency
            mod = int(row[0])
            try:
                instance.add_dependency(start=int(row[6]),
                                        end=mod,
                                        label=row[7],
                                        edge_type='dep')
            except (ValueError, IndexError, KeyError):
                print('Can\'t parse dependency', file=sys.stderr)
                instance.tokens[mod].add_property('DepMissing', 'missing')
            # role
        return instance
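
A hypothetical rows value for the column layout assumed above (CoNLL-X-style: index, word, lemma, coarse POS, POS, feats, head, deprel):

    rows = [['1', 'John', 'john', 'N', 'NNP', '_', '2', 'SUB'],
            ['2', 'runs', 'run', 'V', 'VBZ', '_', '0', 'ROOT']]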
Example #4
 def _extract_span04_05(rows: list, column: int, field_type: str,
                        prefix: str, instance: NLPInstance):
     begin = 0
     current_chunk = ''
     for index, row in enumerate(rows):
         chunk = row[column]
         if chunk.startswith('('):
              end = chunk.index('*')  # ValueError when '*' is missing, unlike find()'s -1
             current_chunk = chunk[1:end]
             begin = index
         if chunk.endswith(')'):
             instance.add_span(begin, index, prefix + current_chunk,
                               field_type)
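
A minimal, self-contained sketch of the extractor above, with a hypothetical stub standing in for NLPInstance and made-up rows:

    class _StubInstance:
        def __init__(self):
            self.spans = []

        def add_span(self, start, end, label, field_type):
            self.spans.append((start, end, label, field_type))

    rows = [['(A0*'], ['*'], ['*)'], ['(V*)']]
    inst = _StubInstance()
    _extract_span04_05(rows, column=0, field_type='role', prefix='eat.01:', instance=inst)
    # inst.spans == [(0, 2, 'eat.01:A0', 'role'), (3, 3, 'eat.01:V', 'role')]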
Example #5
 def _extract_span00_02(rows: list, column: int, field_type: str,
                        instance: NLPInstance):
     in_chunk = False
     begin = 0
     current_chunk = ''
     for index, row in enumerate(rows):
         chunk = row[column]
         minus = chunk.find('-')
         if minus != -1:
             bio = chunk[0:minus]
             label = chunk[minus + 1:]
             if 'B' == bio:
                 if in_chunk:
                     instance.add_span(begin, index - 1, current_chunk,
                                       field_type)
                 begin = index
                 current_chunk = label
                 in_chunk = True
         elif in_chunk:
             instance.add_span(begin, index - 1, current_chunk, field_type)
             in_chunk = False
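
The same stub-based sketch for this variant (hypothetical rows, reusing _StubInstance from the sketch under Example #4). Note that, unlike _extract_span03 in Example #9, this extractor never flushes a chunk that is still open when the rows run out:

    rows = [['He', 'B-NP'], ['runs', 'O'], ['fast', 'B-ADVP']]
    inst = _StubInstance()
    _extract_span00_02(rows, column=1, field_type='chunk', instance=inst)
    # inst.spans == [(0, 0, 'NP', 'chunk')]  # the trailing ADVP chunk is dropped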
Example #6
    def create(self, rows):
        instance = NLPInstance()
        for index, row in enumerate(rows):
            instance.add_token().\
                add_property(name='Word', value=row[0]).\
                add_property(name='Index', value=str(index))

            instance.add_span(index, index, row[1], 'pos')
            instance.add_span(index, index, row[2], 'chunk (BIO)')
            instance.add_span(index, index, row[3], 'ner (BIO)')

        self._extract_span03(rows=rows,
                             column=2,
                             field_type='chunk',
                             instance=instance)
        self._extract_span03(rows=rows,
                             column=3,
                             field_type='ner',
                             instance=instance)

        return instance
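
A hypothetical rows value for the four-column layout assumed above (word, POS, BIO chunk tag, BIO NER tag):

    rows = [['U.N.', 'NNP', 'I-NP', 'I-ORG'],
            ['official', 'NN', 'I-NP', 'O'],
            ['Ekeus', 'NNP', 'I-NP', 'I-PER']]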
Example #7
    def create(self, rows):
        instance = NLPInstance()
        sentence = rows[0]
        # Skip <s> and dep count
        for i in range(2, len(sentence)):
            w_t_c = sentence[i].split('|')
            instance.add_token().\
                add_property(name='Word', value=w_t_c[0]).\
                add_property(name='Tag', value=w_t_c[1]).\
                add_property(name='Category', value=w_t_c[2]).\
                add_property(name='Index', value=str(i - 1))
        # instance.add_token().add_property('Word', '-Root-')

        mod = 1
        for row in rows:
            if row[0] != '<s>' and row[0] != '<\\s>':
                # dependency
                try:
                    instance.add_dependency(start=int(row[1]),
                                            end=int(row[0]),
                                            label=row[2] + '_' + row[3],
                                            edge_type='dep')
                except (ValueError, IndexError, KeyError):
                    print('Can\'t parse dependency', file=sys.stderr)
                    instance.tokens[mod].add_property('DepMissing', 'missing')
                mod += 1
        return instance
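
A hypothetical rows value for the CCG-style layout assumed above: the first row is the sentence line ('<s>', a count, then word|tag|category triples), each following row is a dependent-index/head-index/label row, and '<\s>' closes the sentence:

    rows = [['<s>', '1', 'the|DT|NP[nb]/N', 'dog|NN|N'],
            ['1', '2', 'NP[nb]/N', '1'],
            ['<\\s>']]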
Example #8
    def create(self, rows):
        instance = NLPInstance()
        for index, row in enumerate(rows):
            instance.add_token().\
                add_property(name='Word', value=row[0]).\
                add_property(name='Index', value=str(index))

        predicate_count = 0
        for index, row in enumerate(rows):
            try:
                if row[9] != '-':
                    sense = row[10] + '.' + row[9]
                    instance.add_span(index, index, sense, 'sense')

                    self._extract_span04_05(rows, 11 + predicate_count, 'role',
                                            sense + ':', instance)

                    predicate_count += 1
            except IndexError:
                print('Can\'t parse file: not enough columns (at least 11) in row {0}'.
                      format(row),
                      file=sys.stderr)
                sys.exit(1)

        return instance
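
A hypothetical rows value for the layout assumed above: the word in column 0, the sense number and predicate lemma in columns 9 and 10, and one role column per predicate from column 11 on (the filler columns are marked '_'):

    rows = [['He',   '_', '_', '_', '_', '_', '_', '_', '_', '-',  '-',   '(A0*)'],
            ['eats', '_', '_', '_', '_', '_', '_', '_', '_', '01', 'eat', '(V*)'],
            ['pie',  '_', '_', '_', '_', '_', '_', '_', '_', '-',  '-',   '(A1*)']]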
Example #9
 def _extract_span03(rows: list, column: int, field_type: str,
                     instance: NLPInstance):
     in_chunk = False
     begin = 0
     current_chunk = ''
     index = 0
     for index, row in enumerate(rows):
         chunk = row[column]
         minus = chunk.find('-')
         if minus != -1:
             bio = chunk[0:minus]
             label = chunk[minus + 1:]
             if in_chunk:
                 # start a new chunk and finish old one
                  if 'B' == bio or ('I' == bio and label != current_chunk):
                     instance.add_span(begin, index - 1, current_chunk,
                                       field_type)
                     begin = index
                     current_chunk = label
             else:
                 in_chunk = True
                 begin = index
                 current_chunk = label
         elif in_chunk:
             instance.add_span(begin, index - 1, current_chunk, field_type)
             in_chunk = False
     if in_chunk:
         instance.add_span(begin, index - 1, current_chunk, field_type)
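
Reusing the _StubInstance sketch from Example #4, a quick check of the flushing behavior on hypothetical rows:

    rows = [['He', 'B-NP'], ['ate', 'B-VP'], ['an', 'B-NP'], ['apple', 'I-NP']]
    inst = _StubInstance()
    _extract_span03(rows, column=1, field_type='chunk', instance=inst)
    # inst.spans == [(0, 0, 'NP', 'chunk'), (1, 1, 'VP', 'chunk'), (2, 3, 'NP', 'chunk')]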
Example #10
    def create_open(rows):
        instance = NLPInstance()
        instance.add_token()
        for row in rows:
            instance.add_token(). \
                add_property('Named Entity', row[0], 10). \
                add_property('NamedEntity BBN', row[1], 11). \
                add_property('WordNet', row[2], 12)

        for index, row in enumerate(rows, start=1):
            # dependency
            instance.add_edge(start=int(row[3]),
                              end=index,
                              label=row[4],
                              edge_type='malt')
        return instance
Example #11
    def create(self, rows):
        instance = NLPInstance()
        instance.add_token().add_property(name='Word', value='-Root-')
        for index, row in enumerate(rows, start=1):
            instance.add_token().\
                add_property(name='Word', value=row[0]).\
                add_property(name='Index', value=str(index)).\
                add_property(name='Pos', value=row[1])

        for mod, row in enumerate(rows, start=1):
            # dependency
            try:
                instance.add_dependency(start=int(row[2]),
                                        end=mod,
                                        label=row[3],
                                        edge_type='dep')
            except (ValueError, IndexError, KeyError):
                print('Can\'t parse dependency', file=sys.stderr)
                instance.tokens[mod].add_property('DepMissing',
                                                  'missing')  # role
        return instance
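
A hypothetical rows value for the MaltTab-style layout assumed above (word, POS, head index, deprel; head 0 points at the separately added root token):

    rows = [['John', 'NNP', '2', 'SUB'],
            ['runs', 'VBZ', '0', 'ROOT']]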
Example #12
    def load(self, file_name: str, _, __):
        """
         * Loads a corpus from a file, starting at instance <code>from</code> and ending at instance <code>to</code>
         * (exclusive). This method is required to call
         * {@link com.googlecode.whatswrong.io.CorpusFormat.Monitor#progressed(int)}
         * after each instance that was processed.
         *
         * @param file the file to load the corpus from.
         * @param from the starting instance index.
         * @param to   the end instance index.
         * @return a list of NLP instances loaded from the given file in the given interval.
         * @throws java.io.IOException if I/O goes wrong.
        """
        result = []
        with open(file_name, encoding='UTF-8') as reader:
            instance = None
            source_length = -1
            target_length = -1
            for line in reader:
                line = line.strip()
                if line.startswith('<source>'):
                    content = line[8:-9]  # strip '<source>' and '</source>'
                    for token in content.split():
                        instance.add_token().add_property('word', token)

                    source_length = len(instance.tokens)
                    instance.split_point = source_length
                elif line.startswith('<seg'):
                    instance = NLPInstance(render_type=RenderType.alignment)
                elif line.startswith('<translation>'):
                    content = line[13:-14]  # strip '<translation>' and '</translation>'
                    for token in content.split():
                        instance.add_token().add_property('word', token)

                    target_length = len(instance.tokens) - source_length
                elif line.startswith('<matrix>'):
                    check_eof(reader.readline())
                    for tgt in range(target_length):
                        line = check_eof(reader.readline()).strip()
                        col = line.split()
                        for src in range(1, len(col)):
                            if col[src] == '1':
                                instance.add_edge(src - 1, tgt + source_length, 'align', 'align')

                    result.append(instance)

        return result
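
A hypothetical input fragment reconstructed from the branches above; the attribute names and the skipped first matrix row are assumptions:

    <seg id="1">
    <source>ein kleiner Hund</source>
    <translation>a little dog</translation>
    <matrix>
    ein kleiner Hund
    a 1 0 0
    little 0 1 0
    dog 0 0 1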
Example #13
    def load(self, file_name: str, from_sent_nr: int, to_sent_nr: int):

        result = []
        instance_nr = 0
        with open(file_name, encoding='UTF-8') as reader:

            for line in reader:
                line = line.strip()
                if line != '':
                    if instance_nr >= from_sent_nr:
                        tree = Tree('[root]')
                        tree.consume(tree, line)
                        tree = tree.children[0]
                        instance = NLPInstance()
                        tree.write_tokens(self.word, self.tag, instance)
                        tree.write_spans(self.phrase, self.tag, instance)
                        result.append(instance)

                    instance_nr += 1
                    if instance_nr >= to_sent_nr:
                        break

        return result
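
A hypothetical input line in the bracketed treebank style this loader consumes (one parse per line):

    (S (NP (DT the) (NN dog)) (VP (VBZ barks)))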
Example #14
    def create(self, rows):
        instance = NLPInstance()
        for index, row in enumerate(rows):
            instance.add_token().\
                add_property(name='Word', value=row[0]).\
                add_property(name='Index', value=str(index))

        predicate_count = 0
        for index, row in enumerate(rows):
            if row[1] != '-':
                sense = row[1]
                instance.add_span(index, index, sense, 'sense')

                self._extract_span04_05(rows, 2 + predicate_count, 'role',
                                        sense + ':', instance)

                predicate_count += 1
        return instance
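
A hypothetical rows value for the layout assumed above (word in column 0, predicate sense in column 1, one role column per predicate from column 2 on):

    rows = [['He',   '-',      '(A0*)'],
            ['eats', 'eat.01', '(V*)'],
            ['pie',  '-',      '(A1*)']]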
Example #15
    def filter(self, original: NLPInstance) -> NLPInstance:
        """Filter an NLP instance.

        Filters the tokens and then removes edges that have tokens which were
        filtered out. If self.collapse is true, it also filters out the edges
        and then removes tokens left without edges.

        Filters out all edges that don't have an allowed prefix and postfix
        type. Filters out all edges that don't have a label that contains one
        of the allowed label substrings. If the set of allowed substrings is
        empty then the original set of edges is returned as is.

        Note on types:
        tokens ({Token})
        old2new ({Token: Token})
        new2old ({Token: Token})
        updated_tokens ([Token])
        updated_edges ({Edge})
        updated_split_point (int)

        Args:
            original (NLPInstance): The original nlp instance.

        Returns:
            NLPInstance: The filtered NLPInstance.
        """
        # Filter edges by connecting token properties, edge label, edge type, edge property
        edges = {
            edge
            for edge in original.get_edges() if self._is_edge_allowed(edge)
        }

        # Only allow edges on the path of tokens having allowed props
        if self.use_path:
            edges = self._calculate_paths(edges)

        # Unless collapse is True, all tokens are shown!
        tokens = original.tokens

        # Filter tokens for edges
        if self.collapse:
            # Collapse tokens to the allowed edges
            tokens = set()
            for edge in edges:
                if edge.render_type == EdgeRenderType.dependency:
                    tokens.add(edge.start)
                    tokens.add(edge.end)
                elif edge.render_type == EdgeRenderType.span:
                    for i in range(edge.start.index, edge.end.index + 1):
                        tokens.add(original.get_token(i))

        # Token filter: reduce the tokens to the explicitly allowed ones (or keep all remaining)
        tokens = {
            token
            for token in tokens if self._token_has_allowed_prop(
                token, self.tok_allowed_token_propvals,
                self.tok_propvals_whole_word)
        }

        # XXX Why do we need to create new tokens?
        # Compute bidirectional mapping between the new and old indexes and create new tokens
        old2new, new2old, updated_tokens = {}, {}, []
        for i, token in enumerate(sorted(
                tokens,
                key=attrgetter('index'))):  # Note: the lowercase 'index' attribute, not the 'Index' property!
            new_tok = Token(i)
            new_tok.merge(
                original.tokens[token.index],
                forbidden_token_properties=self.forbidden_token_properties)
            old2new[token] = new_tok
            new2old[new_tok] = token
            updated_tokens.append(new_tok)

        # XXX Why do we need to create new edges?
        # Update edges and remove those that have vertices not in the new vertex set
        updated_edges = set()
        for edge in (e for e in edges
                     if e.start in old2new and e.end in old2new):
            updated_edges.add(
                Edge(start=old2new[edge.start],
                     end=old2new[edge.end],
                     label=edge.label,
                     note=edge.note,
                     edge_type=edge.edge_type,
                     render_type=edge.render_type,
                     description=edge.description,
                     properties=edge.properties))

        # Find new split points (have to be changed because instance has new token sequence)
        new_token_index = 0
        old_split_point = original.split_point
        new_tok = updated_tokens[new_token_index]
        old_token = new2old[new_tok]
        max_index_of_updated_tokens = len(updated_tokens) - 1
        while new_token_index < max_index_of_updated_tokens and old_token.index < old_split_point:
            new_token_index += 1
            new_tok = updated_tokens[new_token_index]
            old_token = new2old[new_tok]
        updated_split_point = new_token_index

        return NLPInstance(tokens=updated_tokens,
                           edges=updated_edges,
                           render_type=original.render_type,
                           split_point=updated_split_point)
Example #16
    def create(self, rows):
        instance = NLPInstance()
        instance.add_token().add_property(name='Word', value='-Root-')
        predicates = []
        for row in rows:
            instance.add_token().\
                add_property(name='Word', value=row[1]).\
                add_property(name='Index', value=row[0]).\
                add_property(name='Lemma', value=row[2]).\
                add_property(name='PLemma', value=row[3]).\
                add_property(name='PoS', value=row[4]).\
                add_property(name='PPoS', value=row[5]).\
                add_property(name='Feat', value=row[6]).\
                add_property(name='PFeat', value=row[7])
            if row[13] != '_':
                index = int(row[0])
                predicates.append(index)
                instance.add_span(index, index, row[13], 'sense')

        for row in rows:
            # dependency
            if row[8] != '_':
                instance.add_dependency(start=int(row[8]),
                                        end=int(row[0]),
                                        label=row[10],
                                        edge_type='dep')
            if row[9] != '_':
                instance.add_dependency(start=int(row[9]),
                                        end=int(row[0]),
                                        label=row[11],
                                        edge_type='pdep')
            # role
            for col in range(14, len(row)):
                label = row[col]
                if label != '_':
                    pred = predicates[col - 14]
                    arg = int(row[0])
                    # if arg != pred:
                    instance.add_dependency(start=pred,
                                            end=arg,
                                            label=label,
                                            edge_type='role')
        return instance
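
A hypothetical single-token rows value in the 14-plus-column CoNLL-2009-style layout this method assumes (ID, FORM, LEMMA, PLEMMA, POS, PPOS, FEAT, PFEAT, HEAD, PHEAD, DEPREL, PDEPREL, FILLPRED, PRED, APRED1, ...):

    rows = [['1', 'runs', 'run', 'run', 'VBZ', 'VBZ', '_', '_',
             '0', '0', 'ROOT', 'ROOT', 'Y', 'run.01']]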
Example #17
    def create(self, rows):
        instance = NLPInstance()
        instance.add_token().add_property('Word', '-Root-')
        predicates = []  # [int]
        for row in rows:
            instance.add_token().\
                add_property(name='Word', value=row[1]).\
                add_property(name='Index', value=row[0]).\
                add_property(name='Lemma', value=row[2]).\
                add_property(name='Pos', value=row[3]).\
                add_property(name='Split Form', value=row[5]).\
                add_property(name='Split Lemma', value=row[6]).\
                add_property(name='Split PoS', value=row[7])
            if row[10] != '_':
                index = int(row[0])
                predicates.append(index)
                instance.add_span(index, index, row[10], 'sense')

        for row in rows:
            # dependency
            if row[8] != '_':
                instance.add_dependency(int(row[8]), int(row[0]), row[9],
                                        'dep')
            # role
            for col in range(11, len(row)):
                label = row[col]
                if label != '_':
                    pred = predicates[col - 11]
                    arg = int(row[0])
                    # if arg != pred
                    instance.add_edge(start=pred,
                                      end=arg,
                                      label=label,
                                      edge_type='role')
        return instance
Example #18
    def load(self, file_name: str, from_sentence_nr: int, to_sentence_nr: int):
        with open(file_name, encoding='UTF-8') as reader:
            """
             * Skip past the next aligned segment pair in the given reader.
             *
             * @throws EndOfInputException if there was no aligned segment pair to skip because we're
             *         already at the end of the given reader
            """
            # There are three lines per segment pair.
            for _ in range(3 * from_sentence_nr):
                try:
                    check_eof(reader.readline())
                except EOFError:
                    break

            result = []  # [NLPInstance]
            for i in range(from_sentence_nr, to_sentence_nr):
                try:
                    """
                     * @return the next aligned segment pair, loaded from the given reader
                     *
                     * @throws EndOfInputException if no aligned segment pair could be loaded because we're already
                     *         at the end of the given reader
                    """
                    """
                     There are three lines per segment pair.

                     The first line gives the segment index, source and target lengths (which we can count
                     ourselves), and an alignment score. Skip this line (or throw an exception if there are no
                     more lines).
                    """
                    check_eof(reader.readline())

                    tokens = []
                    """
                     * a list of one-based {source-token-index, target-token-index} pairs
                    """
                    alignment_edges = []  # [(int, int)]

                    # The second line contains the source segment, tokenized, with no adornment.
                    tokens.append(check_eof(reader.readline()).strip().split())
                    tokens.append([])

                    """
                     The third line contains the tokens of the target segment, starting with the pseudo-token
                     "NULL", with each token followed by a whitespace-delimited list (in curly braces nested
                     in parentheses) of the 1-based indices of the source tokens aligned to it, e.g.:

                     NULL ({ 2 }) customization ({ 1 }) of ({ }) tasks ({ 3 4 })
                    """
                    # Strip newline and space and reappend space for later regex
                    line = check_eof(reader.readline()).rstrip() + ' '

                    # start from index 1 to skip the NULL token and empty string at the EOL
                    for ind, token_with_aligned_indices in enumerate(line.split(' }) ')[1:-1], start=1):
                        splitted1, splitted2 = token_with_aligned_indices.split(' ({')
                        tokens[1].append(splitted1)
                        aligned_index_list_as_string = splitted2.strip()

                        """
                         we need to handle the empty list specially, because the split method on the empty
                         string returns a singleton array containing the empty string, but here an empty
                         array is what we want
                        """
                        aligned_indices_as_strings = []
                        if len(aligned_index_list_as_string) > 0:
                            aligned_indices_as_strings = aligned_index_list_as_string.split(' ')

                        for aligned_index_as_string in aligned_indices_as_strings:
                            alignment_edges.append((int(aligned_index_as_string), ind))

                    # now we're ready to make the NLPInstance
                    instance = NLPInstance(render_type=RenderType.alignment)
                    if self._reverseCheckBox:
                        self.make_instance(instance, tokens[1], tokens[0], ((e2, e1) for e1, e2 in alignment_edges))
                    else:
                        self.make_instance(instance, tokens[0], tokens[1], alignment_edges)

                    result.append(instance)
                except EOFError:
                    break

        return result
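
A minimal, self-contained sketch of the third-line parsing above, using the example line from the comment:

    line = 'NULL ({ 2 }) customization ({ 1 }) of ({ }) tasks ({ 3 4 }) '
    edges = []
    for ind, tok in enumerate(line.split(' }) ')[1:-1], start=1):
        word, indices = tok.split(' ({')
        for src in indices.split():
            edges.append((int(src), ind))
    print(edges)  # [(1, 1), (3, 3), (4, 3)]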
Example #19
 def write_tokens(self, word_type: str, tag_type: str, instance: NLPInstance):
     instance.add_token().add_property(word_type, self.label).add_property('Index', str(self.index))
Example #20
    def update_canvas(self, curr_sent_index: int):
        """ Updates the canvas based on the current state of the navigator."""
        if self._selected_gold is not None:
            if self._selected_guess is not None:
                instance = nlp_diff(
                    self._gold_corpora[self._selected_gold][curr_sent_index],
                    self._guess_corpora[self._selected_guess][curr_sent_index],
                    'eval_status_Match', 'eval_status_FN', 'eval_status_FP')
            else:
                instance = self._gold_corpora[
                    self._selected_gold][curr_sent_index]
            self.canvas.set_nlp_instance(instance)
        else:

            example = NLPInstance()
            example.add_token().add_property('Word', '[root]').add_property(
                'Index', '0')
            example.add_token().add_property('Word',
                                             'Add').add_property('Index', '1')
            example.add_token().add_property('Word',
                                             'a').add_property('Index', '2')
            example.add_token().add_property('Word', 'gold').add_property(
                'Index', '3')
            example.add_token().add_property('Word', 'corpus').add_property(
                'Index', '4')
            example.add_token().add_property('Word',
                                             '!').add_property('Index', '5')
            example.add_dependency(0, 1, 'ROOT', 'dep')
            example.add_dependency(0, 5, 'PUNC', 'dep')
            example.add_dependency(1, 4, 'OBJ', 'dep')
            example.add_dependency(4, 2, 'DET', 'dep')
            example.add_dependency(4, 3, 'MOD', 'dep')
            example.add_dependency(1, 4, 'A1', 'role')
            self.canvas.set_nlp_instance(example)
            self.canvas.filter.allowed_edge_types = set()
            self.canvas.filter.allowed_edge_types.add('dep')
            self.canvas.filter.allowed_edge_types.add('role')
            self.canvas.filter.allowed_edge_types.add('sense')
            self.canvas.filter.allowed_edge_types.add('ner')
            self.canvas.filter.allowed_edge_types.add('chunk')
            self.canvas.filter.allowed_edge_types.add('pos')
            self.canvas.filter.allowed_edge_types.add('align')

            self.canvas.filter.allowed_edge_properties.add('eval_status_FP')
            self.canvas.filter.allowed_edge_properties.add('eval_status_FN')
            self.canvas.filter.allowed_edge_properties.add('eval_status_Match')

            self.canvas.renderer.params['span.orders'] = {
                'pos': 0,
                'chunk (BIO)': 1,
                'chunk': 2,
                'ner (BIO)': 2,
                'ner': 3,
                'sense': 4,
                'role': 5,
                'phrase': 5
            }
        self.canvas.fire_instance_changed()
Example #21
    def load(self, file_name: str, _, __):
        """
         * Loads files from the given directory with the extensions specified by the text fields of the accessory.
         *
         * @param file the directory load the corpus from.
         * @param from the starting instance index.
         * @param to   the end instance index.
         * @return a list of NLP instances loaded from the given file in the given interval.
         * @throws java.io.IOException if I/O goes wrong.
        """
        result = []
        for txt_file_name in glob.glob(os.path.join(file_name, '*.' + self.txtExtensionField.strip())):
            filename = os.path.abspath(txt_file_name)
            prefix = filename.rsplit('.', maxsplit=1)[0]
            protein_file_name = '{0}.{1}'.format(prefix, self.proteinExtensionField.strip())
            event_file_name = '{0}.{1}'.format(prefix, self.eventExtensionField.strip())
            if os.path.exists(protein_file_name) and os.path.exists(event_file_name):
                """
                 * Loads all NLPInstances in the specified files. Creates one instance.
                 *
                 * @param txt_file_name     the text file
                 * @param protein_file_name the file with protein annotations
                 * @param event_file_name   the file with event annotations
                 * @return NLPInstance that represents the given text and annotations
                 * @throws IOException if IO goes wrong.
                """
                char_to_token = {}
                instance = NLPInstance()
                with open(txt_file_name, encoding='UTF-8') as reader:
                    current_token = instance.add_token()
                    current_token_content = ''
                    for current_index, character in enumerate(iter(functools.partial(reader.read, 1), '')):
                        char_to_token[current_index] = current_token
                        if character == ' ' or character == '\n':
                            if len(current_token_content) > 0:
                                current_token.add_property('Word', current_token_content)
                                current_token.add_property('Index', str(len(instance.tokens) - 1))
                                current_token_content = ''
                                current_token = instance.add_token()

                        else:
                            current_token_content += character

                id2token = {}
                with open(protein_file_name, encoding='UTF-8') as reader:
                    for line in reader.readlines():
                        split = line.strip().split()
                        if split[0].startswith('T'):
                            elem_id = split[0]
                            elem_type = split[1]
                            elem_from = int(split[2])
                            elem_to = int(split[3])
                            from_token = char_to_token[elem_from]
                            to_token = char_to_token[elem_to]
                            instance.add_edge(from_token.index, to_token.index, elem_type, 'protein',
                                              EdgeRenderType.span)
                            id2token[elem_id] = to_token

                with open(event_file_name, encoding='UTF-8') as reader:
                    # get event mentions and locations etc.
                    for line in reader.readlines():
                        split = line.strip().split()
                        elem_id = split[0]
                        if elem_id.startswith('T'):
                            elem_type = split[1]
                            elem_from = int(split[2])
                            elem_to = int(split[3])
                            from_token = char_to_token[elem_from]
                            to_token = char_to_token[elem_to]
                            if elem_type == 'Entity':
                                term_class = 'entity'
                            else:
                                term_class = 'event'
                            instance.add_edge(from_token.index, to_token.index, elem_type, term_class,
                                              EdgeRenderType.span)
                            id2token[elem_id] = to_token
                        elif elem_id.startswith('E'):
                            type_and_mention_id = split[1].split(':')
                            event_token = id2token[type_and_mention_id[1]]
                            id2token[elem_id] = event_token

                with open(event_file_name, encoding='UTF-8') as reader:
                    # now create the event roles
                    for line in reader.readlines():
                        split = line.split()
                        elem_id = split[0]
                        if elem_id.startswith('E'):
                            event_token = id2token[elem_id]
                            for elem in split[2:]:
                                role_and_id = elem.split(':')
                                arg_token = id2token.get(role_and_id[1])
                                if arg_token is None:
                                    raise RuntimeError(
                                        'There seems to be no mention associated with id {0} for event {1} in'
                                        ' file {2}'.format(role_and_id[1], elem_id, event_file_name))
                                instance.add_edge(event_token.index, arg_token.index, role_and_id[0], 'role',
                                                  EdgeRenderType.dependency, note=elem_id)
                result.append(instance)
        return result
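
Hypothetical annotation lines in the BioNLP-shared-task style this loader parses (the ids, offsets and labels are made up). Protein file:

    T1	Protein 0 4	IL-4

Event file:

    T2	Positive_regulation 10 19	increases
    E1	Positive_regulation:T2 Theme:T1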