예제 #1
0
    def create(self, rows):
        """Build an NLPInstance from CoNLL-style rows (index, word, lemma, pos, deps)."""
        instance = NLPInstance()
        # Artificial root token so dependency heads can point at index 0.
        instance.add_token().add_property('Word', '-Root-')
        for fields in rows:
            token = instance.add_token()
            token.add_property(name='Word', value=fields[1])
            token.add_property(name='Index', value=fields[0])
            token.add_property(name='Lemma', value=fields[2])
            token.add_property(name='CPos', value=fields[3])
            token.add_property(name='Pos', value=fields[4])
            token.add_property(name='Feats', value=fields[5])

        for fields in rows:
            # dependency
            dependent = int(fields[0])
            try:
                instance.add_dependency(start=int(fields[6]),
                                        end=dependent,
                                        label=fields[7],
                                        edge_type='dep')
            except (ValueError, IndexError, KeyError):
                # Mark the token instead of aborting when a head/label is bad.
                print('Can\'t parse dependency', file=sys.stderr)
                instance.tokens[dependent].add_property('DepMissing', 'missing')
            # role
        return instance
예제 #2
0
    def create(self, rows):
        """Build an NLPInstance from rows of 'word|tag|category' dependency data."""
        instance = NLPInstance()
        sentence = rows[0]
        # Skip <s> and dep count
        for position in range(2, len(sentence)):
            parts = sentence[position].split('|')
            token = instance.add_token()
            token.add_property(name='Word', value=parts[0])
            token.add_property(name='Tag', value=parts[1])
            token.add_property(name='Category', value=parts[2])
            token.add_property(name='Index', value=str(position - 1))
        # instance.add_token().add_property('Word', '-Root-')

        mod = 1
        for row in rows:
            if row[0] not in ('<s>', '<\s>'):
                # dependency
                try:
                    instance.add_dependency(start=int(row[1]),
                                            end=int(row[0]),
                                            label=row[2] + '_' + row[3],
                                            edge_type='dep')
                except (ValueError, IndexError, KeyError):
                    # Mark the token instead of aborting on a malformed row.
                    print('Can\'t parse dependency', file=sys.stderr)
                    instance.tokens[mod].add_property('DepMissing', 'missing')
                mod += 1
        return instance
예제 #3
0
    def create(self, rows):
        """Build an NLPInstance from CoNLL-2008 rows (words, senses, role columns)."""
        instance = NLPInstance()
        # Artificial root token so heads can reference index 0.
        instance.add_token().add_property('Word', '-Root-')
        predicates = []  # token indices of predicate rows, in column order
        for fields in rows:
            token = instance.add_token()
            token.add_property(name='Word', value=fields[1])
            token.add_property(name='Index', value=fields[0])
            token.add_property(name='Lemma', value=fields[2])
            token.add_property(name='Pos', value=fields[3])
            token.add_property(name='Split Form', value=fields[5])
            token.add_property(name='Split Lemma', value=fields[6])
            token.add_property(name='Split PoS', value=fields[7])
            if fields[10] != '_':
                position = int(fields[0])
                predicates.append(position)
                instance.add_span(position, position, fields[10], 'sense')

        for fields in rows:
            # dependency
            if fields[8] != '_':
                instance.add_dependency(int(fields[8]), int(fields[0]), fields[9],
                                        'dep')
            # role: one extra column per predicate collected above
            for column in range(11, len(fields)):
                role_label = fields[column]
                if role_label != '_':
                    # if arg != pred
                    instance.add_edge(start=predicates[column - 11],
                                      end=int(fields[0]),
                                      label=role_label,
                                      edge_type='role')
        return instance
예제 #4
0
    def create(self, rows):
        """Build an NLPInstance from CoNLL-2004/05-style rows (word + SRL columns)."""
        instance = NLPInstance()
        for position, fields in enumerate(rows):
            token = instance.add_token()
            token.add_property(name='Word', value=fields[0])
            token.add_property(name='Index', value=str(position))

        predicate_count = 0
        for position, fields in enumerate(rows):
            try:
                if fields[9] == '-':
                    continue
                # Predicate sense: lemma column + '.' + sense number.
                sense = fields[10] + '.' + fields[9]
                instance.add_span(position, position, sense, 'sense')

                self._extract_span04_05(rows, 11 + predicate_count, 'role',
                                        sense + ':', instance)

                predicate_count += 1
            except IndexError:
                # A short row means the file is not in the expected format.
                print('Can\'t parse file: not enough (10) column in row {0}'.
                      format(fields),
                      file=sys.stderr)
                sys.exit(1)

        return instance
예제 #5
0
    def create(self, rows):
        """Build an NLPInstance from rows of word + BIO named-entity tag."""
        instance = NLPInstance()
        for position, fields in enumerate(rows):
            token = instance.add_token()
            token.add_property(name='Word', value=fields[0])
            token.add_property(name='Index', value=str(position))
            # Show the raw BIO tag on each token as well.
            instance.add_span(position, position, fields[1], 'ner (BIO)')

        # Collapse the BIO encoding into proper 'ner' spans.
        self._extract_span00_02(rows=rows,
                                column=1,
                                field_type='ner',
                                instance=instance)

        return instance
예제 #6
0
    def load(self, file_name: str, _, __):
        """Load an alignment corpus from a file of <seg>/<source>/<translation>/<matrix> blocks.

        Source tokens are added first; translation tokens follow after the
        split point, and each '1' cell in the alignment matrix produces an
        'align' edge from a source token to a target token.

        The from/to range parameters of the interface are accepted but
        ignored (``_``/``__``): the whole file is loaded.

        :param file_name: path of the file to load the corpus from.
        :return: a list of NLPInstance objects, one per <seg> block.
        """
        result = []
        with open(file_name, encoding='UTF-8') as reader:
            instance = None
            source_length = -1
            target_length = -1
            for line in reader:
                line = line.strip()
                if line.startswith('<source>'):
                    # Strip the '<source>'/'</source>' tags (8 and 9 chars).
                    content = line.strip()[8: len(line) - 9]
                    for token in content.split():
                        instance.add_token().add_property('word', token)

                    source_length = len(instance.tokens)
                    # Tokens before the split point are source-side.
                    instance.split_point = source_length
                elif line.startswith('<seg'):
                    # A new sentence pair begins; <seg> precedes <source>.
                    instance = NLPInstance(render_type=RenderType.alignment)
                elif line.startswith('<translation>'):
                    # Strip '<translation>'/'</translation>' (13 and 14 chars).
                    content = line.strip()[13: len(line) - 14]
                    for token in content.split():
                        instance.add_token().add_property('word', token)

                    target_length = len(instance.tokens) - source_length
                elif line.startswith('<matrix>'):
                    # First matrix line is skipped -- presumably a header or
                    # NULL-word row; confirm against the corpus format.
                    check_eof(reader.readline())
                    for tgt in range(target_length):
                        line = check_eof(reader.readline()).strip()
                        col = line.split()
                        # Column 0 is likewise skipped (src runs from 1).
                        for src in range(1, len(col)):
                            if col[src] == '1':
                                instance.add_edge(src - 1, tgt + source_length, 'align', 'align')

                    result.append(instance)

        return result
예제 #7
0
    def create_open(rows):
        """Build an NLPInstance from 'open' format rows (NE / BBN NE / WordNet + malt deps).

        :param rows: iterable of column lists; columns 0-2 are entity/WordNet
            annotations, column 3 is the head index and column 4 the label.
        :return: the built NLPInstance.
        """
        instance = NLPInstance()
        # Placeholder root token at index 0 so heads can point to it.
        instance.add_token()
        for row in rows:
            instance.add_token(). \
                add_property('Named Entity', row[0], 10). \
                add_property('NamedEntity BBN', row[1], 11). \
                add_property('WordNet', row[2], 12)

        for index, row in enumerate(rows, start=1):
            # dependency
            instance.add_edge(start=int(row[3]),
                              end=index,
                              label=row[4],
                              edge_type='malt')
        # Bug fix: previously returned the loop index instead of the built
        # instance, unlike every other create* factory in this module.
        return instance
예제 #8
0
    def create(self, rows):
        """Build an NLPInstance from rows of word + predicate sense + role columns."""
        instance = NLPInstance()
        for position, fields in enumerate(rows):
            token = instance.add_token()
            token.add_property(name='Word', value=fields[0])
            token.add_property(name='Index', value=str(position))

        predicate_count = 0
        for position, fields in enumerate(rows):
            sense = fields[1]
            if sense == '-':
                continue
            instance.add_span(position, position, sense, 'sense')

            # Role columns start after the sense column, one per predicate.
            self._extract_span04_05(rows, 2 + predicate_count, 'role',
                                    sense + ':', instance)

            predicate_count += 1
        return instance
예제 #9
0
    def create(self, rows):
        """Build an NLPInstance from CoNLL-2009 rows (gold + predicted columns)."""
        instance = NLPInstance()
        # Artificial root token so heads can reference index 0.
        instance.add_token().add_property(name='Word', value='-Root-')
        predicates = []  # token indices of predicate rows, in column order
        for fields in rows:
            token = instance.add_token()
            token.add_property(name='Word', value=fields[1])
            token.add_property(name='Index', value=fields[0])
            token.add_property(name='Lemma', value=fields[2])
            token.add_property(name='PLemma', value=fields[3])
            token.add_property(name='PoS', value=fields[4])
            token.add_property(name='PPoS', value=fields[5])
            token.add_property(name='Feat', value=fields[6])
            token.add_property(name='PFeat', value=fields[7])
            if fields[13] != '_':
                position = int(fields[0])
                predicates.append(position)
                instance.add_span(position, position, fields[13], 'sense')

        for fields in rows:
            # gold dependency
            if fields[8] != '_':
                instance.add_dependency(start=int(fields[8]),
                                        end=int(fields[0]),
                                        label=fields[10],
                                        edge_type='dep')
            # predicted dependency
            if fields[9] != '_':
                instance.add_dependency(start=int(fields[9]),
                                        end=int(fields[0]),
                                        label=fields[11],
                                        edge_type='pdep')
            # role: one extra column per predicate collected above
            for column in range(14, len(fields)):
                role_label = fields[column]
                if role_label != '_':
                    # if arg != pred:
                    instance.add_dependency(start=predicates[column - 14],
                                            end=int(fields[0]),
                                            label=role_label,
                                            edge_type='role')
        return instance
예제 #10
0
    def create(self, rows):
        """Build an NLPInstance from rows of word, pos, head index, dep label."""
        instance = NLPInstance()
        # Artificial root token at index 0.
        instance.add_token().add_property(name='Word', value='-Root-')
        for position, fields in enumerate(rows, start=1):
            token = instance.add_token()
            token.add_property(name='Word', value=fields[0])
            token.add_property(name='Index', value=str(position))
            token.add_property(name='Pos', value=fields[1])

        for dependent, fields in enumerate(rows, start=1):
            # dependency
            try:
                instance.add_dependency(start=int(fields[2]),
                                        end=dependent,
                                        label=fields[3],
                                        edge_type='dep')
            except (ValueError, IndexError, KeyError):
                # Mark the token instead of aborting on a bad head/label.
                print('Can\'t parse dependency', file=sys.stderr)
                instance.tokens[dependent].add_property('DepMissing',
                                                        'missing')  # role
        return instance
    def update_canvas(self, curr_sent_index: int):
        """Update the canvas from the navigator's current selection.

        With a gold corpus selected, shows the sentence at curr_sent_index,
        diffed against the guess corpus when one is selected too; otherwise
        renders a built-in example sentence and resets the edge filters.
        """
        if self._selected_gold is None:
            # Nothing selected yet: show a canned example sentence.
            example = NLPInstance()
            for position, word in enumerate(
                    ('[root]', 'Add', 'a', 'gold', 'corpus', '!')):
                example.add_token().add_property('Word', word).add_property(
                    'Index', str(position))
            example.add_dependency(0, 1, 'ROOT', 'dep')
            example.add_dependency(0, 5, 'PUNC', 'dep')
            example.add_dependency(1, 4, 'OBJ', 'dep')
            example.add_dependency(4, 2, 'DET', 'dep')
            example.add_dependency(4, 3, 'MOD', 'dep')
            example.add_dependency(1, 4, 'A1', 'role')
            self.canvas.set_nlp_instance(example)

            # Reset the edge-type filter to the standard set.
            self.canvas.filter.allowed_edge_types = set()
            for edge_type in ('dep', 'role', 'sense', 'ner', 'chunk',
                              'pos', 'align'):
                self.canvas.filter.allowed_edge_types.add(edge_type)

            # Allow all evaluation-status properties through the filter.
            for edge_property in ('eval_status_FP', 'eval_status_FN',
                                  'eval_status_Match'):
                self.canvas.filter.allowed_edge_properties.add(edge_property)

            # Fixed vertical ordering of span rows in the renderer.
            self.canvas.renderer.params['span.orders'] = {
                'pos': 0,
                'chunk (BIO)': 1,
                'chunk': 2,
                'ner (BIO)': 2,
                'ner': 3,
                'sense': 4,
                'role': 5,
                'phase': 5
            }
        else:
            if self._selected_guess is None:
                instance = self._gold_corpora[
                    self._selected_gold][curr_sent_index]
            else:
                # Compare gold vs. guess and tag edges with their eval status.
                instance = nlp_diff(
                    self._gold_corpora[self._selected_gold][curr_sent_index],
                    self._guess_corpora[self._selected_guess][curr_sent_index],
                    'eval_status_Match', 'eval_status_FN', 'eval_status_FP')
            self.canvas.set_nlp_instance(instance)
        self.canvas.fire_instance_changed()
예제 #12
0
    def load(self, file_name: str, _, __):
        """Load a corpus from a directory of standoff-annotated text files.

        For every text file in the directory (extension taken from the
        accessory text field) that has matching protein and event annotation
        files with the same basename, builds one NLPInstance combining the
        text tokens with protein/event span edges and event role edges.

        The from/to range parameters of the interface are accepted but
        ignored (``_``/``__``): every matching file triple is loaded.

        :param file_name: the directory to load the corpus from.
        :return: a list of NLPInstance objects, one per annotated document.
        :raises RuntimeError: if an event role references an unknown mention id.
        """
        result = []
        for txt_file_name in glob.glob(os.path.join(file_name, '*.' + self.txtExtensionField.strip())):
            filename = os.path.abspath(txt_file_name)
            prefix = filename.rsplit('.', maxsplit=1)[0]
            protein_file_name = '{0}.{1}'.format(prefix, self.proteinExtensionField.strip())
            event_file_name = '{0}.{1}'.format(prefix, self.eventExtensionField.strip())
            if os.path.exists(protein_file_name) and os.path.exists(event_file_name):
                """
                 * Loads all NLPInstances in the specified files. Creates one instance.
                 *
                 * @param txt_file_name     the text file
                 * @param protein_file_name the file with protein annotations
                 * @param event_file_name   the file with event annotations
                 * @return NLPInstance that represents the given text and annotations
                 * @throws IOException if IO goes wrong.
                """
                # Maps each character offset in the text to the token being
                # built at that offset, so standoff offsets can be resolved.
                char_to_token = {}
                instance = NLPInstance()
                with open(txt_file_name, encoding='UTF-8') as reader:
                    current_token = instance.add_token()
                    current_token_content = ''
                    # Read one character at a time; space/newline ends a token.
                    for current_index, character in enumerate(iter(functools.partial(reader.read, 1), '')):
                        char_to_token[current_index] = current_token
                        if character == ' ' or character == '\n':
                            if len(current_token_content) > 0:
                                current_token.add_property('Word', current_token_content)
                                current_token.add_property('Index', str(len(instance.tokens) - 1))
                                current_token_content = ''
                                current_token = instance.add_token()

                        else:
                            current_token_content += character

                # Maps annotation ids ('T...'/'E...') to their end token.
                id2token = {}
                with open(protein_file_name, encoding='UTF-8') as reader:
                    for line in reader.readlines():
                        split = line.strip().split()
                        if split[0].startswith('T'):
                            # Text-bound mention: id, type, from-offset, to-offset.
                            elem_id = split[0]
                            elem_type = split[1]
                            elem_from = int(split[2])
                            elem_to = int(split[3])
                            from_token = char_to_token[elem_from]
                            # NOTE(review): elem_to looks like an exclusive end
                            # offset, so this may resolve to the token *after*
                            # the mention -- confirm against the annotation spec.
                            to_token = char_to_token[elem_to]
                            instance.add_edge(from_token.index, to_token.index, elem_type, 'protein',
                                              EdgeRenderType.span)
                            id2token[elem_id] = to_token

                with open(event_file_name, encoding='UTF-8') as reader:
                    # get event mentions and locations etc.
                    for line in reader.readlines():
                        split = line.strip().split()
                        elem_id = split[0]
                        if elem_id.startswith('T'):
                            elem_type = split[1]
                            elem_from = int(split[2])
                            elem_to = int(split[3])
                            from_token = char_to_token[elem_from]
                            to_token = char_to_token[elem_to]
                            if elem_type == 'Entity':
                                term_class = 'entity'
                            else:
                                term_class = 'event'
                            instance.add_edge(from_token.index, to_token.index, elem_type, term_class,
                                              EdgeRenderType.span)
                            id2token[elem_id] = to_token
                        elif elem_id.startswith('E'):
                            # Event line 'E<n> <type>:<trigger-id> ...': map the
                            # event id to its trigger mention's token.
                            type_and_mention_id = split[1].split(':')
                            even_token = id2token[type_and_mention_id[1]]
                            id2token[elem_id] = even_token

                with open(event_file_name, encoding='UTF-8') as reader:
                    # now create the event roles (second pass over the event
                    # file, so events referencing other events resolve via the
                    # fully populated id2token map)
                    for line in reader.readlines():
                        split = line.split()
                        elem_id = split[0]
                        if elem_id.startswith('E'):
                            even_token = id2token[elem_id]
                            for elem in split[2:]:
                                role_and_id = elem.split(':')
                                arg_token = id2token.get(role_and_id[1])
                                if arg_token is None:
                                    raise RuntimeError(
                                        'There seems to be no mention associated with id {0} for event {1} in'
                                        ' file {2}'.format(role_and_id[1], elem_id, event_file_name))
                                instance.add_edge(even_token.index, arg_token.index, role_and_id[0], 'role',
                                                  EdgeRenderType.dependency, note=elem_id)
                result.append(instance)
        return result
예제 #13
0
 def write_tokens(self, word_type: str, tag_type: str, instance: NLPInstance):
     """Append this node's label and index as a new token on the instance.

     tag_type is accepted for interface compatibility but not used here.
     """
     new_token = instance.add_token()
     new_token.add_property(word_type, self.label)
     new_token.add_property('Index', str(self.index))