Example #1
    def tag(self, seq):
        from io_utils.conll import ConllEntry
        dy.renew_cg()
        new_seq = []
        index = 1
        for entry in seq:
            if not entry.is_compound_entry:
                ce_out, encoder_states = self._predict_is_compound_entry(
                    unicode(entry.word, 'utf-8'), runtime=True)
                # class 0: the word is not a compound, keep it as-is
                if np.argmax(ce_out.npvalue()) == 0:
                    entry.index = index
                    new_seq.append(entry)
                    index += 1
                else:
                    compounds = self._transduce(unicode(entry.word, 'utf-8'),
                                                encoder_states)
                    # a compound of N parts spans index .. index + N - 1
                    entry.index = str(index) + '-' + str(index +
                                                         len(compounds) - 1)
                    new_seq.append(entry)
                    for word in compounds:
                        entry = ConllEntry(index, word.encode('utf-8'),
                                           word.encode('utf-8'), '_', '_', '_',
                                           '0', '_', '_', '')
                        new_seq.append(entry)
                        index += 1

        return new_seq
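
A note on the range written into entry.index: in CoNLL-U, a multiword token of N parts starting at index spans index through index + N - 1. A minimal standalone sketch of that arithmetic (illustrative only, matching the corrected computation in Example #4 below):

def compound_range(start_index, parts):
    # CoNLL-U multiword token line, e.g. start_index=3 with 2 parts -> "3-4"
    return str(start_index) + '-' + str(start_index + len(parts) - 1)

assert compound_range(3, ["de", "le"]) == "3-4"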
Example #2
    def _get_tokens(self, input_string, space_after_end_of_sentence=True):
        # print("\n")
        # print(input_string)
        tokens = []
        y_pred, _, _ = self._predict_tok(input_string, runtime=True)
        index = 0
        w = ""

        for i in range(len(input_string)):
            w += input_string[i]
            # label 1 marks the last character of a token
            if np.argmax(y_pred[i].npvalue()) == 1:
                if w.strip() != "":
                    index += 1
                    space_after = "SpaceAfter=No"
                    if i < len(input_string) - 1:
                        if input_string[i + 1] in string.whitespace:
                            space_after = "_"
                    entry = ConllEntry(index,
                                       str(w).strip(),
                                       '_',
                                       "_",
                                       "_",
                                       "_",
                                       0,
                                       "_",
                                       "_",
                                       space_after=space_after)
                    tokens.append(entry)
                    w = ""

        if w.strip() != "":
            index += 1
            entry = ConllEntry(index,
                               str(w).strip(), '_', "_", "_", "_", 0, "_", "_",
                               space_after="_")
            tokens.append(entry)

        # the final token's SpaceAfter flag depends on the caller's hint
        if len(tokens) > 0:
            tokens[-1].space_after = (
                "_" if space_after_end_of_sentence else "SpaceAfter=No")

        return tokens
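
The core of _get_tokens is a character-level split on predicted boundary labels. A self-contained sketch of the same logic, with a plain boundaries list standing in for the network's per-character predictions (names here are illustrative, not from the library):

def split_on_boundaries(text, boundaries):
    # boundaries[i] == 1 marks the last character of a token
    tokens, w = [], ""
    for ch, b in zip(text, boundaries):
        w += ch
        if b == 1 and w.strip() != "":
            tokens.append(w.strip())
            w = ""
    if w.strip() != "":
        tokens.append(w.strip())
    return tokens

print(split_on_boundaries("Hello world!", [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1]))
# ['Hello', 'world', '!']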
Example #3
    def _get_tokens(self, input_string):
        tokens = []
        y_pred, _, _ = self._predict_tok(input_string, runtime=True)
        index = 0
        w = ""
        for char, y in zip(input_string, y_pred):
            w += char
            # label 1 marks the last character of a token
            if np.argmax(y.npvalue()) == 1:
                if w.strip() != "":
                    index += 1
                    entry = ConllEntry(index,
                                       w.strip().encode('utf-8'), '_', "_",
                                       "_", "_", 0, "_", "_", "")
                    tokens.append(entry)
                    w = ""
        if w.strip() != "":
            index += 1
            entry = ConllEntry(index,
                               w.strip().encode('utf-8'), '_', "_", "_", "_",
                               0, "_", "_", "")
            tokens.append(entry)

        return tokens
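
Example #3 omits the SpaceAfter detection that Example #2 computes inline. Isolated, that check is a one-character lookahead into the raw string; a sketch, assuming i indexes the token's last character:

import string

def space_after_flag(text, i):
    # CoNLL-U MISC value: "_" if whitespace follows position i,
    # otherwise "SpaceAfter=No" (including at end of input)
    if i < len(text) - 1 and text[i + 1] in string.whitespace:
        return "_"
    return "SpaceAfter=No"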
Example #4
    def tag(self, seq):
        from io_utils.conll import ConllEntry
        dy.renew_cg()
        new_seq = []
        index = 1
        for entry in seq:
            if not entry.is_compound_entry:
                ce_out, encoder_states = self._predict_is_compound_entry(
                    unicode(entry.word, 'utf-8'), runtime=True)
                if np.argmax(ce_out.npvalue()) == 0:
                    entry.index = index
                    new_seq.append(entry)
                    index += 1
                else:
                    compounds = self._transduce(unicode(entry.word, 'utf-8'),
                                                encoder_states)
                    # _transduce may return empty tokens; drop them
                    compounds = [t for t in compounds if t.strip() != ""]
                    if len(compounds) <= 1:
                        entry.index = index
                        new_seq.append(entry)
                        index += 1
                    else:
                        entry.index = str(index) + '-' + str(index +
                                                             len(compounds) -
                                                             1)
                        entry.is_compound_entry = True
                        entry.upos = '_'
                        entry.xpos = '_'
                        entry.attrs = '_'
                        entry.label = '_'
                        entry.head = '_'
                        entry.deps = '_'
                        new_seq.append(entry)
                        for word in compounds:
                            entry = ConllEntry(index, word.encode('utf-8'),
                                               word.encode('utf-8'), '_', '_',
                                               '_', '0', '_', '_', '')
                            new_seq.append(entry)
                            index += 1

        return new_seq
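
The empty-token guard in Example #4 can be stated as a small pure function: drop blank transducer outputs and treat anything with fewer than two surviving parts as not a compound. A hypothetical helper, not part of the library:

def normalize_compounds(parts):
    # keep only non-blank parts; a real compound needs at least two
    parts = [p for p in parts if p.strip() != ""]
    return parts if len(parts) > 1 else []

assert normalize_compounds(["", "de", "le"]) == ["de", "le"]
assert normalize_compounds(["word"]) == []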
Example #5
    def tokenize(self, input_string):
        # input_string may contain several sentences; the output is a
        # list of sentences in CoNLL-U format
        import sys
        if sys.version_info[0] == 2:
            # Python 2 strings must be decoded to unicode explicitly
            uni_string = unicode(input_string, 'utf-8')
        else:
            uni_string = input_string
        offset = 0
        sentences = []
        last_proc = 0
        while offset < len(uni_string):
            # report progress in 5% increments
            proc = (offset + 1) * 100 / len(uni_string)

            while last_proc + 5 < proc:
                last_proc += 5
                sys.stdout.write(" " + str(last_proc))
                sys.stdout.flush()

            # print("Total len = "+str(len(uni_string)))
            # print("Current offset = "+str(offset))
            window = 0
            while True:  # extend window until we find an end of sentence (SX)
                window += self.config.tokenize_maximum_sequence_length
                X = uni_string[offset:min(len(uni_string), offset + window)]
                # print("    X len = "+str(len(X)))
                softmax, _, _ = self._predict(X)
                # print("    Softmax len = "+str(len(softmax)))
                # convert to labels
                labels = [
                    self.decoder_output_i2c[np.argmax(s.npvalue())]
                    for s in softmax
                ]
                # print("    Predicted label len = "+str(len(labels)))
                if "SX" in labels:
                    break
                elif offset + len(labels) >= len(uni_string):
                    # reached the end of input_string without an SX label
                    break
            offset += len(labels)

            # create sentence from labels
            sentence = []
            word = ""
            cnt = 1
            # with fopen("log.txt","a") as log:
            # log.write("\n\n")
            for i in range(len(labels)):
                # log.write("["+X[i].encode('utf-8')+"] "+labels[i]+" w=["+word.encode('utf-8')+"]\n")
                if "O" in labels[i]:
                    word = word + X[i]
                if "S" in labels[i]:
                    if X[i] in string.whitespace:  # if whitespace, skip
                        if word != "":
                            # word is already unicode; no decode needed
                            entry = ConllEntry(index=cnt,
                                               word=word,
                                               lemma="_",
                                               upos="_",
                                               xpos="_",
                                               attrs="_",
                                               head="0",
                                               label="_",
                                               deps="_",
                                               space_after="_")
                            # log.write("   New ERROR incomplete entry ["+word.encode('utf-8')+"]\n")
                            sentence.append(entry)
                            cnt += 1
                            word = ""
                        continue
                    word += X[i]
                    space_after = "SpaceAfter=No"
                    if i < len(X) - 1:
                        if X[i + 1] in string.whitespace:
                            space_after = "_"
                    entry = ConllEntry(index=cnt,
                                       word=word,
                                       lemma="_",
                                       upos="_",
                                       xpos="_",
                                       attrs="_",
                                       head="0",
                                       label="_",
                                       deps="_",
                                       space_after=space_after)
                    # log.write("   New entry ["+word.encode('utf-8')+"]\n")
                    sentence.append(entry)
                    cnt += 1
                    word = ""
            sentences.append(sentence)

        return sentences
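
The window-growing loop in Example #5 does not depend on DyNet; a sketch with a hypothetical predict_labels callback standing in for self._predict plus the argmax decoding:

def grow_window(text, offset, step, predict_labels):
    # widen the slice in step-sized increments until the predicted labels
    # contain a sentence end ("SX") or the input is exhausted
    window = 0
    while True:
        window += step
        chunk = text[offset:min(len(text), offset + window)]
        labels = predict_labels(chunk)
        if "SX" in labels or offset + len(labels) >= len(text):
            return chunk, labels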