Example #1
def test_resolve_conflicts():
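    # [0, 1] appears twice, with outputs 0.0 and 0.7; resolving the conflict
    # should leave a single copy whose output is 0.7.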
    inputs = [[0, 1], [1, 1], [0, 1]]
    outputs = [[0.0], [0.5], [0.7]]
    inputs, outputs = resolve_conflicts(inputs, outputs)
    assert len(inputs) == 2
    assert len(outputs) == 2
    assert outputs[inputs.index([0, 1])] == [0.7]
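
For reference, a minimal sketch of a resolve_conflicts consistent with this test. The real implementation is not shown here, and the max-wins merge rule is an assumption; the asserts above only require that duplicate input vectors collapse and that [0, 1] end up with [0.7].

def resolve_conflicts(inputs, outputs):
    """Collapse duplicate input vectors into one row each (sketch only)."""
    best = {}
    for vec, out in zip(inputs, outputs):
        key = tuple(vec)
        # Assumed rule: keep the maximum output seen for identical inputs
        if key not in best or out[0] > best[key][0]:
            best[key] = list(out)
    # dicts preserve insertion order (Python 3.7+), so row order is stable
    return [list(k) for k in best], list(best.values())
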
Example #2
    def train(self, train_data):
        for sent in train_data.my_sents(self.name):
            self.ids.add_sent(sent)

        inputs = []
        outputs = []

        def add(vec, out):
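            # Record one (vectorized sentence, target confidence) training pair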
            inputs.append(self.vectorize(vec))
            outputs.append([out])

        def pollute(sent, p):
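            # Pad a copy of the sentence with :null: tokens at position p
            # (about a third of its length) and train it at the LENIENCE target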
            sent = sent[:]
            for _ in range((len(sent) + 2) // 3):
                sent.insert(p, ':null:')
            add(sent, self.LENIENCE)

        def weight(sent):
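            # Train each word alone, weighted by len(word) ** 3 relative to
            # the sentence total; '{...}' entity placeholders get zero weight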
            def calc_weight(w):
                return pow(len(w), 3.0)

            total_weight = 0.0
            for word in sent:
                total_weight += calc_weight(word)
            for word in sent:
                weight = 0 if word.startswith('{') else calc_weight(word)
                add([word], weight / total_weight)

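        # Positive samples: each of this intent's sentences at full
        # confidence, plus the per-word weighting samples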
        for sent in train_data.my_sents(self.name):
            add(sent, 1.0)
            weight(sent)

            # Generate samples with extra unknown tokens unless
            # the sentence is supposed to allow unknown tokens via the special :0
            if not any(word[0] == ':' and word != ':' for word in sent):
                pollute(sent, 0)
                pollute(sent, len(sent))

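        # Negative samples: other intents' sentences, a lone :null: token,
        # and the empty sentence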
        for sent in train_data.other_sents(self.name):
            add(sent, 0.0)
        add([':null:'], 0.0)
        add([], 0.0)

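        # Also negative: this intent's sentences with entity placeholders
        # blanked to :null:, so a bare template without entities won't match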
        for sent in train_data.my_sents(self.name):
            without_entities = sent[:]
            for i, token in enumerate(without_entities):
                if token.startswith('{'):
                    without_entities[i] = ':null:'
            if without_entities != sent:
                add(without_entities, 0.0)

        inputs, outputs = resolve_conflicts(inputs, outputs)

        train_data = fann.training_data()
        train_data.set_train_data(inputs, outputs)

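        # Retrain from scratch up to 10 times, 1000 epochs per attempt, until
        # the net classifies every training sample correctly (zero bit fails)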
        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(train_data, 1000, 0, 0)
            self.net.test_data(train_data)
            if self.net.get_bit_fail() == 0:
                break
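
To make the padding arithmetic concrete, here is a standalone rendition of pollute's token insertion with the self.vectorize and LENIENCE plumbing stripped out (the helper name pad_with_null is invented for illustration):

def pad_with_null(sent, p):
    # Same insertion rule as pollute() above: (len + 2) // 3 padding tokens
    sent = sent[:]
    for _ in range((len(sent) + 2) // 3):
        sent.insert(p, ':null:')
    return sent

# A 4-token sentence gains (4 + 2) // 3 == 2 padding tokens at the front:
assert pad_with_null(['what', 'time', 'is', 'it'], 0) == \
    [':null:', ':null:', 'what', 'time', 'is', 'it']
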
Example #3
    def train(self, train_data):
        for sent in train_data.my_sents(self.name):
            self.ids.add_sent(sent)

        inputs = []
        outputs = []

        def add(vec, out):
            inputs.append(self.vectorize(vec))
            outputs.append([out])

        def pollute(sent, p):
            sent = sent[:]
            for _ in range((len(sent) + 2) // 3):
                sent.insert(p, ':null:')
            add(sent, self.LENIENCE)

        def weight(sent):
            def calc_weight(w):
                return pow(len(w), 3.0)

            total_weight = 0.0
            for word in sent:
                total_weight += calc_weight(word)
            for word in sent:
                weight = 0 if word.startswith('{') else calc_weight(word)
                add([word], weight / total_weight)

        for sent in train_data.my_sents(self.name):
            add(sent, 1.0)
            weight(sent)
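            # Add :null: padding only when the sentence has no ':' special
            # tokens of its own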
            if not any(word[0] == ':' for word in sent):
                pollute(sent, 0)
                pollute(sent, len(sent))

        for sent in train_data.other_sents(self.name):
            add(sent, 0.0)
        add([], 0.0)

        inputs, outputs = resolve_conflicts(inputs, outputs)

        train_data = fann.training_data()
        train_data.set_train_data(inputs, outputs)

        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(train_data, 1000, 0, 0)
            self.net.test_data(train_data)
            if self.net.get_bit_fail() == 0:
                break
Example #4
    def train(self, train_data):
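        # Collect the vocabulary of tokens that appear between the anchor
        # token and the end of the sentence in this extractor's direction
        # (self.dir), skipping '{...}' entity placeholders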
        for sent in train_data.my_sents(self.intent_name):
            if self.token in sent:
                for i in range(
                        sent.index(self.token) + self.dir, self.get_end(sent),
                        self.dir):
                    if sent[i][0] != '{':
                        self.ids.add_token(sent[i])

        inputs, outputs = [], []

        def pollute(sent, i, out_val):
            """Simulates multiple token words in adjacent entities"""
            for j, check_token in enumerate(sent):
                d = j - i
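                # int(d > 0) - int(d < 0) is sign(d): consider only entity
                # placeholders on this extractor's side of position i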
                if int(d > 0) - int(
                        d < 0) == self.dir and check_token.startswith('{'):
                    for pol_len in range(1, 4):
                        s = sent[:j] + [':0'] * pol_len + sent[j + 1:]
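                        # Replacing the 1-token placeholder with pol_len ':0'
                        # tokens shifts later indices; when scanning leftward
                        # (dir < 0) the target moves from i to i + pol_len - 1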
                        p = i + (pol_len - 1) * int(self.dir < 0)
                        inputs.append(self.vectorize(s, p))
                        outputs.append([out_val])

        def add_sents(sents, out_fn):
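            # Label every token position in every sentence; positions whose
            # target is 1.0 (the anchor token) also get polluted variants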
            for sent in sents:
                for i, token in enumerate(sent):
                    out_val = out_fn(token)
                    inputs.append(self.vectorize(sent, i))
                    outputs.append([out_val])
                    if out_val == 1.0:
                        pollute(sent, i, 1.0)

        add_sents(train_data.my_sents(self.intent_name),
                  lambda x: float(x == self.token))
        add_sents(train_data.other_sents(self.intent_name), lambda x: 0.0)
        inputs, outputs = resolve_conflicts(inputs, outputs)

        data = fann.training_data()
        data.set_train_data(inputs, outputs)

        for _ in range(10):
            self.configure_net()
            self.net.train_on_data(data, 1000, 0, 0)
            self.net.test_data(data)
            if self.net.get_bit_fail() == 0:
                break
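
To sanity-check that index arithmetic, a hedged standalone example (the sentence, anchor position, and pol_len are invented for illustration): replacing a one-token placeholder at j < i with pol_len ':0' tokens moves the token at i to i + pol_len - 1.

sent = ['{place}', 'to', 'tokyo']   # placeholder at j = 0, anchor 'to' at i = 1
i, j, pol_len = 1, 0, 3
s = sent[:j] + [':0'] * pol_len + sent[j + 1:]
p = i + (pol_len - 1)               # the dir < 0 branch of pollute()
assert s == [':0', ':0', ':0', 'to', 'tokyo']
assert s[p] == sent[i]              # 'to' is still the token being scored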