def test_resolve_conflicts():
    inputs = [[0, 1], [1, 1], [0, 1]]
    outputs = [[0.0], [0.5], [0.7]]
    inputs, outputs = resolve_conflicts(inputs, outputs)
    assert len(inputs) == 2
    assert len(outputs) == 2
    assert outputs[inputs.index([0, 1])] == [0.7]
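# For reference, a minimal resolve_conflicts() consistent with the test above:
# duplicate input vectors are collapsed into one sample whose output is the
# highest of the conflicting outputs. This is only a sketch inferred from the
# assertions, not necessarily the implementation under test.
def resolve_conflicts(inputs, outputs):
    best = {}    # input vector (as tuple) -> highest output seen so far
    order = []   # first-seen order of unique input vectors
    for vec, out in zip(inputs, outputs):
        key = tuple(vec)
        if key not in best:
            order.append(key)
            best[key] = out
        elif out > best[key]:
            best[key] = out
    return [list(key) for key in order], [best[key] for key in order]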
def train(self, train_data):
    # Register every token from this intent's training sentences with the id manager
    for sent in train_data.my_sents(self.name):
        self.ids.add_sent(sent)
    inputs = []
    outputs = []

    def add(vec, out):
        # Append one (vectorized input, target output) training pair
        inputs.append(self.vectorize(vec))
        outputs.append([out])

    def pollute(sent, p):
        # Add a copy of the sentence padded with ':null:' tokens at position p,
        # trained toward the reduced LENIENCE target rather than 1.0
        sent = sent[:]
        for _ in range(int((len(sent) + 2) / 3)):
            sent.insert(p, ':null:')
        add(sent, self.LENIENCE)

    def weight(sent):
        # Add each word on its own, weighted by word length (longer words count
        # for more); entity placeholders contribute nothing
        def calc_weight(w):
            return pow(len(w), 3.0)

        total_weight = 0.0
        for word in sent:
            total_weight += calc_weight(word)
        for word in sent:
            weight = 0 if word.startswith('{') else calc_weight(word)
            add([word], weight / total_weight)

    for sent in train_data.my_sents(self.name):
        add(sent, 1.0)
        weight(sent)

        # Generate samples with extra unknown tokens unless the sentence
        # is supposed to allow unknown tokens via the special :0
        if not any(word[0] == ':' and word != ':' for word in sent):
            pollute(sent, 0)
            pollute(sent, len(sent))

    # Sentences belonging to other intents, plus degenerate inputs, are negatives
    for sent in train_data.other_sents(self.name):
        add(sent, 0.0)
    add([':null:'], 0.0)
    add([], 0.0)

    # A matching sentence with its entity slots masked out should not match on its own
    for sent in train_data.my_sents(self.name):
        without_entities = sent[:]
        for i, token in enumerate(without_entities):
            if token.startswith('{'):
                without_entities[i] = ':null:'
        if without_entities != sent:
            add(without_entities, 0.0)

    inputs, outputs = resolve_conflicts(inputs, outputs)

    train_data = fann.training_data()
    train_data.set_train_data(inputs, outputs)

    # Retrain from scratch up to 10 times until no training samples fail
    for _ in range(10):
        self.configure_net()
        self.net.train_on_data(train_data, 1000, 0, 0)
        self.net.test_data(train_data)

        if self.net.get_bit_fail() == 0:
            break
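# Standalone illustration of the entity-masking pass above, using a made-up
# sentence: placeholders such as '{location}' are replaced with ':null:' so that
# the bare carrier phrase on its own is added as a 0.0 (negative) sample.
sample = ['what', 'is', 'the', 'weather', 'in', '{location}']
masked = [':null:' if token.startswith('{') else token for token in sample]
print(masked)  # ['what', 'is', 'the', 'weather', 'in', ':null:']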
def train(self, train_data):
    for sent in train_data.my_sents(self.name):
        self.ids.add_sent(sent)
    inputs = []
    outputs = []

    def add(vec, out):
        inputs.append(self.vectorize(vec))
        outputs.append([out])

    def pollute(sent, p):
        sent = sent[:]
        for _ in range(int((len(sent) + 2) / 3)):
            sent.insert(p, ':null:')
        add(sent, self.LENIENCE)

    def weight(sent):
        def calc_weight(w):
            return pow(len(w), 3.0)

        total_weight = 0.0
        for word in sent:
            total_weight += calc_weight(word)
        for word in sent:
            weight = 0 if word.startswith('{') else calc_weight(word)
            add([word], weight / total_weight)

    for sent in train_data.my_sents(self.name):
        add(sent, 1.0)
        weight(sent)
        if not any(word[0] == ':' for word in sent):
            pollute(sent, 0)
            pollute(sent, len(sent))

    for sent in train_data.other_sents(self.name):
        add(sent, 0.0)
    add([], 0.0)

    inputs, outputs = resolve_conflicts(inputs, outputs)

    train_data = fann.training_data()
    train_data.set_train_data(inputs, outputs)

    for _ in range(10):
        self.configure_net()
        self.net.train_on_data(train_data, 1000, 0, 0)
        self.net.test_data(train_data)

        if self.net.get_bit_fail() == 0:
            break
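# Standalone illustration of pollute() above, with a made-up sentence: roughly
# one ':null:' token per three words is inserted at the chosen position, and the
# padded copy is trained toward the reduced self.LENIENCE target instead of 1.0.
sample = ['turn', 'on', 'the', 'lights']
padded = sample[:]
for _ in range(int((len(padded) + 2) / 3)):  # two insertions for a four-word sentence
    padded.insert(0, ':null:')
print(padded)  # [':null:', ':null:', 'turn', 'on', 'the', 'lights']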
def train(self, train_data):
    for sent in train_data.my_sents(self.intent_name):
        if self.token in sent:
            for i in range(sent.index(self.token) + self.dir,
                           self.get_end(sent), self.dir):
                if sent[i][0] != '{':
                    self.ids.add_token(sent[i])

    inputs, outputs = [], []

    def pollute(sent, i, out_val):
        """Simulates multiple token words in adjacent entities"""
        for j, check_token in enumerate(sent):
            d = j - i
            if int(d > 0) - int(d < 0) == self.dir and check_token.startswith('{'):
                for pol_len in range(1, 4):
                    s = sent[:j] + [':0'] * pol_len + sent[j + 1:]
                    p = i + (pol_len - 1) * int(self.dir < 0)
                    inputs.append(self.vectorize(s, p))
                    outputs.append([out_val])

    def add_sents(sents, out_fn):
        for sent in sents:
            for i, token in enumerate(sent):
                out_val = out_fn(token)
                inputs.append(self.vectorize(sent, i))
                outputs.append([out_val])
                if out_val == 1.0:
                    pollute(sent, i, 1.0)

    add_sents(train_data.my_sents(self.intent_name), lambda x: float(x == self.token))
    add_sents(train_data.other_sents(self.intent_name), lambda x: 0.0)
    inputs, outputs = resolve_conflicts(inputs, outputs)

    data = fann.training_data()
    data.set_train_data(inputs, outputs)

    for _ in range(10):
        self.configure_net()
        self.net.train_on_data(data, 1000, 0, 0)
        self.net.test_data(data)

        if self.net.get_bit_fail() == 0:
            break
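# get_end() is used above but not shown here. One plausible definition, assuming
# it is simply the exclusive bound for walking outward from the anchor token in
# direction self.dir (an inference, not confirmed by the original code):
def get_end(self, sent):
    return len(sent) if self.dir > 0 else -1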