# Assumed module-level imports (not shown in this excerpt):
#   from random import randint
#   import load, parsing, utils

def train(self, data):
    """Memorize every serialized target struct seen for each source struct."""
    self.model = {}
    for entry in data:
        # Split the source struct into sentences and strip entity lexicalizations
        triples = utils.delexicalize_struct(utils.split_struct(entry['source']))

        # Serialize each sentence as '<SNT> pred1 pred2 ... </SNT>' (t[1] is the predicate)
        source = []
        for snt in triples:
            sentence = ' '.join(['<SNT>'] + [t[1] for t in snt] + ['</SNT>'])
            source.append(sentence)
        source = tuple(source)

        # Collect every target realization observed for this source key
        if source not in self.model:
            self.model[source] = []
        for target in entry['targets']:
            output = ' '.join(target['output'])
            self.model[source].append(output)
    return self.model
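# A minimal usage sketch, hedged: TemplateModel is a hypothetical name for the
# class these methods belong to, and the key/value pair shown is illustrative
# of the shapes involved, not taken from a real corpus. self.model maps a
# tuple of '<SNT> pred ... </SNT>' strings (one per sentence) to every
# serialized target struct observed for that source in training:
#
#     >>> model = TemplateModel()                  # hypothetical class name
#     >>> data, vocab = model.load('data/train')   # illustrative path
#     >>> templates = model.train(data)
#     >>> templates[('<SNT> birthPlace </SNT>',)]  # illustrative key
#     ['<SNT> birthPlace </SNT>']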
def predict(self, source):
    """Realize a source struct by greedily matching the longest known template."""
    sentences = utils.split_struct(source)
    triples = utils.delexicalize_struct(sentences)

    # Serialize each delexicalized sentence as '<SNT> pred1 ... </SNT>'
    struct = []
    for snt in triples:
        sentence = ' '.join(['<SNT>'] + [t[1] for t in snt] + ['</SNT>'])
        struct.append(sentence)

    target = []
    # Try to extract a full template: start with the whole struct and back off
    # to shorter windows until one is found in the training model
    start, end = 0, len(struct)
    while start < len(struct):
        snts = tuple(struct[start:end])
        entities, _ = self.track_entity(sentences[start:end])
        if snts in self.model:
            # Pick a random template for this window and fill in its entities
            pos = randint(0, len(self.model[snts]) - 1)
            template = self.model[snts][pos].split()
            for i, w in enumerate(template):
                if w in entities:
                    template[i] = entities[w]
            target.extend(template)
            # Restart the search on the remainder after the matched window
            start = end
            end = len(struct)
        else:
            end -= 1
            # Skip a sentence if none of its windows appear in the training set
            if start == end:
                start += 1
                end = len(struct)

    # Replace any remaining entity tags using the full struct's entity map
    _, entitytag = self.track_entity(sentences)
    for i, w in enumerate(target):
        if w in entitytag:
            target[i] = entitytag[w]
    return target
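# A self-contained sketch of predict()'s back-off search, with plain strings
# standing in for sentence structs and a dict standing in for self.model (toy
# data; the helper below is not part of the original code). The search tries
# the longest window struct[start:end] first, shrinks `end` until the window
# is a known key, and skips one sentence when even a single-sentence window
# is unseen.
def _greedy_backoff_sketch(struct, model):
    target, start, end = [], 0, len(struct)
    while start < len(struct):
        key = tuple(struct[start:end])
        if key in model:
            target.append(model[key])    # take the stored template for this window
            start, end = end, len(struct)
        else:
            end -= 1
            if start == end:             # no window matched: skip this sentence
                start += 1
                end = len(struct)
    return target

# _greedy_backoff_sketch(['s1', 's2', 'unseen', 's3'],
#                        {('s1', 's2'): 'T12', ('s3',): 'T3'})
# -> ['T12', 'T3']  ('unseen' is skipped)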
def load(self, path):
    """Parse the corpus at `path` into (source, targets) pairs and vocabularies."""
    # Drop sentence boundary tags from a struct
    flat = lambda struct: [w for w in struct if w not in ['<SNT>', '</SNT>']]

    entryset = parsing.run_parser(path)

    data, size = [], 0
    invocab, outvocab = [], []
    for entry in entryset:
        # Invert the entity map: tag -> entity
        entitymap = {b: a for a, b in entry.entitymap_to_dict().items()}
        if len(entry.modifiedtripleset) > 1:
            visited = []
            for lex in entry.lexEntries:
                # Process the ordered tripleset of this lexicalization
                source, delex_source, _ = load.snt_source(
                    lex.orderedtripleset, entitymap, {})
                source, delex_source = flat(source), flat(delex_source)

                if source not in visited and ' '.join(source).strip() != '':
                    visited.append(source)
                    invocab.extend(source)

                    # Pair this source with every lexicalization that shares
                    # the same delexicalized triple sequence
                    targets = []
                    for lex2 in entry.lexEntries:
                        _, text, _ = load.snt_source(
                            lex2.orderedtripleset, entitymap, {})
                        flatten = flat(text)
                        if delex_source == flatten:
                            # Serialize the target as predicate structs with
                            # sentence boundary tags
                            trgt_preds = []
                            for snt in utils.split_struct(text):
                                trgt_preds.append('<SNT>')
                                trgt_preds.extend([t[1] for t in snt])
                                trgt_preds.append('</SNT>')

                            target = {
                                'lid': lex2.lid,
                                'comment': lex2.comment,
                                'output': trgt_preds
                            }
                            targets.append(target)
                            outvocab.extend(trgt_preds)

                    data.append({
                        'eid': entry.eid,
                        'category': entry.category,
                        'size': entry.size,
                        'source': source,
                        'targets': targets
                    })
                    size += len(targets)

    # Close the vocabularies with an unknown-word token and deduplicate
    invocab.append('unk')
    outvocab.append('unk')
    invocab = list(set(invocab))
    outvocab = list(set(outvocab))
    vocab = {'input': invocab, 'output': outvocab}

    print('Path:', path, 'Size:', size)
    return data, vocab
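# End-to-end usage sketch, hedged: TemplateModel and the corpus paths below
# are hypothetical, and passing a loaded entry's 'source' to predict() is
# assumed from how train() consumes the same field.
#
#     >>> model = TemplateModel()                        # hypothetical class name
#     >>> train_data, vocab = model.load('data/train')   # illustrative path
#     >>> model.train(train_data)
#     >>> dev_data, _ = model.load('data/dev')           # illustrative path
#     >>> tokens = model.predict(dev_data[0]['source'])
#     >>> print(' '.join(tokens))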