def create_lookup_table(self, min_freq=1, tokens_file=None):
    """Build (or reload) the input and output token vocabularies.

    When ``tokens_file`` is given and already exists on disk, the two
    vocabularies are read back from it: everything before the ``'<@@@>'``
    marker is the input vocabulary, everything after it the output one.

    Otherwise tokens are counted over the ``'source'`` and ``'target'``
    sequences of the first entry of every item in ``self.command_data``,
    tokens seen fewer than ``min_freq`` times are dropped, and -- if
    ``tokens_file`` was given -- the result is written out in the same
    marker-separated layout.

    Args:
        min_freq: Minimum occurrence count a token needs to be kept.
        tokens_file: Optional path used as an on-disk cache of the token
            lists.

    Returns:
        A ``(input_tokens, output_tokens)`` pair of ``TokenList`` objects.
    """
    separator = '<@@@>'
    use_cache = tokens_file is not None

    # Fast path: a previously stored token file takes precedence over
    # recounting the data.
    if use_cache and os.path.exists(tokens_file):
        print('loading', tokens_file)
        cached = load_list(tokens_file)
        split_at = cached.index(separator)
        return TokenList(cached[:split_at]), TokenList(cached[split_at + 1:])

    # Count token occurrences separately for the source and target side.
    source_counts = {}
    target_counts = {}
    for entries in self.command_data.values():
        # Both sequences are looked up before either one is tokenised.
        sides = (
            (entries[0]['source'], source_counts),
            (entries[0]['target'], target_counts),
        )
        for sequence, counts in sides:
            for token in parenthesis_split(sequence, self.delimiter,
                                           self.lparen, self.rparen):
                counts[token] = counts.get(token, 0) + 1

    # freq_dict_2_list is expected to yield (token, count) pairs; its
    # ordering is preserved in the resulting vocabulary.
    vocabularies = []
    for counts in (source_counts, target_counts):
        ranked = freq_dict_2_list(counts)
        vocabularies.append(
            [token for token, freq in ranked if freq >= min_freq])
    source_vocab, target_vocab = vocabularies

    i_tokens = TokenList(source_vocab)
    o_tokens = TokenList(target_vocab)
    if use_cache:
        store_list(source_vocab + [separator] + target_vocab, tokens_file)
    return i_tokens, o_tokens
def get_source_sequence(self, command_index=None, command=None):
    """Return the tokenised source sequence for a command.

    The sequence is taken from ``self.command_data`` when
    ``command_index`` is given; otherwise the raw ``command`` string is
    used. ``command_index`` wins if both are supplied.

    Args:
        command_index: Position in ``self.command_names`` of the command
            whose stored source sequence should be used.
        command: A raw sequence to tokenise directly.

    Returns:
        Whatever ``parenthesis_split`` produces for the chosen sequence.

    Raises:
        ValueError: If neither ``command_index`` nor ``command`` is given.
    """
    # NOTE(review): the message below mentions "command_filename" although
    # the parameter is called ``command`` -- left untouched here.
    if command_index is None and command is None:
        raise_from(ValueError('provide a command_index or a command_filename. command_index takes precedence'), None)

    if command_index is not None:
        name = self.command_names[command_index]
        # NOTE(review): this reads entry [1], whereas load_source,
        # load_target and source_length read entry [0] -- confirm that
        # the difference is intentional.
        raw_sequence = self.command_data[name][1]['source']
    else:
        raw_sequence = command

    return parenthesis_split(raw_sequence, self.delimiter,
                             self.lparen, self.rparen)
def load_target(self, command_index):
    """Return the padded target sequence of one command.

    The ``'target'`` sequence of the command's first data entry is
    tokenised with ``parenthesis_split``, wrapped as a one-element batch,
    passed through ``pad_to_fixed`` with the output vocabulary and
    ``self.sequence_max_length``, and finally squeezed.

    Args:
        command_index: Position of the command in ``self.command_names``.

    Returns:
        The squeezed result of ``pad_to_fixed`` for this single sequence.
    """
    name = self.command_names[command_index]
    raw_target = self.command_data[name][0]['target']
    tokens = list(parenthesis_split(raw_target, self.delimiter,
                                    self.lparen, self.rparen))
    # pad_to_fixed receives a batch, so the single sequence is wrapped in
    # a list and the extra dimension removed again afterwards.
    padded = pad_to_fixed([tokens], self.o_tokens, self.sequence_max_length)
    return padded.squeeze()
def load_source(self, command_index):
    """Return the padded source sequence of one command.

    The ``'source'`` sequence of the command's first data entry is
    tokenised with ``parenthesis_split``, wrapped as a one-element batch,
    passed through ``pad_to_fixed`` with the input vocabulary and
    ``self.sequence_max_length``, and finally squeezed.

    Args:
        command_index: Position of the command in ``self.command_names``.

    Returns:
        The squeezed result of ``pad_to_fixed`` for this single sequence.
    """
    name = self.command_names[command_index]
    raw_source = self.command_data[name][0]['source']
    tokens = list(parenthesis_split(raw_source, self.delimiter,
                                    self.lparen, self.rparen))
    # pad_to_fixed receives a batch, so the single sequence is wrapped in
    # a list and the extra dimension removed again afterwards.
    padded = pad_to_fixed([tokens], self.i_tokens, self.sequence_max_length)
    return padded.squeeze()
def source_length(self, source_index):
    """Return the tokenised source sequence of one command.

    NOTE(review): despite the method name, the value returned is the
    result of ``parenthesis_split`` itself, not its length. Callers may
    rely on this, so the behaviour is kept as-is -- confirm whether a
    ``len(...)`` was intended.

    Args:
        source_index: Position of the command in ``self.command_names``.

    Returns:
        Whatever ``parenthesis_split`` produces for the ``'source'``
        sequence of the command's first data entry.
    """
    name = self.command_names[source_index]
    raw_source = self.command_data[name][0]['source']
    return parenthesis_split(raw_source, self.delimiter,
                             self.lparen, self.rparen)