def preprocess(
    data: List[Sequence],
    tag_lookup_table: Lookuper,
    vocabulary_look_table: Lookuper,
    seq_maxlen: Union[None, int] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Convert offset-annotated documents into padded id matrices.

    Each document is turned into BILUO tag ids (via ``tag_lookup_table``)
    and word ids (via ``vocabulary_look_table``), then both sequences are
    right-padded to a common length.

    Args:
        data: offset-annotated documents; each item must support
            ``offset_to_biluo(item)`` and expose a ``.text`` sequence.
        tag_lookup_table: maps BILUO tag strings to integer ids.
        vocabulary_look_table: maps tokens to integer ids.
        seq_maxlen: fixed padding length; if ``None``, the length of the
            longest sequence in ``data`` is used.

    Returns:
        Tuple of (word-id matrix, tag-id matrix, effective ``seq_maxlen``).
    """
    raw_x = []
    raw_y = []
    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        tag_ids = [tag_lookup_table.lookup(i) for i in tags]
        word_ids = [vocabulary_look_table.lookup(i) for i in words]
        raw_x.append(word_ids)
        raw_y.append(tag_ids)

    # `is None` (not truthiness) so an explicit seq_maxlen=0 is honored;
    # `default=0` keeps empty input from raising ValueError in max().
    if seq_maxlen is None:
        seq_maxlen = max((len(s) for s in raw_x), default=0)

    # NOTE(review): debug trace kept for behavior compatibility; consider
    # switching to logging.
    print(">>> maxlen: {}".format(seq_maxlen))

    # Right-pad both matrices to seq_maxlen. Padding value for y is 0;
    # any integer works here as padded positions are masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, seq_maxlen, padding="post")
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, seq_maxlen, value=0, padding="post")

    return x, y, seq_maxlen
def preprocss(data, maxlen=None, intent_lookup_table=None):
    """Build padded word/tag matrices plus one-hot intent labels.

    Like ``preprocess`` but also extracts a per-document intent label,
    mapping labels to ids with ``intent_lookup_table`` (auto-built from
    the data when not supplied).

    Args:
        data: offset-annotated documents.
        maxlen: fixed padding length; if ``None``, inferred from the data.
        intent_lookup_table: optional ``Lookuper`` mapping intent label ->
            id; when ``None`` one is built from the labels seen in ``data``.

    Returns:
        Tuple of (word-id matrix, one-hot intent matrix, tag-id matrix,
        intent lookup table actually used).
    """
    raw_x = []
    raw_y = []
    raw_intent = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        # Intent label lives either in extra_attr (keyed by the configured
        # field) or directly as an attribute when the field is "label".
        label = offset_data.extra_attr[
            config['intent_field']] if config['intent_field'] not in [
                "label"
            ] else getattr(offset_data, config['intent_field'])
        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    # `is None` so a caller-supplied (possibly empty/falsy) table is kept.
    if intent_lookup_table is None:
        # sorted() makes the auto-built label->id mapping deterministic
        # across runs; a bare set() would assign ids in hash order.
        raw_intent_set = sorted(set(raw_intent))
        intent_lookup_table = Lookuper(
            {v: i for i, v in enumerate(raw_intent_set)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    # `is None` so an explicit maxlen=0 is honored.
    if maxlen is None:
        maxlen = max(len(s) for s in raw_x)

    # Right-pad both matrices. Padding value for y is 0; any integer
    # works here as padded positions are masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, maxlen, value=0, padding='post')

    intent_np_array = np.array(intent_int_list)
    # NOTE(review): one-hot depth is the max id observed in THIS batch,
    # not the table size — a supplied table with unseen classes yields a
    # narrower matrix. Confirm whether Lookuper exposes a size to use.
    intent_one_hot = one_hot(intent_np_array, np.max(intent_np_array) + 1)

    return x, intent_one_hot, y, intent_lookup_table
def preprocss(data, intent_lookup_table=None):
    """Build padded word/tag matrices plus integer intent labels.

    NOTE(review): this redefines ``preprocss`` from earlier in the file —
    at import time this definition silently replaces the previous one.
    Kept as-is since callers are not visible here; consider renaming.

    Unlike the earlier variant, maxlen is always inferred from the data,
    the intent label is read directly from ``offset_data.label``, and the
    intents are returned as a plain integer array (not one-hot).

    Args:
        data: offset-annotated documents exposing ``.text`` and ``.label``.
        intent_lookup_table: optional ``Lookuper`` mapping intent label ->
            id; when ``None`` one is built from the labels seen in ``data``.

    Returns:
        Tuple of (word-id matrix, intent-id array, tag-id matrix,
        intent lookup table actually used).
    """
    raw_x = []
    raw_y = []
    raw_intent = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        label = offset_data.label
        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    # `is None` so a caller-supplied (possibly falsy) table is kept.
    if intent_lookup_table is None:
        # sorted() makes the auto-built label->id mapping deterministic
        # across runs; a bare set() would assign ids in hash order.
        raw_intent_set = sorted(set(raw_intent))
        intent_lookup_table = Lookuper(
            {v: i for i, v in enumerate(raw_intent_set)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    maxlen = max(len(s) for s in raw_x)

    # Right-pad both matrices. Padding value for y is 0; any integer
    # works here as padded positions are masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, maxlen, value=0, padding='post')

    return x, numpy.array(intent_int_list), y, intent_lookup_table
def str_to_id(string: Union[str, List[str]],
              vocabulary_look_table: Lookuper) -> List[int]:
    """Map each token (or character, for a plain string) to its vocab id.

    Args:
        string: a string (iterated per character) or a list of tokens.
        vocabulary_look_table: maps each element to an integer id.

    Returns:
        List of integer ids, one per element of ``string``.
    """
    return list(map(vocabulary_look_table.lookup, string))