示例#1
0
def preprocess(
    data: List[Sequence],
    tag_lookup_table: Lookuper,
    vocabulary_look_table: Lookuper,
    seq_maxlen: Union[None, int] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Turn offset-annotated sequences into right-padded id matrices.

    Args:
        data: Items that ``offset_to_biluo`` accepts and that expose a
            ``.text`` attribute which is iterated element by element.
        tag_lookup_table: Maps every tag returned by ``offset_to_biluo``
            to an integer id through ``.lookup``.
        vocabulary_look_table: Maps every element of ``.text`` to an
            integer id through ``.lookup``.
        seq_maxlen: Row length of the returned matrices. When falsy
            (``None`` or ``0``) the length of the longest sequence in
            ``data`` is used instead.

    Returns:
        ``(x, y, seq_maxlen)``: the padded token ids, the padded tag ids
        (filled with 0), and the row length that was actually applied.
    """
    raw_x = []
    raw_y = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text

        tag_ids = [tag_lookup_table.lookup(i) for i in tags]
        word_ids = [vocabulary_look_table.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)

    if not seq_maxlen:
        # default=0 lets an empty ``data`` fall through with length 0
        # instead of raising ValueError inside max().
        seq_maxlen = max((len(s) for s in raw_x), default=0)

    print(">>> maxlen: {}".format(seq_maxlen))

    # Token ids: pad on the right up to seq_maxlen.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, seq_maxlen, padding="post")

    # Tag ids: pad on the right with 0 so that y lines up with x.
    y = tf.keras.preprocessing.sequence.pad_sequences(raw_y,
                                                      seq_maxlen,
                                                      value=0,
                                                      padding="post")

    return x, y, seq_maxlen
示例#2
0
def preprocss(data, maxlen=None, intent_lookup_table=None):
    """Build padded token/tag id matrices and one-hot intent labels.

    Relies on the module-level ``config``, ``tag_lookuper`` and
    ``vocabulary_lookuper`` objects.

    Args:
        data: Non-empty collection of items accepted by
            ``offset_to_biluo`` that expose ``.text``. The intent label is
            read from the attribute named by ``config['intent_field']``
            when that name is ``"label"``, otherwise from
            ``.extra_attr[config['intent_field']]``.
        maxlen: Row length of the padded matrices. When falsy, the length
            of the longest sequence in ``data`` is used.
        intent_lookup_table: Existing label -> id table. When falsy, a new
            ``Lookuper`` is built from the labels found in ``data``.

    Returns:
        ``(x, intent_one_hot, y, intent_lookup_table)``: padded token ids,
        one-hot encoded intent ids, padded tag ids (filled with 0) and the
        lookup table that was used.
    """
    intent_field = config['intent_field']

    raw_x = []
    raw_y = []
    raw_intent = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        if intent_field == "label":
            label = getattr(offset_data, intent_field)
        else:
            label = offset_data.extra_attr[intent_field]

        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    if not intent_lookup_table:
        # dict.fromkeys de-duplicates while keeping first-seen order, so
        # the label -> id assignment is the same on every run. Enumerating
        # a set of strings is not stable across interpreter runs.
        unique_intents = dict.fromkeys(raw_intent)
        intent_lookup_table = Lookuper(
            {v: i
             for i, v in enumerate(unique_intents)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    if not maxlen:
        maxlen = max(len(s) for s in raw_x)

    # Token ids: pad on the right up to maxlen.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')

    # Tag ids: pad on the right with 0 so that y lines up with x.
    y = tf.keras.preprocessing.sequence.pad_sequences(raw_y,
                                                      maxlen,
                                                      value=0,
                                                      padding='post')

    intent_np_array = np.array(intent_int_list)
    # NOTE(review): the one-hot width is derived from the largest id that
    # occurs in this call's data. With a caller-supplied table whose
    # highest id is absent from ``data`` the width shrinks - confirm that
    # callers expect this.
    intent_one_hot = one_hot(intent_np_array, np.max(intent_np_array) + 1)

    return x, intent_one_hot, y, intent_lookup_table
def preprocss(data, intent_lookup_table=None):
    """Build padded token/tag id matrices and integer intent labels.

    Relies on the module-level ``tag_lookuper`` and ``vocabulary_lookuper``
    objects.

    Args:
        data: Non-empty collection of items accepted by
            ``offset_to_biluo`` that expose ``.text`` and ``.label``.
        intent_lookup_table: Existing label -> id table. When falsy, a new
            ``Lookuper`` is built from the labels found in ``data``.

    Returns:
        ``(x, intent_ids, y, intent_lookup_table)``: padded token ids, a
        numpy array of intent ids, padded tag ids (filled with 0) and the
        lookup table that was used.
    """
    raw_x = []
    raw_y = []
    raw_intent = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text
        label = offset_data.label

        tag_ids = [tag_lookuper.lookup(i) for i in tags]
        word_ids = [vocabulary_lookuper.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)
        raw_intent.append(label)

    if not intent_lookup_table:
        # dict.fromkeys de-duplicates while keeping first-seen order, so
        # the label -> id assignment is the same on every run. Enumerating
        # a set of strings is not stable across interpreter runs.
        unique_intents = dict.fromkeys(raw_intent)
        intent_lookup_table = Lookuper(
            {v: i
             for i, v in enumerate(unique_intents)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    # Every row is padded to the length of the longest sequence.
    maxlen = max(len(s) for s in raw_x)

    # Token ids: pad on the right up to maxlen.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')

    # Tag ids: pad on the right with 0 so that y lines up with x.
    y = tf.keras.preprocessing.sequence.pad_sequences(raw_y,
                                                      maxlen,
                                                      value=0,
                                                      padding='post')

    return x, numpy.array(intent_int_list), y, intent_lookup_table
示例#4
0
def str_to_id(string: Union[str, List[str]],
              vocabulary_look_table: Lookuper) -> List[int]:
    """Look up the id of every element of ``string``.

    ``string`` is iterated directly, so a ``str`` is processed character by
    character and a list item by item. Each element is passed to
    ``vocabulary_look_table.lookup`` and the results are returned in order.
    """
    ids = []
    for token in string:
        ids.append(vocabulary_look_table.lookup(token))
    return ids