Example #1

from data_utils.clean_text import clean_text
# CLASSES, TAGS, B_TOKEN, I_TOKEN and CHARACTER_SEPARATOR are assumed to
# live in data_utils.constants alongside ALL_TEXTS.
from data_utils.constants import (ALL_TEXTS, B_TOKEN, CHARACTER_SEPARATOR,
                                  CLASSES, I_TOKEN, TAGS)
from keras.preprocessing.sequence import pad_sequences


def transform_data_inference(text, word_tokenizer, char_tokenizer):
    """Transform one raw text into model inputs (inference-time variant)."""
    # clean_text is expected to normalize the text, replacing each number
    # with a '0' placeholder and returning the numbers separately.
    text, numbers = clean_text(text)
    words = text.split()
    n = iter(numbers)
    # Restore the original surface forms: substitute the numbers back in and
    # map the '|' marker to a newline.
    origin_words = [
        next(n).replace(' ', '') if x == '0' else ('\n' if x == '|' else x)
        for x in words
    ]
    seq_len = len(words)
    chars = char_tokenizer.texts_to_sequences(words)
    words = word_tokenizer.texts_to_sequences([text])
    char_lens = [len(x) for x in chars]
    chars = pad_sequences(chars, max(char_lens))
    return words[0], seq_len, chars, char_lens, origin_words
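

# A minimal, hypothetical sketch (not part of the original pipeline) of how
# the two tokenizers used in this module might be fitted. filters='' matters
# here: the '`' sentinel that transform_data relies on would otherwise be
# stripped by the Keras Tokenizer's default filters. The oov_token choice
# and the one-document-per-line corpus layout are assumptions.
def build_tokenizers(corpus_path):
    from keras.preprocessing.text import Tokenizer
    word_tokenizer = Tokenizer(filters='', oov_token='<unk>')
    char_tokenizer = Tokenizer(filters='', char_level=True, oov_token='<unk>')
    # e.g. the ALL_TEXTS file written by the __main__ block below.
    with open(corpus_path) as corpus:
        lines = corpus.read().splitlines()
    word_tokenizer.fit_on_texts(lines)
    char_tokenizer.fit_on_texts(lines)
    return word_tokenizer, char_tokenizer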


def transform_data(data, word_tokenizer, char_tokenizer):
    """Transform one annotated document into training sequences.

    Args:
      data (dict): an object with a 'content' key (the raw text) and a
        'tags' key mapping each start index to an object with three keys:
        'type', 'end' and 'prev'.

    Returns:
      tuple of lists: the word ids, per-word character ids, character
      lengths and corresponding labels, all encoded as strings.
    """
    # '`' serves as a sentinel token marking the sequence boundaries; it is
    # labelled 'normal'.
    words = [str(word_tokenizer.texts_to_sequences(["`"])[0][0])]
    labels = [CLASSES[TAGS["normal"]]]
    characters = [str(char_tokenizer.texts_to_sequences(["`"])[0][0])]
    char_length = ['1']
    text = data['content']
    tags = data['tags']
    for start in sorted(int(x) for x in tags.keys()):
        tag = tags[str(start)]
        end = tag['end']
        _type = tag['type']
        tokens, _ = clean_text(text[start:end])
        words.extend(
            str(x) for x in word_tokenizer.texts_to_sequences([tokens])[0])
        tokens = tokens.split()
        characters.extend(
            CHARACTER_SEPARATOR.join(
                str(x)
                for x in char_tokenizer.texts_to_sequences([x.strip()])[0])
            for x in tokens)
        char_length.extend(str(len(x.strip())) for x in tokens)
        if _type == 'normal':
            labels.extend(CLASSES[TAGS[_type]] for _ in tokens)
        else:
            # BIO scheme: the first token of an entity span takes the B-
            # label and subsequent tokens take the I- label.
            labels.extend(CLASSES[B_TOKEN.format(TAGS[_type])] if i == 0
                          else CLASSES[I_TOKEN.format(TAGS[_type])]
                          for i, _ in enumerate(tokens))
    words.append(str(word_tokenizer.texts_to_sequences(["`"])[0][0]))
    characters.append(str(char_tokenizer.texts_to_sequences(["`"])[0][0]))
    labels.append(CLASSES[TAGS["normal"]])
    char_length.append("1")
    return words, characters, char_length, labels
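

# Hypothetical usage sketch: the tiny annotated document below is
# illustrative only (the 'name' tag type must exist in TAGS for it to run),
# and the tokenizers come from the build_tokenizers sketch above.
def example_usage():
    word_tokenizer, char_tokenizer = build_tokenizers(ALL_TEXTS)
    data = {
        'content': 'hello John Smith',
        'tags': {
            # 'hello' is plain text; 'John Smith' is a two-token entity that
            # should come out with B-/I- labels.
            '0': {'type': 'normal', 'end': 5, 'prev': None},
            '6': {'type': 'name', 'end': 16, 'prev': 0},
        },
    }
    words, characters, char_length, labels = transform_data(
        data, word_tokenizer, char_tokenizer)
    print(list(zip(words, labels)))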


if __name__ == '__main__':
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, default='./data/train')
    args = parser.parse_args()
    data = [
        filename for filename in os.listdir(args.input)
        if filename.endswith('.json')
    ]
    # Build the corpus the tokenizers are fitted on: one cleaned,
    # sentinel-wrapped ('` ... `') line per annotated document.
    with open(os.path.join(os.curdir, ALL_TEXTS), 'w') as file:
        for filename in data:
            with open(os.path.join(args.input, filename), 'r') as in_file:
                in_data = json.load(in_file)
                # The trailing newline keeps documents from consecutive
                # input files on separate lines.
                file.write('\n'.join(
                    '` {} `'.format(clean_text(x['content'])[0])
                    for x in in_data) + '\n')
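
# With this file saved as, e.g., make_corpus.py (the name is hypothetical),
# the corpus can be regenerated with:
#   python make_corpus.py --input ./data/train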