Example #1
import func  # project-local helper module from the original repository
import numpy as np
import tensorflow as tf


def crf_process_data(df, max_num, tokenizer_path, tokenizer_content,
                     path_max_len, con_max_len, OOM_Split):
    '''
    Convert the parsed page DataFrame into padded numpy feature arrays,
    word-embedding index arrays, and one-hot labels for the CRF model.
    '''
    # Number of leaf nodes per page and the largest label id in the data.
    num, _ = func.node_num(df['Leafnode'])
    _, max_label = func.load_data_num(df, True)
    num_cols = [
        'Leafnode', 'PTypeSet', 'TypeSet', 'Contentid', 'Pathid', 'Simseqid'
    ]
    features = []
    word_features = []
    # Normalize the raw content text before fitting, so the fitted
    # vocabulary matches the sequences that are encoded below.
    df['Content'] = df['Content'].astype(str)
    df['Content'] = df['Content'].str.replace(
        r'/|\.|\?|:|=|,|<|>|&|@|\+|-|#|~|\|', ' ', regex=True)
    tokenizer_path.fit_on_texts(df['Path'])
    tokenizer_content.fit_on_texts(df['Content'])
    path_encoded = tokenizer_path.texts_to_sequences(df['Path'])
    content_encoded = tokenizer_content.texts_to_sequences(df['Content'])
    path_pad = tf.keras.preprocessing.sequence.pad_sequences(
        path_encoded, maxlen=path_max_len, padding='post')
    content_pad = tf.keras.preprocessing.sequence.pad_sequences(
        content_encoded, maxlen=con_max_len, padding='post')
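    # For illustration: pad_sequences pads or truncates each sequence to
    # maxlen, e.g. pad_sequences([[3, 1]], maxlen=4, padding='post')
    # yields [[3, 1, 0, 0]].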

    word_cols = [path_pad, content_pad]
    word_max_len = [path_max_len, con_max_len]

    # Pad each numeric per-node column out to max_num nodes and add a
    # trailing feature axis so the columns can be concatenated.
    for col in num_cols:
        padded = np.array(func.node_data(df[col], num, max_num)).astype('int32')
        features.append(np.expand_dims(padded, -1))

    for seq, seq_len in zip(word_cols, word_max_len):
        word_features.append(
            np.array(func.node_emb(seq, num, seq_len, max_num)).astype('int32'))
    label_array = np.array(func.label_padding(df['Label'], num,
                                              max_num)).astype('int32')
    feature = np.concatenate(features, -1)

    # Split each page along the node axis by OOM_Split so the batches
    # are small enough to avoid running out of memory.
    feature = np.reshape(feature, [-1, int(max_num / OOM_Split), len(num_cols)])
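    # e.g. with 10 pages, max_num=128 and OOM_Split=2, `feature` goes
    # from shape (10, 128, 6) to (20, 64, 6).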

    word = [
        np.reshape(w, [-1, int(max_num / OOM_Split), w_len])
        for w, w_len in zip(word_features, word_max_len)
    ]

    feature = feature.astype('float32')
    # One-hot encode every node label, then reshape to match the
    # OOM-split batches.
    label = []
    for page in range(label_array.shape[0]):
        for node in range(label_array.shape[1]):
            label.append(func.one_of_n(label_array[page][node], max_label + 1))
    y_onehot = np.reshape(np.array(label),
                          [-1, int(max_num / OOM_Split), max_label + 1])
    return feature, word, y_onehot, max_label
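
A minimal usage sketch follows, assuming the input DataFrame carries the columns referenced above ('Leafnode', 'Path', 'Content', 'Label', and so on) and that the `func` helpers behave as in the original repository; the file name and the size parameters are hypothetical.

import pandas as pd
import tensorflow as tf

# Hypothetical input; the real CSV comes from the repository's own
# preprocessing step and must contain the columns used above.
df = pd.read_csv('train.csv')

tok_path = tf.keras.preprocessing.text.Tokenizer()
tok_content = tf.keras.preprocessing.text.Tokenizer()

feature, word, y_onehot, max_label = crf_process_data(
    df, max_num=128, tokenizer_path=tok_path, tokenizer_content=tok_content,
    path_max_len=30, con_max_len=50, OOM_Split=2)
# feature has shape (pages * OOM_Split, max_num / OOM_Split, 6)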
Example #2
import glob
import os
import time

# `prepare` and `func` are project-local modules; `CRFSuite_process_data`
# and `model` are defined elsewhere in the original repository.
import func
import prepare

if __name__ == "__main__":
    model_name = "crfsuite"
    current_path = os.path.join(os.path.expanduser("~"), "jupyter",
                                "Sequence_Labeling_Wrapper_Verification",
                                "data")
    data_path = os.path.join(current_path, "data")
    # Count how many Set-* directories the data folder contains.
    set_total = len(glob.glob(os.path.join(data_path, "Set-*")))
    print("Set:", set_total)
    # Generate the training and test data files
    train_data, Set_dict = prepare.train_file_generate(set_total, current_path)
    test_data = prepare.test_file_generate(current_path)
    max_num_train, max_label_train = func.load_data_num(train_data, True)
    max_num_test = func.load_data_num(test_data, False)
    max_num = max(max_num_train, max_num_test)
    # Invert Set_dict so values map back to keys, e.g. {'Title': 0} -> {0: 'Title'}.
    col_set_dict = dict(map(reversed, Set_dict.items()))
    feature_train, label_train, out_train = CRFSuite_process_data(
        train_data, max_num_train, max_label_train)
    feature_train = feature_train.tolist()
    label_train = label_train.tolist()
    X_train = [func.sent2features(feature_train)]
    y_train = [func.sent2labels(label_train)]

    # Define model
    crf = model()

    # Start training
    start = time.time()
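
The `func.sent2features` and `func.sent2labels` helpers are not shown in this example. A plausible minimal sketch of what they might look like, assuming python-crfsuite's dict-per-node feature format; the bodies below are hypothetical, not the repository's actual code.

def sent2features(rows):
    # python-crfsuite expects one dict of string-keyed features per node.
    return [{'f{}'.format(i): str(v) for i, v in enumerate(row)}
            for row in rows]

def sent2labels(labels):
    # CRFSuite label sequences must be strings.
    return [str(label) for label in labels]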