# Imports assumed from the original module header: numpy, TensorFlow, and the
# repo-local helper modules `func` and `prepare`.
import glob
import os
import time

import numpy as np
import tensorflow as tf

import func
import prepare


def crf_process_data(df, max_num, tokenizer_path, tokenizer_content,
                     path_max_len, con_max_len, OOM_Split):
    '''
    Convert the loaded page/node DataFrame into padded numpy arrays: numeric
    node features, tokenized path/content sequences, and one-hot labels.
    '''
    num, _ = func.node_num(df['Leafnode'])
    _, max_label = func.load_data_num(df, True)
    num_cols = [
        'Leafnode', 'PTypeSet', 'TypeSet', 'Contentid', 'Pathid', 'Simseqid'
    ]
    features = []
    word_features = []

    # Tokenize the DOM paths.
    tokenizer_path.fit_on_texts(df['Path'])
    path_encoded = tokenizer_path.texts_to_sequences(df['Path'])

    # Clean the node content *before* fitting the content tokenizer, so the
    # fitted vocabulary matches the sequences encoded below (the original code
    # fit on the raw text but encoded the cleaned text). The cast to str comes
    # first because .str.replace requires string values, and regex=True is
    # required for the alternation pattern on pandas >= 2.0.
    df['Content'] = df['Content'].astype(str).str.replace(
        r'/|\.|\?|:|=|,|<|>|&|@|\+|-|#|~|\|', ' ', regex=True)
    tokenizer_content.fit_on_texts(df['Content'])
    content_encoded = tokenizer_content.texts_to_sequences(df['Content'])

    # Pad every sequence to a fixed length (zero post-padding).
    path_pad = tf.keras.preprocessing.sequence.pad_sequences(
        path_encoded, path_max_len, padding='post')
    content_pad = tf.keras.preprocessing.sequence.pad_sequences(
        content_encoded, con_max_len, padding='post')
    word_cols = [path_pad, content_pad]
    word_max_len = [path_max_len, con_max_len]

    # Group node-level numeric features per page and pad each page to max_num
    # nodes; each feature becomes a trailing channel dimension.
    for c in range(len(num_cols)):
        features.append(
            np.array(func.node_data(df[num_cols[c]], num,
                                    max_num)).astype('int32'))
        features[c] = np.expand_dims(features[c], -1)
    for c in range(len(word_cols)):
        word_features.append(
            np.array(func.node_emb(word_cols[c], num, word_max_len[c],
                                   max_num)).astype('int32'))
    label_array = np.array(func.label_padding(df['Label'], num,
                                              max_num)).astype('int32')
    feature = np.concatenate(features, -1)

    # OOM: split each page of max_num nodes into OOM_Split shorter sequences
    # so a batch fits in memory; max_num must be divisible by OOM_Split.
    assert max_num % OOM_Split == 0
    feature = np.reshape(feature, [-1, max_num // OOM_Split, len(num_cols)])
    word = [
        np.reshape(word_features[c],
                   [-1, max_num // OOM_Split, word_max_len[c]])
        for c in range(len(word_cols))
    ]
    feature = feature.astype('float32')

    # One-hot encode every node label, then regroup by split sequence.
    label = []
    for page in range(label_array.shape[0]):
        for node in range(label_array.shape[1]):
            label.append(func.one_of_n(label_array[page][node], max_label + 1))
    y_onehot = np.reshape(np.array(label),
                          [-1, max_num // OOM_Split, max_label + 1])
    return feature, word, y_onehot, max_label
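# Illustrative call (a sketch, not from the original script): the tokenizer
# settings, max_num, sequence lengths, and OOM_Split below are assumptions for
# demonstration; `df` is the page/node DataFrame produced by the prepare step.
#
#   tokenizer_path = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
#   tokenizer_content = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
#   feature, word, y_onehot, max_label = crf_process_data(
#       df, max_num=1024, tokenizer_path=tokenizer_path,
#       tokenizer_content=tokenizer_content,
#       path_max_len=30, con_max_len=50, OOM_Split=4)
#   # feature: (pages * 4, 256, 6) float32
#   # word[0]: (pages * 4, 256, 30) int32; word[1]: (pages * 4, 256, 50) int32
#   # y_onehot: (pages * 4, 256, max_label + 1) one-hot labels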
if __name__ == "__main__":
    # Count how many Set-* data directories are available.
    model_name = "crfsuite"
    current_path = os.path.join(os.path.expanduser("~"), "jupyter",
                                "Sequence_Labeling_Wrapper_Verification",
                                "data")
    data_path = os.path.join(current_path, "data")
    set_total = len(glob.glob(os.path.join(data_path, "Set-*")))
    print("Set:", set_total)

    # Generate the training and test files and read their dimensions.
    train_data, Set_dict = prepare.train_file_generate(set_total, current_path)
    test_data = prepare.test_file_generate(current_path)
    max_num_train, max_label_train = func.load_data_num(train_data, True)
    max_num_test = func.load_data_num(test_data, False)
    max_num = max(max_num_train, max_num_test)
    col_set_dict = dict(map(reversed, Set_dict.items()))

    # Convert the training data into CRFsuite feature/label sequences.
    feature_train, label_train, out_train = CRFSuite_process_data(
        train_data, max_num_train, max_label_train)
    feature_train = feature_train.tolist()
    label_train = label_train.tolist()
    X_train = [func.sent2features(feature_train)]
    y_train = [func.sent2labels(label_train)]

    # Define model
    crf = model()

    # Start training
    start = time.time()
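    # The excerpt ends at the timer. A minimal sketch of how training likely
    # proceeds from here, assuming `model()` returns a sklearn-crfsuite CRF
    # (the factory's actual hyperparameters are not shown in this excerpt):
    #
    #   import sklearn_crfsuite
    #
    #   def model():
    #       return sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
    #                                   max_iterations=100,
    #                                   all_possible_transitions=True)
    #
    #   crf.fit(X_train, y_train)
    #   print("Training time: %.1f s" % (time.time() - start))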