def convert_pos_to_idx(X, tags_fn, y=False):
    """Translate POS-tag strings into integer indices.

    Args:
        X: when ``y`` is False, an iterable of tag sequences (one per
           sentence); when ``y`` is True, a flat iterable of tags.
        tags_fn: path to the tag-vocabulary file (one tag per line).
        y: set True when converting label data instead of feature data.

    Returns:
        (converted, vocab_size) — feature indices are shifted by +1 so
        index 0 stays free (e.g. for padding); label indices are 0-based.

    Raises:
        ValueError: if a tag is not present in the vocabulary file.
    """
    pos_dict = load_text_file(tags_fn)
    if y:
        # Flat label sequence: 0-based positions in the vocabulary.
        converted = [pos_dict.index(tag) for tag in X]
        return converted, len(pos_dict)
    # Nested token sequences: +1 shift reserves slot 0.
    converted = [[pos_dict.index(tag) + 1 for tag in line] for line in X]
    return converted, len(pos_dict) + 1
def convert_vrm_to_da():
    """Collapse single-letter VRM codes into dialogue-act labels and print
    their sorted frequency counts.

    A line ending in 'D'/'E'/'K' maps to 'd'/'e'/'k'; everything else
    becomes the catch-all label 'x'.
    """
    lines = load_text_file('../dataset/vrm/vrm-single-vrm.txt')
    labels = []
    for line in lines:
        for suffix, label in (('D', 'd'), ('E', 'e'), ('K', 'k')):
            if line.endswith(suffix):
                labels.append(label)
                break
        else:
            # No recognised suffix — bucket into the catch-all class.
            labels.append('x')
    print(sorted(Counter(labels).items()))
def convert_sent_to_vectors(sent, word_vec, pos_dict_fn, offset=3,
                            embedding_len=300, sent_len=21):
    """Convert a POS-tagged sentence into a fixed-length (sent_len x
    (embedding_len + pos_len)) matrix of word-embedding + one-hot-POS rows.

    Args:
        sent: sequence of "word/TAG" tokens; the trailing ``offset`` tokens
              are dropped before conversion.
        word_vec: mapping from word to embedding vector (KeyError for OOV).
        pos_dict_fn: path to the POS vocabulary file (one tag per line).
        offset: number of trailing tokens to ignore.
        embedding_len: dimensionality of the word embeddings.
        sent_len: target number of rows (pre-padded with zeros if shorter).

    Returns:
        A 2-D ndarray when zero-padding was applied; otherwise the raw list
        of per-token 1-D vectors truncated to ``sent_len`` (callers wrap the
        result in ``np.array``, which accepts both forms).
    """
    pos_dict = load_text_file(pos_dict_fn)
    pos_len = len(pos_dict) + 1  # +1 keeps one-hot slot 0 free for padding
    result = []
    for item in sent[:-offset]:
        # rsplit (not split) so a word that itself contains '/' still
        # parses: only the text after the LAST slash is the POS tag.
        # (Plain split() raised ValueError on such tokens.)
        word, pos = item.rsplit('/', 1)
        try:
            vec_word = word_vec[word]
        except KeyError:
            # Out-of-vocabulary word: use a zero embedding.
            vec_word = np.zeros(embedding_len)
        vec = np.append(
            vec_word,
            convert_one_hot_single(pos_dict.index(pos) + 1, pos_len))
        result.append(vec)
    pad_rows = sent_len - len(result)
    if pad_rows > 0:
        # Pre-pad with zero rows so every sentence matrix has sent_len rows.
        vec_pad = np.zeros((pad_rows, embedding_len + pos_len))
        return np.concatenate((vec_pad, result))
    return result[:sent_len]
# NOTE(review): this chunk opens mid-function — the statements below duplicate
# the tail of convert_sent_to_vectors() (loop body, zero-padding, returns);
# its `def` header lies outside this chunk.
        # Word embedding concatenated with the one-hot POS vector.
        vec = np.append(
            vec_word, convert_one_hot_single(pos_dict.index(pos) + 1, pos_len))
        result.append(vec)
    if (sent_len - len(result)) > 0:
        # Pre-pad with zero rows so every sentence matrix has sent_len rows.
        vec_pad = np.zeros((sent_len - len(result), embedding_len + pos_len))
        return np.concatenate((vec_pad, result))
    return result[:sent_len]


# Load logger
logger = Logger('crf-wv-runner')
wv = load_word2vec_wv(embedding_fn)

# Read words
X = load_text_file(data_X_fn, as_words=True)
# NOTE(review): `is not ''` tests identity, not equality — it only works via
# CPython string interning and warns on 3.8+; should be `x[0] != ''`.
X = [x for x in X if x[0] is not '']
# X = extract_pos_words_list(X)
# Convert each tagged sentence into a fixed-size embedding+POS matrix.
X = np.array([
    convert_sent_to_vectors(x, wv, data_pos_tags, sent_len=sent_len)
    for x in X
])
print(X.shape)
y = load_text_file(data_y_fn)
# Keep only non-empty label lines.
y = [x for x in y if len(x) > 0]

# Draw length plot
# length_count = get_length_count(X)
# draw_plot(list(length_count.keys()), list(length_count.values()))
# SWDA simple-CRF training script: load POS-tagged data, split train/test,
# fit a SimpleCrf model, persist it, and predict on the held-out split.
input_X = '../data-da/swda-data-pos_X.csv'
input_y = '../data-da/swda-data-pos_Y.csv'
model_path = '../models/181105-simple-pos.pkl'
train_rate = .85
my_labels = ['d', 'e', 'k', 'x']
my_tags = [
    'Disclosure', 'Statement-non-opinion', 'Acknowledge', 'Action-directive',
    'Commits', 'Rest'
]

logger = Logger('swda-simple-runner')

# Drop empty lines. Uses `!=` (equality) instead of the original `is not`
# identity test, which only worked through CPython string interning and
# raises a SyntaxWarning on modern Pythons.
X = list(filter(lambda x: x != '', load_text_file(input_X)))
y = list(filter(lambda x: x != '', load_text_file(input_y)))

# Chronological split at train_rate (was a hard-coded duplicate `.85`,
# silently ignoring the train_rate constant declared above).
train_index = int(len(X) * train_rate)
X_train = X[:train_index]
y_train = y[:train_index]
X_test = X[train_index:]
y_test = y[train_index:]
print(X_train[0])

model = SimpleCrf()
model.fit(X_train, y_train)
model.save(model_path)
y_pred = model.predict(X_test)
logger.write(f'<{model_path}>')
# Annotate the distribution plot with the 90/95/99-percentile and final
# values (x, y, i_90, i_95, i_99 are computed earlier, outside this chunk).
plt.annotate(f"90% Value: {x[i_90]}",
             xy=(x[i_90], y[i_90]),
             xytext=(40, 50),
             textcoords='offset points',
             arrowprops=dict(arrowstyle="->"))
plt.annotate(f"95% Value: {x[i_95]}",
             xy=(x[i_95], y[i_95]),
             xytext=(40, 35),
             textcoords='offset points',
             arrowprops=dict(arrowstyle="->"))
plt.annotate(f"99% Value: {x[i_99]}",
             xy=(x[i_99], y[i_99]),
             xytext=(30, 20),
             textcoords='offset points',
             arrowprops=dict(arrowstyle="->"))
plt.annotate(f"End Value: {x[-1]}",
             xy=(x[-1], y[-1]),
             xytext=(-60, 70),
             textcoords='offset points',
             arrowprops=dict(arrowstyle="->"))
plt.show()

# Load logger
# logger = Logger('crf-runner')

# Read words
X = load_text_file(data_X_fn, as_words=True)
# `!=` replaces the original `is not ''` identity comparison, which relied
# on CPython string interning and triggers a SyntaxWarning on Python 3.8+.
X = [x for x in X if x[0] != '']
X = extract_pos_words_list(X)
y = load_text_file(data_y_fn)
# Keep only non-empty label lines.
y = [x for x in y if len(x) > 0]

# Draw length plot
# length_count = get_length_count(X)
# draw_plot(list(length_count.keys()), list(length_count.values()))

# Write dict
# write_dict(X, data_pos_tags)
# write_dict(y, data_y_tags)

# Convert pos array into index array