Пример #1
0
def convert_pos_to_idx(X, tags_fn, y=False):
    """Replace tags with their positions in the tag list loaded from a file.

    Args:
        X: When ``y`` is false, an iterable of tag sequences; when ``y`` is
            true, a flat iterable of tags.
        tags_fn: Path passed to ``load_text_file`` to obtain the tag list.
        y: Selects the flat (label) mode instead of the nested mode.

    Returns:
        A ``(indices, size)`` tuple. In nested mode every index is shifted by
        one and ``size`` is the tag count plus one (index 0 is never produced
        -- presumably reserved for padding; confirm with callers). In flat
        mode indices are zero-based and ``size`` is the tag count.

    Raises:
        ValueError: If a tag is not present in the loaded tag list.
    """
    vocabulary = load_text_file(tags_fn)

    if y:
        labels = [vocabulary.index(tag) for tag in X]
        return labels, len(vocabulary)

    sequences = [[vocabulary.index(tag) + 1 for tag in row] for row in X]
    return sequences, len(vocabulary) + 1
Пример #2
0
def convert_vrm_to_da():
    """Print the frequency of each collapsed label in the VRM data file.

    Every entry loaded from ``../dataset/vrm/vrm-single-vrm.txt`` is reduced
    to a single letter according to its ending: ``D`` -> ``d``, ``E`` -> ``e``,
    ``K`` -> ``k``; any other ending becomes ``x``. The sorted
    ``(label, count)`` pairs are printed and nothing is returned.
    """
    # Checked in order; the first matching suffix wins.
    suffix_labels = (('D', 'd'), ('E', 'e'), ('K', 'k'))
    entries = load_text_file('../dataset/vrm/vrm-single-vrm.txt')

    labels = [
        next((label for suffix, label in suffix_labels
              if entry.endswith(suffix)), 'x')
        for entry in entries
    ]

    print(sorted(Counter(labels).items()))
Пример #3
0
def convert_sent_to_vectors(sent,
                            word_vec,
                            pos_dict_fn,
                            offset=3,
                            embedding_len=300,
                            sent_len=21):
    """Convert a sentence of ``word/POS`` tokens into feature vectors.

    For every kept token the word embedding is concatenated with the encoding
    returned by ``convert_one_hot_single`` for the token's POS index.

    Args:
        sent: Sequence of ``"word/POS"`` strings.
        word_vec: Mapping from word to embedding vector; words that raise
            ``KeyError`` get an all-zero embedding of ``embedding_len``.
        pos_dict_fn: Path passed to ``load_text_file`` to obtain the POS list.
        offset: Number of trailing items of ``sent`` to ignore. ``0`` keeps
            the whole sentence.
        embedding_len: Length of the zero vector used for unknown words; also
            used for the padding width (assumes embeddings in ``word_vec``
            have this length -- confirm against the loaded model).
        sent_len: Target number of rows.

    Returns:
        If fewer than ``sent_len`` tokens are kept, a 2-D ``numpy.ndarray``
        with zero rows prepended up to ``sent_len`` rows. Otherwise a list of
        1-D arrays truncated to ``sent_len`` items.

    Raises:
        ValueError: If a token has no ``/`` separator or its POS tag is not
            in the loaded POS list.
    """
    pos_dict = load_text_file(pos_dict_fn)
    pos_len = len(pos_dict) + 1

    # ``sent[:-0]`` would be an empty slice, so treat a zero offset explicitly.
    tokens = sent[:-offset] if offset else sent

    result = []
    for item in tokens:
        # Split on the last slash only: the word itself may contain '/'.
        word, pos = item.rsplit('/', 1)
        try:
            vec_word = word_vec[word]
        except KeyError:
            vec_word = np.zeros(embedding_len)
        pos_vec = convert_one_hot_single(pos_dict.index(pos) + 1, pos_len)
        result.append(np.append(vec_word, pos_vec))

    missing = sent_len - len(result)
    if missing > 0:
        vec_pad = np.zeros((missing, embedding_len + pos_len))
        if not result:
            # np.concatenate cannot join a 2-D array with an empty list.
            return vec_pad
        return np.concatenate((vec_pad, result))
    return result[:sent_len]
Пример #4
0
        vec = np.append(
            vec_word, convert_one_hot_single(pos_dict.index(pos) + 1, pos_len))
        result.append(vec)

    if (sent_len - len(result)) > 0:
        vec_pad = np.zeros((sent_len - len(result), embedding_len + pos_len))
        return np.concatenate((vec_pad, result))
    return result[:sent_len]


# Load logger
logger = Logger('crf-wv-runner')
wv = load_word2vec_wv(embedding_fn)

# Read words
X = load_text_file(data_X_fn, as_words=True)

# Drop rows whose first token is empty. Compare by value: ``is not ''`` tests
# object identity, which is implementation-dependent for strings and triggers
# a SyntaxWarning on Python 3.8+.
X = [x for x in X if x[0] != '']
# X = extract_pos_words_list(X)

# One feature matrix per sentence, stacked into a single array.
X = np.array([
    convert_sent_to_vectors(x, wv, data_pos_tags, sent_len=sent_len) for x in X
])
print(X.shape)

# Labels: skip empty lines.
y = load_text_file(data_y_fn)
y = [x for x in y if len(x) > 0]

# Draw length plot
# length_count = get_length_count(X)
# draw_plot(list(length_count.keys()), list(length_count.values()))
Пример #5
0
# Input data and output model locations.
input_X = '../data-da/swda-data-pos_X.csv'
input_y = '../data-da/swda-data-pos_Y.csv'
model_path = '../models/181105-simple-pos.pkl'

# Fraction of the data used for training; the remainder is the test split.
train_rate = .85

my_labels = ['d', 'e', 'k', 'x']
my_tags = [
    'Disclosure', 'Statement-non-opinion', 'Acknowledge', 'Action-directive',
    'Commits', 'Rest'
]

logger = Logger('swda-simple-runner')

# Drop empty lines. Compare by value: ``is not ''`` tests object identity,
# which is implementation-dependent for strings.
X = [line for line in load_text_file(input_X) if line != '']
y = [line for line in load_text_file(input_y) if line != '']

# Sequential (unshuffled) split driven by ``train_rate``.
train_index = int(len(X) * train_rate)
X_train = X[:train_index]
y_train = y[:train_index]
X_test = X[train_index:]
y_test = y[train_index:]
print(X_train[0])

model = SimpleCrf()
model.fit(X_train, y_train)
model.save(model_path)

y_pred = model.predict(X_test)
logger.write(f'<{model_path}>')
Пример #6
0
  plt.annotate(f"90% Value: {x[i_90]}",
               xy=(x[i_90], y[i_90]), xytext=(40, 50), textcoords='offset points', arrowprops=dict(arrowstyle="->"))
  plt.annotate(f"95% Value: {x[i_95]}",
               xy=(x[i_95], y[i_95]), xytext=(40, 35), textcoords='offset points', arrowprops=dict(arrowstyle="->"))
  plt.annotate(f"99% Value: {x[i_99]}",
               xy=(x[i_99], y[i_99]), xytext=(30, 20), textcoords='offset points', arrowprops=dict(arrowstyle="->"))
  plt.annotate(f"End Value: {x[-1]}",
               xy=(x[-1], y[-1]), xytext=(-60, 70), textcoords='offset points', arrowprops=dict(arrowstyle="->"))
  plt.show()


# Load logger
# logger = Logger('crf-runner')

# Read words
X = load_text_file(data_X_fn, as_words=True)
# Drop rows whose first token is empty. Compare by value: ``is not ''`` tests
# object identity, which is implementation-dependent for strings.
X = [x for x in X if x[0] != '']
X = extract_pos_words_list(X)
# Labels: skip empty lines.
y = load_text_file(data_y_fn)
y = [x for x in y if len(x) > 0]

# Draw length plot
# length_count = get_length_count(X)
# draw_plot(list(length_count.keys()), list(length_count.values()))

# Write dict
# write_dict(X, data_pos_tags)
# write_dict(y, data_y_tags)


# Convert pos array into index array