def evaluate_rule_based(input_file='silabe.test.xml'):
    """Score the rule-based syllabifier ``syll`` against a reference file.

    The reference strings are read with ``syllabifications(input_file)``
    (only the second item of each yielded pair is used) and stripped of
    surrounding whitespace.  Each one has its ``'-'`` characters removed
    and is handed to ``syll``; the result is compared with the reference.

    Returns a 3-tuple:
        * mean of exact string matches between reference and prediction,
        * ``accuracy_score`` over per-split labels,
        * ``f1_score`` over per-split labels,
    where a per-split label is the boolean "third item of the
    ``all_splits`` triple equals 0".
    """
    gold_words = [entry.strip() for _, entry in syllabifications(input_file)]
    guessed_words = [syll(gold.replace('-', '')) for gold in gold_words]

    # Word-level score: a word counts only if the whole string matches.
    exact_matches = [gold == guess
                     for gold, guess in zip(gold_words, guessed_words)]
    word_accuracy = np.mean(exact_matches)

    # Split-level score: walk both words' splits in lockstep and keep
    # only the label of each split.
    label_pairs = []
    for gold, guess in zip(gold_words, guessed_words):
        for gold_split, guess_split in zip(all_splits(gold),
                                           all_splits(guess)):
            _, _, gold_label = gold_split
            _, _, guess_label = guess_split
            label_pairs.append((gold_label, guess_label))

    gold_labels, guess_labels = zip(*label_pairs)
    # Label 0 is the positive class for both metrics.
    gold_is_zero = np.array(gold_labels) == 0
    guess_is_zero = np.array(guess_labels) == 0

    split_accuracy = accuracy_score(gold_is_zero, guess_is_zero)
    split_f1 = f1_score(gold_is_zero, guess_is_zero)
    return (word_accuracy, split_accuracy, split_f1)
def evaluate_rule_based(input_file='silabe.test.xml'):
    """Evaluate ``syll`` on the reference strings found in ``input_file``.

    References come from ``syllabifications(input_file)``; predictions are
    produced by calling ``syll`` on each reference with every ``'-'``
    removed.

    Returns ``(word_accuracy, split_accuracy, split_f1)``:
        word_accuracy  -- fraction of predictions equal to their reference
        split_accuracy -- ``accuracy_score`` of the per-split labels
        split_f1       -- ``f1_score`` of the per-split labels
    Per-split labels are taken from the third item of each ``all_splits``
    triple and binarised as ``label == 0``.
    """
    references = []
    for _, raw in syllabifications(input_file):
        references.append(raw.strip())
    predictions = [syll(ref.replace('-', '')) for ref in references]

    # Materialise the pairing once; it is walked twice below.
    aligned = list(zip(references, predictions))

    word_accuracy = np.mean([ref == hyp for ref, hyp in aligned])

    pairs = []
    for ref, hyp in aligned:
        for (_, _, ref_label), (_, _, hyp_label) in zip(all_splits(ref),
                                                       all_splits(hyp)):
            pairs.append((ref_label, hyp_label))

    ref_labels, hyp_labels = zip(*pairs)
    # Both metrics are computed on "label is 0" indicators.
    ref_flags = np.array(ref_labels) == 0
    hyp_flags = np.array(hyp_labels) == 0

    return (
        word_accuracy,
        accuracy_score(ref_flags, hyp_flags),
        f1_score(ref_flags, hyp_flags),
    )
def word_to_feature_dict(word, stress, size=2, unigram=False):
    """Build feature dicts and label arrays for a single word.

    Parameters:
        word    -- string handed to ``all_splits``; its ``'-'`` characters
                   are removed before the per-character features are built.
        stress  -- sequence converted with ``np.array(stress, dtype=int)``.
        size    -- window size: number of characters (unigram mode) or the
                   maximum n-gram length and reach (n-gram mode) considered
                   on each side of a split.
        unigram -- if true, emit one feature per character in the window;
                   otherwise emit every n-gram of length 1..``size`` that
                   fits inside the window.

    Returns a 4-tuple:
        * list with one feature dict per split yielded by ``all_splits``,
        * list with one ``_build_feature_dict(word_stripped, k, size,
          size)`` result per character index ``k`` of the hyphen-free word,
        * ``np.array(labels, dtype=int) + 2``,
        * ``np.array(stress, dtype=int)``.

    N-gram keys have the form ``"start-end"`` using 1-based offsets from
    the split: positive to the right, negative to the left (for example
    ``"1-2"`` and ``"-1--2"``).
    """
    x = []
    y = []
    for left, right, label in all_splits(word):
        lsz = len(left)
        y.append(label)
        features = {}
        if unigram:
            # Unigram features in window.
            # NOTE(review): left[-size:] is enumerated front to back, so
            # key "-1" is the first character of that slice rather than
            # the one adjacent to the split -- confirm this is intended.
            for k, c in enumerate(left[-size:]):
                features[str(-1 - k)] = c
            for k, c in enumerate(right[:size]):
                features[str(1 + k)] = c
        else:
            for k in xrange(size):
                width = k + 1
                for i in xrange(size - k):
                    right_feature = right[i:i + width]
                    if len(right_feature) == width:
                        features['%s-%s' % (i + 1, i + width)] = right_feature
                    # Only slice when the n-gram lies fully inside `left`.
                    # A negative start index would wrap around and, for
                    # size >= 4, could return a full-length slice taken
                    # from the wrong position.
                    left_start = lsz - i - width
                    if left_start >= 0:
                        left_feature = left[left_start:lsz - i]
                        features['%s-%s' % (-i - 1, -i - width)] = left_feature
        x.append(features)

    word_stripped = word.replace('-', '')
    char_features = [_build_feature_dict(word_stripped, k, size, size)
                     for k in xrange(len(word_stripped))]
    # Labels are shifted by +2.  The original carried a disabled binary
    # alternative here: (np.array(y) == 0).astype(int)
    return (x,
            char_features,
            np.array(y, dtype=int) + 2,
            np.array(stress, dtype=int))
def word_to_feature_dict(word, stress, size=2, unigram=False):
    """Build feature dicts and label arrays for a single word.

    Parameters:
        word    -- string handed to ``all_splits``; its ``'-'`` characters
                   are removed before the per-character features are built.
        stress  -- sequence converted with ``np.array(stress, dtype=int)``.
        size    -- window size: number of characters (unigram mode) or the
                   maximum n-gram length and reach (n-gram mode) considered
                   on each side of a split.
        unigram -- if true, emit one feature per character in the window;
                   otherwise emit every n-gram of length 1..``size`` that
                   fits inside the window.

    Returns a 4-tuple:
        * list with one feature dict per split yielded by ``all_splits``,
        * list with one ``_build_feature_dict(word_stripped, k, size,
          size)`` result per character index ``k`` of the hyphen-free word,
        * ``np.array(labels, dtype=int) + 2``,
        * ``np.array(stress, dtype=int)``.

    N-gram keys have the form ``"start-end"`` using 1-based offsets from
    the split: positive to the right, negative to the left (for example
    ``"1-2"`` and ``"-1--2"``).
    """
    x = []
    y = []
    for left, right, label in all_splits(word):
        lsz = len(left)
        y.append(label)
        features = {}
        if unigram:
            # Unigram features in window.
            # NOTE(review): left[-size:] is enumerated front to back, so
            # key "-1" is the first character of that slice rather than
            # the one adjacent to the split -- confirm this is intended.
            for k, c in enumerate(left[-size:]):
                features[str(-1 - k)] = c
            for k, c in enumerate(right[:size]):
                features[str(1 + k)] = c
        else:
            for k in xrange(size):
                ngram_len = k + 1
                for i in xrange(size - k):
                    right_feature = right[i:i + ngram_len]
                    if len(right_feature) == ngram_len:
                        features['%s-%s' % (i + 1, i + ngram_len)] = right_feature
                    # Skip n-grams that would start before the beginning
                    # of `left`: a negative start index wraps around in a
                    # slice and, for size >= 4, could yield a full-length
                    # slice taken from the wrong position.
                    start = lsz - i - ngram_len
                    if start >= 0:
                        left_feature = left[start:lsz - i]
                        features['%s-%s' % (-i - 1, -i - ngram_len)] = left_feature
        x.append(features)

    word_stripped = word.replace('-', '')
    char_features = [_build_feature_dict(word_stripped, k, size, size)
                     for k in xrange(len(word_stripped))]
    # Labels are shifted by +2.  The original carried a disabled binary
    # alternative here: (np.array(y) == 0).astype(int)
    return (x,
            char_features,
            np.array(y, dtype=int) + 2,
            np.array(stress, dtype=int))
def training_instances(syls):
    """Yield ``(left, right, label)`` for every split of every item.

    Each item of ``syls`` is stripped of surrounding whitespace and passed
    to ``all_splits``.  The two string parts of every resulting triple are
    converted with ``unicode``; the label is passed through unchanged.
    """
    for entry in syls:
        cleaned = entry.strip()
        for split in all_splits(cleaned):
            prefix, suffix, tag = split
            yield (unicode(prefix), unicode(suffix), tag)