def evaluate(input_sample, gt_grams):
    """Print char-level n-gram scores of a sample against reference n-grams.

    The sample is loaded untokenized with ``load_sample``, its four n-gram
    collections are extracted with ``get_grams``, and each collection is
    scored against the matching reference collection by ``percentage_real``.

    Args:
        input_sample: Forwarded unchanged to ``load_sample``; presumably a
            path to the generated sample -- confirm against that helper.
        gt_grams: Iterable of exactly four reference n-gram collections, in
            the order (unigrams, bigrams, trigrams, quad-grams).

    Returns:
        None. One line per n-gram order is written to stdout.

    Raises:
        ValueError: If ``gt_grams`` or the result of ``get_grams`` does not
            unpack into exactly four items.
    """
    # char level evaluation
    sample_lines = load_sample(input_sample, tokenize=False)
    u_samples, b_samples, t_samples, q_samples = get_grams(sample_lines)
    u_real, b_real, t_real, q_real = gt_grams

    reports = (
        ("Unigrams: %f", u_samples, u_real),
        ("Bigrams: %f", b_samples, b_real),
        ("Trigrams: %f", t_samples, t_real),
        ("Quad grams: %f", q_samples, q_real),
    )
    for template, sampled, real in reports:
        # Single-argument parenthesised print emits the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(template % percentage_real(sampled, real))
def get_gt_grams_cached(lines, dataset='training'):
    """Return the n-grams of ``lines``, backed by a pickle cache on disk.

    The cache file lives under ``FLAGS.PICKLE_PATH`` and is named
    ``true-char-ngrams.pkl``, prefixed with ``heldout_`` when ``dataset`` is
    ``'heldout'``. If that file exists its contents are returned as-is;
    otherwise the n-grams are computed with ``get_grams``, written to the
    cache file, and returned.

    NOTE: the cache file is chosen from ``dataset`` and ``FLAGS.PICKLE_PATH``
    only -- ``lines`` plays no part in the lookup, so an existing cache file
    is returned even if it was built from different lines.

    Args:
        lines: Input passed to ``get_grams`` on a cache miss.
        dataset: ``'heldout'`` selects the held-out cache file; every other
            value (default ``'training'``) selects the unprefixed file.

    Returns:
        Whatever ``get_grams`` produced, either freshly computed or loaded
        back from the pickle cache.
    """
    prefix = 'heldout_' if dataset == 'heldout' else ''
    cache_path = FLAGS.PICKLE_PATH + '/' + prefix + 'true-char-ngrams.pkl'

    # Cache hit: hand back the previously pickled n-grams.
    if os.path.exists(cache_path):
        return model_and_data_serialization.load_picklized(cache_path)

    # Cache miss: compute once and persist for later calls.
    computed = get_grams(lines)
    model_and_data_serialization.save_picklized(computed, cache_path)
    return computed