def test_ngrams(self): ngrams = generate_ngrams(2, self.corpus) for first, second in [ngrams[i:i+2] for i in range(len(ngrams) - 2)]: self.assertEquals(first.split()[-1], second.split()[0])
from glob import glob from os.path import join as pjoin from sys import argv, exit from dump_speaker_data import extract_feature_utterances from sample_size_test import generate_ngrams if __name__ == "__main__": try: files = glob(pjoin(argv[1], "*.xml")) except IndexError: print "You must include the directory containing xml files" exit() utterances = extract_feature_utterances(files, "pos", "MOT") bigrams = [generate_ngrams(2, ut) for ut in utterances if len(ut) > 1] bigrams = reduce(list.__add__, bigrams, []) poses = set([word for ut in utterances for word in ut]) print "# Utterances:", len(utterances) print "# Bigrams:", len(bigrams) print "# Bigrams Types", len(set(bigrams)) print "# of parts of speech tags:", len(poses) """ WORDS # Utterances: 10445 # Bigrams: 36079 # Bigrams Types 10162 POS
def test_statistic_funs(self): ngrams = generate_ngrams(3, self.corpus) self.assertEqual(dice_stat(ngrams, ngrams), 1) self.assertEqual(dice_stat(ngrams, []), 0)