def test_split_on_first_whitespace():
    """Exercise utils.split_on_first_whitespace on typical and edge inputs."""
    cases = [
        # Only two words.
        ('hello world', ('hello', 'world')),
        # More than two words: split happens only at the first gap.
        ('hello world two', ('hello', 'world two')),
        # Should not fail on empty string.
        ('', ('', '')),
    ]
    for text, expected in cases:
        assert utils.split_on_first_whitespace(text) == expected
def build_LM(in_file):
    """Build a language model for each label found in *in_file*.

    Each line in *in_file* contains a label and its text separated by
    whitespace (the original note says: a label and a URL separated by
    a tab).

    Args:
        in_file: path to the training file.

    Returns:
        dict mapping label -> model.Model. After counting, every model
        is told about every gram seen across *all* labels via
        register_gram, so all models share one vocabulary.
    """
    # Fix: original used the Python-2 `print` statement, a SyntaxError
    # under Python 3.
    print('Building language models...')
    tokenizer = get_tokenizer()
    language_models = {}
    all_grams = set()
    with open(in_file) as in_file_contents:
        for line in in_file_contents:
            # NOTE(review): `line` is not stripped, so the trailing
            # newline reaches the tokenizer — preserved from the
            # original; confirm the tokenizer tolerates it.
            lang, text = utils.split_on_first_whitespace(line)
            # setdefault returns the stored model, avoiding the
            # original's redundant second dict lookup.
            language_model = language_models.setdefault(lang, model.Model())
            for gram in tokenizer(text):
                all_grams.add(gram)
                language_model.incr_gram_count(gram)
    # Make every model aware of the full cross-label vocabulary.
    for language_model in language_models.values():
        for gram in all_grams:
            language_model.register_gram(gram)
    return language_models