def test_tubeutils(self):
    """ Unit test for helper functions in WhensMyTrain """
    self.assertEqual(get_line_code('Central'), 'C')
    self.assertEqual(get_line_code('Circle'), 'O')
    self.assertEqual(get_line_name('C'), 'Central')
    self.assertEqual(get_line_name('O'), 'Circle')
    # Round-trip every line both ways: name -> code -> name, and code -> name -> code
    for (line_code, line_name) in LINE_NAMES.items():
        self.assertEqual(line_name, get_line_name(get_line_code(line_name)))
        self.assertEqual(line_code, get_line_code(get_line_name(line_code)))
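# For reference, a minimal sketch of the two helpers exercised above, assuming
# LINE_NAMES is a dict mapping one-letter line codes to full line names (the
# structure the round-trip loop implies). Illustrative only, not the project's
# actual implementation:
#
#     LINE_NAMES = {'B': 'Bakerloo', 'C': 'Central', 'O': 'Circle'}
#
#     def get_line_name(line_code):
#         """ Return the full line name for a one-letter line code """
#         return LINE_NAMES[line_code]
#
#     def get_line_code(line_name):
#         """ Return the one-letter code for a full line name """
#         return [code for (code, name) in LINE_NAMES.items() if name == line_name][0]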
def import_tube_xml_to_text_corpus():
    """ Creates a corpus of text data for our parser to understand requests with """
    tokenizer = nltk.tokenize.regexp.WhitespaceTokenizer()

    # Tokenize lowercased line and station names, dropping filler words that
    # carry no information ('&', 'and', 'city')
    line_phrases = [tokenizer.tokenize(line_name.lower()) for (_line_code, line_name) in LINE_NAMES.items()]
    line_phrases = [[token for token in phrase if token not in ('&', 'and', 'city')] for phrase in line_phrases]
    station_phrases = []
    for filename in ('tube-references.csv', 'dlr-references.csv'):
        with open('./sourcedata/%s' % filename) as csv_file:
            rows = list(csv.reader(csv_file))[1:]  # Skip the header row
        station_phrases += [tokenizer.tokenize(row[1].lower()) for row in rows]
    station_phrases = [[token for token in phrase if token not in ('&', 'and', 'city')] for phrase in station_phrases]

    # Organise bigram phrases - multiple wordings for stations and lines
    bigram_tokens = [[(token, 'TUBE_LINE_WORD') for token in phrase] for phrase in line_phrases]
    bigram_tokens += [[(token, 'TUBE_LINE_WORD') for token in phrase] + [('line', 'LINE')] for phrase in line_phrases]
    bigram_tokens += [[(token, 'STATION_WORD') for token in phrase] for phrase in station_phrases]
    bigram_tokens += [[('from', 'FROM')] + [(token, 'STATION_WORD') for token in phrase] for phrase in station_phrases]
    bigram_tokens += [[('to', 'TO')] + [(token, 'STATION_WORD') for token in phrase] for phrase in station_phrases]

    # Unigram phrases, as a fall-back - line words and any words that appear in stations but not line names
    line_tokens = set(reduce(lambda a, b: a + b, line_phrases))
    station_tokens = set(reduce(lambda a, b: a + b, station_phrases))
    unigram_tokens = [[(token, 'TUBE_LINE_WORD') for token in line_tokens]]
    unigram_tokens += [[(token, 'STATION_WORD') for token in station_tokens if token not in line_tokens]]

    # Regular expressions as a final fall-back for grammar words, directions and pleasantries
    tagging_regexes = [
        (r'^from$', 'FROM'),
        (r'^to(wards)?$', 'TO'),
        (r'^(and|&)$', 'AND'),
        (r'^city$', 'CITY'),
        (r'^line$', 'LINE'),
        (r'.*bound$', 'DIRECTION'),
        (r'^(please|thanks|thank|you)$', None),
        (r'^docklands( light rail(way)?)?$', 'DLR_LINE_NAME'),  # Space inside the group so bare 'docklands' matches
        (r'.*', 'UNKNOWN'),
    ]

    # Chain the taggers: bigram first, falling back to unigram, then to the regexes
    regex_tagger = nltk.RegexpTagger(tagging_regexes)
    unigram_tagger = nltk.UnigramTagger(unigram_tokens, backoff=regex_tagger)
    bigram_tagger = nltk.BigramTagger(bigram_tokens, backoff=unigram_tagger)
    with open("./db/whensmytrain.tagger.obj", "wb") as tagger_file:  # Pickle requires binary mode
        pickle.dump(bigram_tagger, tagger_file)
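# A quick smoke test of the trained tagger; assumes import_tube_xml_to_text_corpus()
# has already been run, and the sample query below is purely illustrative
if __name__ == '__main__':
    import pickle
    import nltk

    with open("./db/whensmytrain.tagger.obj", "rb") as tagger_file:
        saved_tagger = pickle.load(tagger_file)
    tokens = nltk.tokenize.regexp.WhitespaceTokenizer().tokenize("victoria line to brixton please")
    print(saved_tagger.tag(tokens))
    # Expect something like: [('victoria', 'TUBE_LINE_WORD'), ('line', 'LINE'),
    #                         ('to', 'TO'), ('brixton', 'STATION_WORD'), ('please', None)]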