Example #1
def test_tubeutils(self):
    """
    Unit test for helper functions in WhensMyTrain
    """
    self.assertEqual(get_line_code('Central'), 'C')
    self.assertEqual(get_line_code('Circle'), 'O')
    self.assertEqual(get_line_name('C'), 'Central')
    self.assertEqual(get_line_name('O'), 'Circle')
    # LINE_NAMES keys are (line_code, line_name) tuples, as unpacked here and
    # below; round-trip every line to check the two helpers are inverses
    for (line_code, line_name) in LINE_NAMES.keys():
        self.assertEqual(line_name, get_line_name(get_line_code(line_name)))
        self.assertEqual(line_code, get_line_code(get_line_name(line_code)))
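
# For reference, a minimal sketch (not from the original source) of the two
# helpers exercised above, assuming LINE_NAMES is a dict keyed by
# (line_code, line_name) tuples as the tuple unpacking above suggests:
def get_line_name(line_code):
    """Return the full line name for a one-letter code, e.g. 'O' -> 'Circle'"""
    return dict(LINE_NAMES.keys())[line_code]

def get_line_code(line_name):
    """Return the one-letter code for a line name, e.g. 'Circle' -> 'O'"""
    return dict((name, code) for (code, name) in LINE_NAMES.keys())[line_name]
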
import csv
import pickle

import nltk


def import_tube_xml_to_text_corpus():
    """
    Creates a corpus of text data for our parser to understand requests with
    """
    tokenizer = nltk.tokenize.regexp.WhitespaceTokenizer()
    line_phrases = [tokenizer.tokenize(line_name.lower()) for (_line_code, line_name) in LINE_NAMES.keys()]
    line_phrases = [[token for token in phrase if token not in ('&', 'and', 'city')] for phrase in line_phrases]
    station_phrases = []
    for filename in ('tube-references.csv', 'dlr-references.csv'):
        with open('./sourcedata/%s' % filename) as csvfile:
            rows = list(csv.reader(csvfile))[1:]  # Skip the CSV header row
        station_phrases += [tokenizer.tokenize(row[1].lower()) for row in rows]
    station_phrases = [[token for token in phrase if token not in ('&', 'and', 'city')] for phrase in station_phrases]

    # Organise bigram phrases - multiple wordings for stations and lines
    bigram_tokens = [[(token, 'TUBE_LINE_WORD') for token in phrase] for phrase in line_phrases]
    bigram_tokens += [[(token, 'TUBE_LINE_WORD') for token in phrase] + [('line', 'LINE')] for phrase in line_phrases]
    bigram_tokens += [[(token, 'STATION_WORD') for token in phrase] for phrase in station_phrases]
    bigram_tokens += [[('from', 'FROM')] + [(token, 'STATION_WORD') for token in phrase] for phrase in station_phrases]
    bigram_tokens += [[('to', 'TO')] + [(token, 'STATION_WORD') for token in phrase] for phrase in station_phrases]
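    # A single training "sentence" from the line above might look like
    # [('to', 'TO'), ('baker', 'STATION_WORD'), ('street', 'STATION_WORD')],
    # so the bigram tagger learns station words in the context of a TO marker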

    # Unigram phrases, as a fall-back - line words and any words that appear in stations but not line names
    line_tokens = set(token for phrase in line_phrases for token in phrase)
    station_tokens = set(token for phrase in station_phrases for token in phrase)
    unigram_tokens = [[(token, 'TUBE_LINE_WORD') for token in line_tokens]]
    unigram_tokens += [[(token, 'STATION_WORD') for token in station_tokens if token not in line_tokens]]
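    # Unigram entries are context-free, so a token that names both a line and
    # a station (e.g. 'victoria') is deliberately kept as TUBE_LINE_WORD only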

    tagging_regexes = [
        (r'^(from)$', 'FROM'),
        (r'^to(wards)?$', 'TO'),
        (r'^(and|&)$', 'AND'),
        (r'^city$', 'CITY'),
        (r'^line$', 'LINE'),
        (r'.*bound$', 'DIRECTION'),
        (r'^(please|thanks|thank|you)$', None),
        (r'^docklands( light rail(way)?)?$', 'DLR_LINE_NAME'),  # Optional group, so a bare 'docklands' token still matches
        (r'.*', 'UNKNOWN'),
    ]
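    # Back-off chain: try bigram context first, then the unigram lexicon,
    # with the regexes above as the final catch-all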
    regex_tagger = nltk.RegexpTagger(tagging_regexes)
    unigram_tagger = nltk.UnigramTagger(unigram_tokens, backoff=regex_tagger)
    bigram_tagger = nltk.BigramTagger(bigram_tokens, backoff=unigram_tagger)
    with open("./db/whensmytrain.tagger.obj", "wb") as picklefile:
        pickle.dump(bigram_tagger, picklefile)
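
# A hypothetical usage sketch (assumed, not part of the original source):
# rebuild the corpus, reload the pickled tagger and tag a tokenized request
if __name__ == '__main__':
    import_tube_xml_to_text_corpus()
    with open("./db/whensmytrain.tagger.obj", "rb") as picklefile:
        tagger = pickle.load(picklefile)
    tokens = nltk.tokenize.regexp.WhitespaceTokenizer().tokenize("central line to bank")
    print(tagger.tag(tokens))  # Expect tags roughly like TUBE_LINE_WORD, LINE, TO, STATION_WORD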