def test_tokenise_nt(self):
    cases = {
        "Don't": [WordToken("do"), WordToken("not")],
        "hasn't": [WordToken("has"), WordToken("not")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_comma(self):
    cases = {
        "I, for one.": [WordToken("i"), PunctuationToken(","), WordToken("for"),
                        WordToken("one"), PunctuationToken(".")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_ve(self):
    # I've -> I have, as there is no ambiguity
    cases = {
        "I've": [WordToken("i"), WordToken("have")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_ll(self):
    cases = {
        "I'll": [WordToken("i"), WordToken("will")],
        "Sam'll": [WordToken("sam"), WordToken("will")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_d(self):
    # he'd -> he had / he would is ambiguous, so just leave the 'd as is
    cases = {
        "It'd": [WordToken("it"), WordToken("'d")],
        "He'd": [WordToken("he"), WordToken("'d")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_bracket(self):
    cases = {
        "(I, for one.)": [PunctuationToken("("), WordToken("i"), PunctuationToken(","),
                          WordToken("for"), WordToken("one"), PunctuationToken("."),
                          PunctuationToken(")")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_s(self):
    # 's is ambiguous (has / genitive / is), so just leave it as is
    cases = {
        "It's": [WordToken("it"), WordToken("'s")],
        "He's": [WordToken("he"), WordToken("'s")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_id(self):
    # Identifier-like strings such as B456F7-3 are kept as a single token.
    cases = {
        "like B456F7-3": [WordToken("like"), WordToken("B456F7-3")],
        "like B456F7-3-like": [WordToken("like"), WordToken("B456F7-3-like")],
        "8-years-old": [WordToken("8-years-old")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_hyphen(self):
    # "eight-year-old child": spelled-out hyphenated compounds are split on the hyphens,
    # unlike the id-like "8-years-old" case (cf. test_tokenise_id).
    cases = {
        "eight-year-old child": [WordToken("eight"), PunctuationToken("-"), WordToken("year"),
                                 PunctuationToken("-"), WordToken("old"), WordToken("child")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_capital_middle(self):
    # I thought it was GREAT. -> GREAT should stay capitalised
    # I love Paris -> Paris should stay capitalised
    # Paris I love -> paris + i + love (I should go lowercase)
    cases = {
        "I thought it was GREAT": [WordToken("i"), WordToken("thought"), WordToken("it"),
                                   WordToken("was"), WordToken("GREAT")],
        "I love Paris": [WordToken("i"), WordToken("love"), WordToken("Paris")],
        "Paris I love": [WordToken("paris"), WordToken("i"), WordToken("love")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_slash(self):
    # love/hate relationship
    cases = {
        "love/hate relationship": [WordToken("love"), PunctuationToken("/"), WordToken("hate"),
                                   WordToken("relationship")],
        "this love/ hate relationship": [WordToken("this"), WordToken("love"), PunctuationToken("/"),
                                         WordToken("hate"), WordToken("relationship")],
        "weird-love /hate relationship": [WordToken("weird"), PunctuationToken("-"), WordToken("love"),
                                          PunctuationToken("/"), WordToken("hate"),
                                          WordToken("relationship")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def parse_text(self, text):
    """
    Parses a text and creates tokens.

    Args:
        text (str): A string representing a sentence.

    Returns:
        [token]: List of word and punctuation tokens.
    """
    raw_tokens = nltk.word_tokenize(text)
    tokens = []
    for raw_token in raw_tokens:
        if raw_token in self.punctuation_mapping:
            punctuation_type = self.punctuation_mapping[raw_token]
            tokens.append(PunctuationToken(raw_token, punctuation_type))
        else:
            word_token = self.process_word(raw_token)
            if word_token is None:
                continue
            tokens.append(WordToken(word_token))
    if self.POS_TAGGING:
        self.pos_tag(tokens)
    return tokens
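# Hedged usage sketch for parse_text. The hosting class is not shown in this
# excerpt, so `tokeniser` here stands for any instance providing parse_text,
# punctuation_mapping, process_word and POS_TAGGING; the sentence and the
# printed shape of the result are illustrative assumptions, not guaranteed
# behaviour of the implementation above.
def _example_parse_text(tokeniser):
    tokens = tokeniser.parse_text("I love Paris.")
    # Every element is either a WordToken or a PunctuationToken.
    for token in tokens:
        print("{} {}".format(type(token).__name__, token))
    return tokens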
def test_period(self):
    # Ph.D -> Ph.D
    # U.S.A. -> U.S.A + period if end of sentence, U.S.A if middle
    cases = {
        "I have a Ph.D.": [WordToken("i"), WordToken("have"), WordToken("a"),
                           WordToken("Ph.D"), PunctuationToken(".")],
        "Make U.K. great again.": [WordToken("make"), WordToken("U.K"), PunctuationToken("."),
                                   WordToken("great"), WordToken("again"), PunctuationToken(".")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def train(self, training_docs, classes_count, params={
        "smooth": 0.2, "neg_scope": compute_neg_punc, "bulk": False}):
    """training_docs: list of (tokens, label) pairs, where label is the class index."""
    self.__smooth_constant = params["smooth"]
    try:
        use_stopwords = params["stopwords"]
    except KeyError:
        use_stopwords = False
    stopwords = compute_stopwords_list()

    # p(c) -> count of documents classified as c / total docs
    # p(f|c) ->
    #     (count of f in documents classified as c + smooth)
    #     / (total tokens in documents classified as c + (vocab in class c + unseen vocab) * smooth)
    # As we keep the vocabulary for each class separate, use an array of dictionaries:
    # vocabs[class][word] = frequency of word in the class
    count_docs_per_class = [0] * classes_count
    total_docs = len(training_docs)
    # Total tokens per class
    total_tokens = [0] * classes_count
    # Array of vocabularies for each class.
    # Note: [{}] * n would create a list of references to the same dictionary.
    vocabs = [{} for i in xrange(0, classes_count)]
    # Array of vocabulary size for each class
    vocab_sizes = [0] * classes_count

    # Populate:
    #   total_tokens[i]  - increment for each token
    #   vocabs[i][token] - 1 if unseen, increment if seen
    #   vocab_sizes[i]   - increment for each unseen token
    for tokens, label in training_docs:
        vocab = vocabs[label]
        count_docs_per_class[label] += 1
        other_labels = [l for l in xrange(0, classes_count) if l != label]
        neg_array = params["neg_scope"](tokens, params["neg_words"], *params["scope_arg"])
        assert len(tokens) == len(neg_array)
        for i in xrange(0, len(tokens)):
            token = tokens[i]
            if isinstance(token, PunctuationToken):
                continue
            if use_stopwords and token.value in stopwords:
                continue
            negated = neg_array[i]
            if negated:
                neg_token = token
                token = WordToken("NOT_{}".format(token.value))
            else:
                neg_token = WordToken("NOT_{}".format(token.value))
            freq_so_far = 0
            try:
                freq_so_far = vocab[token]
            except KeyError:
                vocab_sizes[label] += 1
            vocab[token] = freq_so_far + 1
            total_tokens[label] += 1
            if params["augment"]:
                # Also credit the negated form of the token to every other class.
                for l in other_labels:
                    other_vocab = vocabs[l]
                    neg_freq_so_far = 0
                    try:
                        neg_freq_so_far = other_vocab[neg_token]
                    except KeyError:
                        vocab_sizes[l] += 1
                    other_vocab[neg_token] = neg_freq_so_far + 1
                    total_tokens[l] += 1

    p_c = map(lambda x: x / float(total_docs), count_docs_per_class)
    self.total_tokens = total_tokens
    self.vocabs = vocabs
    self.vocab_sizes = vocab_sizes
    self.classes_count = classes_count
    self.p_c = p_c
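# Minimal sketch of the add-k (Lidstone) smoothing that train/classify rely on,
# isolated as a standalone helper for clarity. It is not part of the classifier;
# the function name is hypothetical, and the default of 0.2 simply mirrors the
# default "smooth" parameter above.
def _smoothed_likelihood(freq_in_class, total_tokens_in_class, effective_vocab_size, smooth=0.2):
    # p(f|c) = (count(f, c) + k) / (total tokens in c + k * (vocab in c + unseen vocab))
    return (freq_in_class + smooth) / float(total_tokens_in_class + smooth * effective_vocab_size)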
def classify(self, tokens, params={"smooth": 0.2, "neg_scope": compute_neg_punc}):
    try:
        use_stopwords = params["stopwords"]
    except KeyError:
        use_stopwords = False
    stopwords = compute_stopwords_list()

    best_prob = 0
    best_class = 0

    # token -> frequency in this document
    vocabs_in_file = {}
    # How many tokens of this document are unseen for each class
    unseen_vocabs_count = [0] * self.classes_count

    # Count unseen words.
    neg_array = params["neg_scope"](tokens, params["neg_words"], *params["scope_arg"])
    assert len(neg_array) == len(tokens)
    for j in xrange(0, len(tokens)):
        token = tokens[j]
        if isinstance(token, PunctuationToken):
            continue
        if use_stopwords and token.value in stopwords:
            continue
        if neg_array[j]:
            token = WordToken("NOT_{}".format(token.value))
        # If we have seen this token before in this document, increment its frequency.
        try:
            vocabs_in_file[token] += 1
        # If we haven't, then:
        #   - initialise its frequency in this document to 1
        #   - check if it was seen in a document of class i; if not, increment unseen_vocabs_count[i]
        except KeyError:
            vocabs_in_file[token] = 1
            for i in xrange(0, self.classes_count):
                if token not in self.vocabs[i]:
                    unseen_vocabs_count[i] += 1

    for i in xrange(0, self.classes_count):
        prob = Decimal(self.p_c[i])
        w = self.vocab_sizes[i] + unseen_vocabs_count[i]
        for j in xrange(0, len(tokens)):
            token = tokens[j]
            if neg_array[j]:
                token = WordToken("NOT_{}".format(token.value))
            try:
                freq_in_c = self.vocabs[i][token]
            except KeyError:
                freq_in_c = 0
            # p(f|c) ->
            #     (count of f in documents classified as c + smooth)
            #     / (total tokens in documents classified as c + (vocab in class c + unseen vocab) * smooth)
            p_f_c = Decimal(
                (freq_in_c + self.__smooth_constant)
                / float(self.total_tokens[i] + self.__smooth_constant * w))
            prob *= p_f_c
        if prob > best_prob:
            best_class = i
            best_prob = prob

    return best_class
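# Hedged end-to-end sketch of how train and classify are wired together.
# Assumptions: `classifier` is an instance of the class that owns the two
# methods above, `training_docs` is a list of (tokens, label) pairs as expected
# by train, and the "neg_words"/"scope_arg" values shown here are placeholders;
# adjust them to the actual negation helpers in this repo (compute_neg_punc is
# the default already used in the signatures above).
def _example_train_and_classify(classifier, training_docs):
    params = {
        "smooth": 0.2,
        "neg_scope": compute_neg_punc,       # negation-scope helper, as in the defaults above
        "neg_words": ["not", "n't", "no"],   # assumed negation cue list
        "scope_arg": [],                     # extra positional args forwarded to neg_scope
        "augment": False,
        "bulk": False,
    }
    classifier.train(training_docs, classes_count=2, params=params)
    test_tokens = [WordToken("not"), WordToken("good")]
    return classifier.classify(test_tokens, params=params)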
def test_tokenise_whitespace(self):
    cases = {
        "an apple.": [WordToken("an"), WordToken("apple"), PunctuationToken(".")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_i(self):
    cases = {
        "well I think": [WordToken("well"), WordToken("i"), WordToken("think")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected