# requires: import gensim.downloader as api
def __init__(self, fname, load=True):
    self.model = AveragedPerceptron()
    self.tagdict = {}
    self.classes = set()
    # initialise GloVe word vectors via the gensim downloader
    self.wv = api.load('glove-twitter-25')
    self.model_file = fname
    if load:
        self.load(self.model_file)
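# The snippet above loads the vectors but never shows how they are used. A
# minimal sketch of how the 25-dimensional GloVe vectors could be folded into
# the feature dict; this helper and its feature names are hypothetical, not
# part of the original snippet.
def _add_embedding_features(self, features, word):
    # api.load('glove-twitter-25') returns a gensim KeyedVectors object, so
    # membership tests and vector lookups work directly on self.wv
    if word in self.wv:
        for dim, value in enumerate(self.wv[word]):
            features['glove_%d' % dim] = float(value)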
def __init__(self, classes, tag_history):
    self.tag_history = tag_history
    self.classes = set(classes)
    self.class2model = {}
    # one binary averaged perceptron per class
    for cls in classes:
        self.class2model[cls] = AveragedPerceptron()
        self.class2model[cls].classes = set([self.NEGATIVE_CLASS, self.POSITIVE_CLASS])
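# Every snippet in this section depends on an AveragedPerceptron class that is
# never defined here. A minimal sketch follows, modelled on Matthew Honnibal's
# widely copied implementation (a nested {feature: {class: weight}} dict with
# predict / update / average_weights); decision_function is inferred from how
# the label-powerset taggers below call it. Treat this as an assumption about
# the dependency, not the exact class these taggers were trained with.
from collections import defaultdict


class AveragedPerceptron(object):
    def __init__(self, classes=None):
        # weights[feature][class] = current weight
        self.weights = {}
        self.classes = set(classes) if classes else set()
        self._totals = defaultdict(int)   # accumulated weight mass per (feature, class)
        self._tstamps = defaultdict(int)  # update count at which each (feature, class) last changed
        self.i = 0                        # number of updates seen so far

    def decision_function(self, features):
        '''Return the raw score for every class as a dict.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        return scores

    def predict(self, features):
        '''Dot-product the features and current weights and return the best class.'''
        scores = self.decision_function(features)
        # break ties deterministically via the class's string form
        return max(self.classes, key=lambda label: (scores[label], str(label)))

    def update(self, truth, guess, features):
        '''Standard perceptron update, with bookkeeping for averaging.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)

    def average_weights(self):
        '''Replace each weight with its average over all updates.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param] + (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights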
import pickle
import random
import sys
from collections import defaultdict


def _pc(n, d):
    # percentage helper, assumed from Honnibal's original implementation
    return (float(n) / d) * 100


class PerceptronTagger():
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-', '-START3-']
    END = ['-END-', '-END2-', '-END3-']
    # START = ['-START-', '-START2-']
    # END = ['-END-', '-END2-']

    def __init__(self, fname, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.model_file = fname
        if load:
            self.load(self.model_file)

    def tag(self, corpus, tokenise=False):
        '''Tags a CoNLL-U file object `corpus`, printing the tagged rows.'''
        # Assume an untokenised corpus has \n between sentences and ' ' between words
        # s_split = SentenceTokenizer().tokenise if tokenise else lambda t: t.split('\n')
        # w_split = WordTokenizer().tokenise if tokenise else lambda s: s.split()
        reading = True
        sentence = []
        line = corpus.readline()
        while reading:
            if line == '\n':  # sentence boundary: tag the buffered sentence
                prev, prev2, prev3 = self.START
                context = self.START + [self._normalise(w[1]) for w in sentence] + self.END
                for i, token in enumerate(sentence):
                    tag = self.tagdict.get(token[1])
                    if not tag:
                        # the word isn't "unambiguous", so extract features
                        features = self._get_features(i, token[1], context, prev, prev2, prev3)
                        # make the prediction
                        tag = self.model.predict(features)
                    sentence[i][3] = tag
                    prev3 = prev2
                    prev2 = prev
                    prev = tag
                # print out the tokens and their tags
                for words in sentence:
                    print('\t'.join(words))
                print()
                sentence = []
            elif line == '':  # we reached the end of the input
                reading = False
            elif line[0] == '#':  # comment line
                print(line.strip())
                line = corpus.readline()
                continue
            else:  # normal CoNLL-U line
                row = line.strip().split('\t')
                sentence.append(row)
            # read the next line
            line = corpus.readline()
        return

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of 10-value CoNLL-U tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for sentence in sentences:
                print(n, end='', file=sys.stderr)
                prev, prev2, prev3 = self.START
                context = self.START + [self._normalise(w[1]) for w in sentence] + self.END
                tags = [w[3] for w in sentence]
                for i, token in enumerate(sentence):
                    word = token[1]
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2, prev3)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev3 = prev2
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
                print('\r', end='', file=sys.stderr)
            random.shuffle(sentences)
            print()
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)), file=sys.stderr)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            print("Missing " + loc + " file.")
            sys.exit(-1)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalise(self, word):
        '''Normalisation used in pre-processing.

        - All words are lower cased
        - Any four-digit number is represented as !YEAR
        - Other digit-initial tokens are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2, prev3):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(float)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        # extended third-order features @hw
        add('i-3 tag', prev3)
        add('i tag+i-2 tag+i-3 tag', prev, prev2, prev3)
        add('i-2 tag+i-1 tag+i word', prev, prev2, context[i])
        add('i-3 tag+i-2 tag+i-1 tag+i word', prev, prev2, prev3, context[i])
        add('i-3 word', context[i - 3])
        add('i-1 suffix2', context[i - 1][-2:])
        add('i-1 suffix1', context[i - 1][-1:])
        add('i+1 suffix2', context[i + 1][-2:])
        add('i+1 suffix1', context[i + 1][-1:])
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sentence in sentences:
            for token in sentence:
                word = token[1]
                tag = token[3]
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
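# A minimal usage sketch for the CoNLL-U tagger above. The file names
# ('en-ud-train.conllu', 'en-ud-test.conllu', 'tagger.pickle') are hypothetical
# placeholders, not paths from the original project.
if __name__ == '__main__':
    tagger = PerceptronTagger('tagger.pickle', load=False)
    # train() expects sentences as lists of 10-column CoNLL-U rows
    sentences = []
    sent = []
    with open('en-ud-train.conllu') as f:
        for line in f:
            if line == '\n':
                if sent:
                    sentences.append(sent)
                sent = []
            elif line[0] != '#':
                sent.append(line.strip().split('\t'))
    if sent:
        sentences.append(sent)
    tagger.train(sentences, save_loc='tagger.pickle', nr_iter=5)
    with open('en-ud-test.conllu') as f:
        tagger.tag(f)  # prints tagged rows to stdout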
import os
import pickle
import random
from collections import defaultdict

import networkx as nx

# BaseTagger and MissingCorpusError are assumed to come from the surrounding
# (textblob-style) package; the exact pickle filename is also assumed.
PICKLE = 'trontagger.pickle'


class PerceptronTagger(BaseTagger):

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=None):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        self.graphdict = pickle.load(open("../pos_dict.pickle", "rb"))
        self.nodeperm = pickle.load(open("../pos_nodeperm_dict.pickle", "rb"))
        self.graph = nx.DiGraph()
        if load:
            self.load(load)
        # build the word graph from the edge list, mapping node ids through
        # the permutation and dictionary pickles
        with open("../gen_pos_graph.txt", "r") as pos_file:
            for line in pos_file:
                first, second = tuple(map(int, line.split()))
                if first >= len(self.nodeperm) or second >= len(self.nodeperm):
                    continue
                first_idx = self.nodeperm[first]
                first_word = self.graphdict[first_idx]
                second_idx = self.nodeperm[second]
                second_word = self.graphdict[second_idx]
                self.graph.add_edge(first_word, second_word)

    def tag(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            context = self.START + [self._normalize(w) for w in sentence] + self.END
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph2(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph2(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_graph_deg(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_graph_deg(i, word, prev, self.graph)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def tag_ngram(self, corpus):
        prev, prev2 = self.START
        tokens = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features_ngram(i, word, prev)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print("iteration:", iter_)
            for tups in sentences:
                words = [tup[0] for tup in tups]
                tags = [tup[1] for tup in tups]
                prev, prev2 = self.START
                # build the full context window for the sentence
                context = self.START + [self._normalize(w) for w in words] + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        # only predict and update for ambiguous words
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def train_ngram(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model using tag n-gram features, and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print("iteration:", iter_)
            for tups in sentences:
                if n % 1000 == 0:
                    print("n:", n)
                words = [tup[0] for tup in tups]
                tags = [tup[1] for tup in tups]
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_ngram(i, word, prev)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def train_graph(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model using in-edge features from ``self.graph`` (a digraph
        of which words lead to each other), and save it at ``save_loc``.
        ``nr_iter`` controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print("iteration:", iter_)
            for tups in sentences:
                if n % 1000 == 0:
                    print("n:", n)
                words = [tup[0] for tup in tups]
                tags = [tup[1] for tup in tups]
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def train_graph2(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model using out-edge features from ``self.graph``, and save
        it at ``save_loc``. ``nr_iter`` controls the number of Perceptron
        training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print("iteration:", iter_)
            for tups in sentences:
                if n % 1000 == 0:
                    print("n:", n)
                words = [tup[0] for tup in tups]
                tags = [tup[1] for tup in tups]
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph2(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def train_graph_deg(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model using node-degree features from ``self.graph``, and
        save it at ``save_loc``. ``nr_iter`` controls the number of Perceptron
        training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            print("iteration:", iter_)
            for tups in sentences:
                if n % 1000 == 0:
                    print("n:", n)
                words = [tup[0] for tup in tups]
                tags = [tup[1] for tup in tups]
                prev = self.START[0]
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features_graph_deg(i, word, prev, self.graph)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = "Missing trontagger.pickle file."
            raise MissingCorpusError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Any four-digit number is represented as !YEAR
        - Other digit-initial tokens are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        return features

    def _get_features_graph(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        # look up the i-1 word's parents in the graph (which is why the graph
        # must be a digraph); in_edges yields (parent, prev) pairs
        for parent, _ in graph.in_edges([prev]):
            add('i-1 tag parent', parent)
        return features

    def _get_features_graph2(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        # look up the i-1 word's children in the graph; out_edges yields
        # (prev, child) pairs
        for _, child in graph.out_edges([prev]):
            add('i-1 tag children', child)
        return features

    def _get_features_graph_deg(self, i, word, prev, graph):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        # degree() returns an int for a single node in newer networkx,
        # or a dict in older versions
        if type(graph.degree(prev)) is int:
            add('i-1 tag degree', str(graph.degree(prev)))
        else:  # is dict
            add('i-1 tag degree', str(graph.degree(prev).values()))
        return features

    def _get_features_ngram(self, i, word, prev):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i prev', prev)
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for sentence in sentences:
            for word, tag in sentence:
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
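# A hypothetical usage sketch for the graph-based tagger above. It assumes the
# ../pos_dict.pickle, ../pos_nodeperm_dict.pickle and ../gen_pos_graph.txt
# files the constructor expects actually exist; the toy sentences are
# placeholders.
tagger = PerceptronTagger()
train_sents = [[('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]]
tagger.train_graph(train_sents, save_loc='graph_tagger.pickle', nr_iter=5)
print(tagger.tag_graph([['The', 'dog', 'ran']]))  # list of (word, tag) pairs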
import logging
import os
import random
from collections import defaultdict

import numpy as np

# AveragedPerceptron, ResultsProcessor, micro_rpfa and PICKLE are assumed
# importable from the surrounding project.


class PerceptronTaggerLabelPowerset(object):
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    POSITIVE_CLASS = 1.0
    NEGATIVE_CLASS = 0.0

    def __init__(self, individual_tags, tag_history, combo_freq_threshold=1,
                 tag_plus_word=0, tag_ngram_size=0):
        self.combo_freq_threshold = combo_freq_threshold
        self.classes = set()
        self.model = None
        self.individual_tags = set(individual_tags)
        self.tag_history = tag_history
        self.tag_plus_word = tag_plus_word
        self.tag_ngram_size = tag_ngram_size

    def _add_tag_features(self, wd, feats, prev_tags):
        # previous predicted tags as individual history features
        for ix, prev in enumerate(prev_tags[-self.tag_history:]):
            offset = ix - self.tag_history
            feats["HIST_TAG " + str(offset) + " : " + str(prev)] = 1
        # previous tags conjoined with the current word
        if self.tag_plus_word > 0:
            for ix, prev in enumerate(prev_tags[-self.tag_plus_word:]):
                offset = ix - self.tag_plus_word
                feats["[HIST_TAG | wd] " + str(offset) + " : " + str(prev) + "|" + wd] = 1
        # n-gram over the most recent predicted tags
        if self.tag_ngram_size > 0:
            tag_hist = prev_tags[-self.tag_ngram_size:]
            tag_ngram = "|".join(map(str, tag_hist))
            feats["HIST_TAG_NGRAM: " + tag_ngram] = 1

    def predict(self, essay_feats, output_scores=False):
        '''Tags the essays. Outputs a dictionary mapping each class to a list
        of binary predictions (or raw scores when ``output_scores`` is set).
        '''
        class2predictions = defaultdict(list)
        for essay_ix, essay in enumerate(essay_feats):
            for sent_ix, tagged_sentence in enumerate(essay.sentences):
                # Start Sentence
                prev = list(self.START)
                for i, wd in enumerate(tagged_sentence):
                    # Don't mutate the feat dictionary
                    tagger_feats = dict(wd.features.items())
                    tagger_feats["bias"] = 1
                    # add features for the previously predicted tags
                    self._add_tag_features(wd.word, tagger_feats, prev)
                    scores_by_class = self.model.decision_function(tagger_feats)
                    guess = max(self.model.classes,
                                key=lambda label: (scores_by_class[label], label))
                    prev.append(guess)
                    if output_scores:
                        # score each individual tag by the best-scoring
                        # powerset class that contains it
                        max_score_per_class = defaultdict(float)
                        for fset_tags, score in scores_by_class.items():
                            for tag in fset_tags:
                                max_score_per_class[tag] = max(max_score_per_class[tag], score)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(max_score_per_class[cls])
                    else:
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
        np_class2predictions = dict()
        for key, lst in class2predictions.items():
            np_class2predictions[key] = np.asarray(lst)
        return np_class2predictions

    def decision_function(self, essay_feats):
        '''Tags the essays. Outputs a dictionary mapping each class to a list
        of scores.
        '''
        return self.predict(essay_feats, output_scores=True)

    def __get_tags_(self, tags):
        return frozenset((t for t in tags if t in self.individual_tags))

    def train(self, essay_feats, nr_iter=5, verbose=True, average_weights=True):
        '''Train a model from sentences. ``nr_iter`` controls the number of
        Perceptron training iterations.

        :param essay_feats: A list of essays of featurised, tagged words.
        :param nr_iter: Number of training iterations.
        :param verbose: Print learning progress.
        '''
        cp_essay_feats = list(essay_feats)  # Copy as we do an in-place shuffle below
        if self.model is None:
            # determine the label-powerset classes from sufficiently frequent
            # tag combinations
            tag_freq = defaultdict(int)
            for essay in cp_essay_feats:
                for tagged_sentence in essay.sentences:
                    for wd in tagged_sentence:
                        fs_tags = self.__get_tags_(wd.tags)
                        tag_freq[fs_tags] += 1
            self.classes = set([fs for fs, cnt in tag_freq.items()
                                if cnt >= self.combo_freq_threshold])
            self.model = AveragedPerceptron(self.classes)
        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)
            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    # Start Sentence
                    prev = list(self.START)
                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        tagger_feats = dict(wd.features.items())
                        tagger_feats["bias"] = 1
                        # add features for the previously predicted tags
                        self._add_tag_features(wd.word, tagger_feats, prev)
                        actual = self.__get_tags_(wd.tags)
                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)
                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)
            random.shuffle(cp_essay_feats)
            if verbose:
                class2metrics = ResultsProcessor.compute_metrics(class2tags, class2predictions)
                micro_metrics = micro_rpfa(class2metrics.values())
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(iter_, str(micro_metrics)))
        if average_weights:
            self.model.average_weights()
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Any four-digit number is represented as !YEAR
        - Other digit-initial tokens are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
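# The essay_feats structure is only implied by the code above: a list of
# essays, each with a .sentences attribute whose items are sequences of word
# objects carrying .word, .features and .tags. A hypothetical mock to show the
# call shape (the Word/Essay types and the 'CONCEPT' tag are placeholders):
from collections import namedtuple

Word = namedtuple('Word', ['word', 'features', 'tags'])
Essay = namedtuple('Essay', ['sentences'])

essays = [Essay(sentences=[[
    Word('photosynthesis', {'wd=photosynthesis': 1}, {'CONCEPT'}),
    Word('occurs', {'wd=occurs': 1}, set()),
]])]

lp_tagger = PerceptronTaggerLabelPowerset(individual_tags={'CONCEPT'}, tag_history=2)
lp_tagger.train(essays, nr_iter=5, verbose=False)
predictions = lp_tagger.predict(essays)  # e.g. {'CONCEPT': array([1, 0])}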
from classifier import BinaryClassifier
from perceptron import Perceptron, AveragedPerceptron
from naive_bayes import NaiveBayes
from utils import read_data, build_vocab
import utils
from config import args

if __name__ == '__main__':
    filepath = '../data/given/'
    build_vocab(filepath, vocab_size=args.vocab_size)
    train_data, test_data = read_data(filepath)

    perc_classifier = Perceptron(args)
    perc_classifier.fit(train_data)
    acc, prec, rec, f1 = perc_classifier.evaluate(test_data)
    print('Perceptron Results:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    avg_perc_classifier = AveragedPerceptron(args)
    avg_perc_classifier.fit(train_data)
    acc, prec, rec, f1 = avg_perc_classifier.evaluate(test_data)
    print('\nAveraged Perceptron Results:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    nb_classifier = NaiveBayes(args)
    nb_classifier.fit(train_data)
    acc, prec, rec, f1 = nb_classifier.evaluate(test_data)
    print('\nNaive Bayes Performance:')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))
# imports as for PerceptronTaggerLabelPowerset above


class PerceptronTaggerMultiClassCombo(object):
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    POSITIVE_CLASS = 1.0
    NEGATIVE_CLASS = 0.0

    def __init__(self, individual_tags, tag_history, combo_freq_threshold,
                 load=False, use_tag_features=True):
        self.use_tag_features = use_tag_features
        self.combo_freq_threshold = combo_freq_threshold
        self.tag_history = tag_history
        self.classes = set()
        self.individual_tags = set(individual_tags)

    def _add_tag_features(self, feats, word, prev, prev2):
        sprev, sprev2 = str(prev), str(prev2)
        feats["bias"] = 1
        # The single previous-tag features are commented out as they are
        # already included via the tag history parameter
        # feats["TAG -1 " + sprev] = 1
        feats["TAG -1 wd " + sprev + "|" + word] = 1
        # feats["TAG -2 " + sprev2] = 1
        feats["TAG -2 wd " + sprev2 + "|" + word] = 1
        feats["TAG -1, -2 " + sprev + "|" + sprev2] = 1

    def _add_secondary_tag_features(self, feats, prev_tags):
        for ix, prev in enumerate(prev_tags[-self.tag_history:]):
            offset = ix - self.tag_history
            feats["HIST_TAG " + str(offset) + " " + str(prev)] = 1

    def predict(self, essay_feats, output_scores=False):
        '''Tags the essays. Outputs a dictionary mapping each class to a list
        of binary predictions (or raw scores when ``output_scores`` is set).
        '''
        class2predictions = defaultdict(list)
        for essay_ix, essay in enumerate(essay_feats):
            for sent_ix, tagged_sentence in enumerate(essay.sentences):
                # Start Sentence
                prev = list(self.START)
                for i, wd in enumerate(tagged_sentence):
                    # Don't mutate the feat dictionary
                    shared_features = dict(wd.features.items())
                    # add features for the previously predicted tags
                    self._add_secondary_tag_features(shared_features, prev)
                    tagger_feats = dict(shared_features.items())
                    if self.use_tag_features:
                        self._add_tag_features(tagger_feats, wd.word, prev[-1], prev[-2])
                    scores_by_class = self.model.decision_function(tagger_feats)
                    guess = max(self.model.classes,
                                key=lambda label: (scores_by_class[label], label))
                    prev.append(guess)
                    if output_scores:
                        # score each individual tag by the best-scoring
                        # powerset class that contains it
                        max_score_per_class = defaultdict(float)
                        for fset_tags, score in scores_by_class.items():
                            for tag in fset_tags:
                                max_score_per_class[tag] = max(max_score_per_class[tag], score)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(max_score_per_class[cls])
                    else:
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
        np_class2predictions = dict()
        for key, lst in class2predictions.items():
            np_class2predictions[key] = np.asarray(lst)
        return np_class2predictions

    def decision_function(self, essay_feats):
        '''Tags the essays. Outputs a dictionary mapping each class to a list
        of scores.
        '''
        return self.predict(essay_feats, output_scores=True)

    def __get_tags_(self, tags):
        return frozenset((t for t in tags if t in self.individual_tags))

    def train(self, essay_feats, save_loc=None, nr_iter=5, verbose=True):
        '''Train a model from sentences. ``nr_iter`` controls the number of
        Perceptron training iterations.

        :param essay_feats: A list of essays of featurised, tagged words.
        :param save_loc: Unused; retained for interface compatibility.
        :param nr_iter: Number of training iterations.
        '''
        cp_essay_feats = list(essay_feats)  # Copy as we do an in-place shuffle below
        # determine the label-powerset classes from sufficiently frequent tag
        # combinations
        tag_freq = defaultdict(int)
        for essay in cp_essay_feats:
            for tagged_sentence in essay.sentences:
                for wd in tagged_sentence:
                    fs_tags = self.__get_tags_(wd.tags)
                    tag_freq[fs_tags] += 1
        self.classes = set([fs for fs, cnt in tag_freq.items()
                            if cnt >= self.combo_freq_threshold])
        self.model = AveragedPerceptron(self.classes)
        for iter_ in range(nr_iter):
            class2predictions = defaultdict(list)
            class2tags = defaultdict(list)
            for essay_ix, essay in enumerate(cp_essay_feats):
                for sent_ix, tagged_sentence in enumerate(essay.sentences):
                    # Start Sentence
                    prev = list(self.START)
                    for i, wd in enumerate(tagged_sentence):
                        # Don't mutate the feat dictionary
                        shared_features = dict(wd.features.items())
                        # add features for the previously predicted tags
                        self._add_secondary_tag_features(shared_features, prev)
                        tagger_feats = dict(shared_features.items())
                        # add more in-depth features for this tag
                        actual = self.__get_tags_(wd.tags)
                        if self.use_tag_features:
                            self._add_tag_features(tagger_feats, wd.word, prev[-1], prev[-2])
                        guess = self.model.predict(tagger_feats)
                        self.model.update(actual, guess, tagger_feats)
                        prev.append(guess)
                        for cls in self.individual_tags:
                            class2predictions[cls].append(1 if cls in guess else 0)
                            class2tags[cls].append(1 if cls in actual else 0)
            random.shuffle(cp_essay_feats)
            class2metrics = ResultsProcessor.compute_metrics(class2tags, class2predictions)
            micro_metrics = micro_rpfa(class2metrics.values())
            if verbose:
                logging.info("Iter {0}: Micro Avg Metrics: {1}".format(iter_, str(micro_metrics)))
        self.model.average_weights()
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Any four-digit number is represented as !YEAR
        - Other digit-initial tokens are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
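# Usage mirrors the label-powerset tagger above; reusing the hypothetical
# Word/Essay mock and `essays` list from that example:
combo_tagger = PerceptronTaggerMultiClassCombo(
    individual_tags={'CONCEPT'}, tag_history=2, combo_freq_threshold=1)
combo_tagger.train(essays, nr_iter=5, verbose=False)
print(combo_tagger.predict(essays))  # e.g. {'CONCEPT': array([1, 0])}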