def cluster(self, tokens, assign_clusters=False, trace=False):
    """
    Cluster the C{FEATURES} vectors of the given tokens.

    The vectors are optionally normalised and optionally reduced in
    dimensionality with SVD before being handed to
    C{cluster_vectorspace}.  When SVD is applied, the transposed
    projection matrix is stored in C{self._Tt}.

    @param tokens: the tokens to cluster; must be non-empty
    @type tokens: C{list} of L{Token}
    @param assign_clusters: if true, every token is passed to
        C{classify} once clustering has finished
    @type assign_clusters: C{bool}
    @param trace: forwarded unchanged to C{cluster_vectorspace}
    @type trace: C{bool}
    """
    assert chktype(1, tokens, [Token])
    assert chktype(2, assign_clusters, bool)
    assert chktype(3, trace, bool)
    assert len(tokens) > 0

    feature_vectors = [tok['FEATURES'] for tok in tokens]

    # Optional normalisation of every vector.
    if self._should_normalise:
        feature_vectors = [self._normalise(vec) for vec in feature_vectors]

    # Optional dimensionality reduction; only done when it would actually
    # shrink the vectors.
    dims = self._svd_dimensions
    if dims and dims < len(feature_vectors[0]):
        matrix = Numeric.transpose(Numeric.array(feature_vectors))
        [u, d, vt] = LinearAlgebra.singular_value_decomposition(matrix)
        S = d[:dims] * Numeric.identity(dims, Numeric.Float64)
        T = u[:, :dims]
        Dt = vt[:dims, :]
        feature_vectors = Numeric.transpose(Numeric.matrixmultiply(S, Dt))
        self._Tt = Numeric.transpose(T)

    # Delegate the actual clustering to the subclass hook.
    self.cluster_vectorspace(feature_vectors, trace)

    if assign_clusters:
        for tok in tokens:
            self.classify(tok)
def cluster(self, tokens, assign_clusters=False, trace=False):
    """
    Cluster the C{FEATURES} vectors of the given tokens.

    Each vector is normalised if C{self._should_normalise} is set, and
    the whole set is reduced with SVD if C{self._svd_dimensions} is set
    and smaller than the vector length.  The resulting vectors are
    passed to C{cluster_vectorspace}.

    @param tokens: the tokens to cluster; must be non-empty
    @type tokens: C{list} of L{Token}
    @param assign_clusters: if true, call C{classify} on each token
        after clustering
    @type assign_clusters: C{bool}
    @param trace: forwarded unchanged to C{cluster_vectorspace}
    @type trace: C{bool}
    """
    assert chktype(1, tokens, [Token])
    assert chktype(2, assign_clusters, bool)
    assert chktype(3, trace, bool)
    assert len(tokens) > 0

    data = [item['FEATURES'] for item in tokens]

    if self._should_normalise:
        data = [self._normalise(row) for row in data]

    k = self._svd_dimensions
    if k and k < len(data[0]):
        # Decompose the (features x tokens) matrix and keep the first k
        # singular values / vectors.
        [u, d, vt] = LinearAlgebra.singular_value_decomposition(
            Numeric.transpose(Numeric.array(data)))
        S = d[:k] * Numeric.identity(k, Numeric.Float64)
        T = u[:, :k]
        Dt = vt[:k, :]
        data = Numeric.transpose(Numeric.matrixmultiply(S, Dt))
        # Stored so later vectors can be projected the same way.
        self._Tt = Numeric.transpose(T)

    self.cluster_vectorspace(data, trace)

    if assign_clusters:
        for item in tokens:
            self.classify(item)
def pp(self, margin=70, indent=0, nodesep=':', parens='()'):
    """
    @return: A pretty-printed string representation of this tree.
    @rtype: C{string}
    @param margin: The right margin at which to do line-wrapping.
    @type margin: C{int}
    @param indent: The indentation level at which printing begins.
        Children are placed two columns further in.
    @type indent: C{int}
    @param nodesep: A string placed between the node and its children.
        E.g., the default value C{':'} gives trees like
        C{(S: (NP: I) (VP: (V: saw) (NP: it)))}.
    @param parens: A two-element sequence giving the opening and the
        closing bracket.
    """
    assert chktype(1, margin, types.IntType)
    assert chktype(2, indent, types.IntType)

    # Prefer the single-line rendering whenever it fits.
    flat = self._ppflat(nodesep, parens)
    if len(flat) + indent < margin:
        return flat

    # Otherwise put every child on its own, further-indented line.
    child_indent = indent + 2
    line_start = '\n' + ' ' * child_indent
    pieces = ['%s%s%s' % (parens[0], self.node, nodesep)]
    for child in self:
        if isinstance(child, Tree):
            rendered = child.pp(margin, child_indent, nodesep, parens)
        else:
            rendered = repr(child)
        pieces.append(line_start + rendered)
    pieces.append(parens[1])
    return ''.join(pieces)
def setdefault(self, property, default=None):
    """
    Type-checked version of C{setdefault}.

    @param property: the property name
    @type property: C{str}
    @param default: the value to use if the property is unset; checked
        with C{self._checkval}
    @raise TypeError: if C{property} is C{'LOC'} and C{default} is
        neither C{None} nor a C{LocationI}
    @return: whatever the superclass C{setdefault} returns
    """
    assert chktype(1, property, str)
    assert chktype(2, default, self._checkval)
    if property == 'LOC':
        # A location slot may only hold a location (or nothing).
        if default is not None and not isinstance(default, LocationI):
            raise TypeError("The 'LOC' property must contain a Location")
    return super(SafeToken, self).setdefault(property, default)
def __setitem__(self, property, value):
    """
    Type-checked version of item assignment.

    @param property: the property name
    @type property: C{str}
    @param value: the new value; checked with C{self._checkval}
    @raise TypeError: if C{property} is C{'LOC'} and C{value} is
        neither C{None} nor a C{LocationI}
    @return: whatever the superclass C{__setitem__} returns
    """
    assert chktype(1, property, str)
    assert chktype(2, value, self._checkval)
    if property == 'LOC':
        # A location slot may only hold a location (or nothing).
        if value is not None and not isinstance(value, LocationI):
            raise TypeError("The 'LOC' property must contain a Location")
    return super(SafeToken, self).__setitem__(property, value)
def __init__(self, states=None, symbols=None, **properties):
    """
    Creates an HMM trainer to induce an HMM with the given states and
    output symbol alphabet. A supervised and unsupervised training
    method may be used. If either of the states or symbols are not
    given, these may be derived from supervised training.

    @param states: the set of state labels
    @type states: sequence of any
    @param symbols: the set of observation symbols
    @type symbols: sequence of any
    @param properties: alternative names to be used for the TEXT,
        SUBTOKENS and TAG properties
    """
    # The positions passed to chktype follow the signature order
    # (states is argument 1, symbols is argument 2).
    assert chktype(1, states, types.TupleType, types.ListType,
                   types.NoneType)
    assert chktype(2, symbols, types.TupleType, types.ListType,
                   types.NoneType)

    # A missing or empty argument becomes a fresh empty list.
    self._states = states or []
    self._symbols = symbols or []
    self._properties = properties
def __init__(self, symbols, states, transitions, outputs, priors,
             **properties):
    """
    Creates a hidden markov model parametrised by the states,
    transition probabilities, output probabilities and priors.

    @param symbols: the set of output symbols (alphabet)
    @type symbols: (seq) of any
    @param states: a set of states representing state space
    @type states: seq of any
    @param transitions: transition probabilities; Pr(s_i | s_j) is the
        probability of transition from state i given the model is in
        state_j
    @type transitions: C{ConditionalProbDistI}
    @param outputs: output probabilities; Pr(o_k | s_i) is the
        probability of emitting symbol k when entering state i
    @type outputs: C{ConditionalProbDistI}
    @param priors: initial state distribution; Pr(s_i) is the
        probability of starting in state i
    @type priors: C{ProbDistI}
    @param properties: property names: TAG, TEXT, and SUBTOKENS are
        used and may be overridden
    @type properties: C{dict}
    """
    sequence_types = (types.TupleType, types.ListType)
    assert chktype(1, symbols, *sequence_types)
    assert chktype(2, states, *sequence_types)
    assert chktype(3, transitions, ConditionalProbDistI)
    # Argument 4 (outputs) is not type-checked.
    assert chktype(5, priors, ProbDistI)

    self._symbols = symbols
    self._states = states
    self._transitions = transitions
    self._outputs = outputs
    self._priors = priors
    self._properties = properties
def pp(self, margin=70, indent=0, nodesep=':', parens='()'):
    """
    @return: A pretty-printed string representation of this tree.
    @rtype: C{string}
    @param margin: The right margin at which to do line-wrapping.
    @type margin: C{int}
    @param indent: The indentation level at which printing begins.
        Each nested level is indented two further columns.
    @type indent: C{int}
    @param nodesep: A string that separates the node from its
        children.  E.g., the default value C{':'} gives trees like
        C{(S: (NP: I) (VP: (V: saw) (NP: it)))}.
    @param parens: A two-element sequence: opening bracket, then
        closing bracket.
    """
    assert chktype(1, margin, types.IntType)
    assert chktype(2, indent, types.IntType)

    one_line = self._ppflat(nodesep, parens)
    if len(one_line) + indent < margin:
        # It fits: no wrapping needed.
        return one_line

    nested = indent + 2
    padding = '\n' + ' ' * nested
    out = ['%s%s%s' % (parens[0], self.node, nodesep)]
    for child in self:
        if isinstance(child, Tree):
            out.append(padding + child.pp(margin, nested, nodesep, parens))
        else:
            out.append(padding + repr(child))
    out.append(parens[1])
    return ''.join(out)
def tagger_accuracy(tagger, gold_standard):
    """
    Score the accuracy of the tagger against the gold standard.
    A copy of each gold document without its tag property is retagged
    by the tagger, and the resulting subtokens are compared against the
    gold subtokens with C{accuracy}.

    @type tagger: C{Tagger}
    @param tagger: The tagger being evaluated.
    @type gold_standard: C{list} of C{Token}
    @param gold_standard: The list of tagged tokens to score the
        tagger on; each must have the 'SUBTOKENS' attribute.
    @rtype: C{float}
    """
    assert chktype(1, tagger, TaggerI)
    assert chktype(2, gold_standard, (Token,), [Token])

    tag_name = tagger.property('TAG')
    subtokens_name = tagger.property('SUBTOKENS')

    reference = []
    hypothesis = []
    for gold_doc in gold_standard:
        # Retag a tag-less version of the document.
        retagged = gold_doc.exclude(tag_name)
        tagger.tag(retagged)
        reference.extend(gold_doc[subtokens_name])
        hypothesis.extend(retagged[subtokens_name])

    return accuracy(reference, hypothesis)
def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
    """
    @param num_clusters: the number of clusters; stored as
        C{self._num_clusters}
    @type num_clusters: int
    @param normalise: passed on to C{VectorSpaceClusterer.__init__}
    @type normalise: boolean
    @param svd_dimensions: passed on to
        C{VectorSpaceClusterer.__init__}
    @type svd_dimensions: int or None
    """
    assert chktype(1, num_clusters, int)
    assert chktype(2, normalise, bool)
    assert chktype(3, svd_dimensions, int, types.NoneType)

    VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

    # Nothing has been clustered yet.
    self._dendogram = None
    self._groups_values = None
    self._num_clusters = num_clusters
def __init__(self, normalise=False, svd_dimensions=None):
    """
    @param normalise: should vectors be normalised to length 1
    @type normalise: boolean
    @param svd_dimensions: number of dimensions to use in reducing
        vector dimensionality with SVD
    @type svd_dimensions: int
    """
    assert chktype(1, normalise, bool)
    assert chktype(2, svd_dimensions, int, types.NoneType)

    self._should_normalise = normalise
    self._svd_dimensions = svd_dimensions
    # No SVD projection matrix is available at construction time.
    self._Tt = None
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol
    and state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency
    distributions are then normalised into probability estimates,
    which can be smoothed if desired.

    Any state or symbol seen in the data that is not yet in
    C{self._states} / C{self._symbols} is appended to that list.

    @return: the trained model
    @rtype: HiddenMarkovModel
    @param labelled_sequences: the training data; its SUBTOKENS are
        the sequences, and each sequence's SUBTOKENS carry the TAG
        (state) and TEXT (symbol) properties
    @type labelled_sequences: Token
    @param kwargs: may include an 'estimator' parameter, a function
        taking a C{FreqDist} and a number of bins and returning a
        C{ProbDistI}; otherwise a MLE estimate is used
    """
    assert chktype(1, labelled_sequences, Token)

    # grab the property names used
    TEXT = self._properties.get("TEXT", "TEXT")
    TAG = self._properties.get("TAG", "TAG")
    SUBTOKENS = self._properties.get("SUBTOKENS", "SUBTOKENS")

    # default to the MLE estimate
    estimator = kwargs.get("estimator")
    if estimator is None:
        def estimator(fdist, bins):
            return MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each
    # state and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for super_token in labelled_sequences[SUBTOKENS]:
        lasts = None
        for token in super_token[SUBTOKENS]:
            state = token[TAG]
            symbol = token[TEXT]
            # Identity test: a state object with a custom __eq__ must
            # not be mistaken for "no previous state".
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in self._states:
                self._states.append(state)
            if symbol not in self._symbols:
                self._symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, False, N)
    B = ConditionalProbDist(outputs, estimator, False,
                            len(self._symbols))

    return HiddenMarkovModel(self._symbols, self._states, A, B, pi,
                             **self._properties)
def tokenize(self, token, add_locs=False, add_contexts=False):
    """
    Split the token's TEXT with C{self._regexp} and store the
    resulting subtokens in its SUBTOKENS property.

    @param token: the token to tokenize
    @type token: L{Token}
    @param add_locs: if false, the work is delegated to
        C{self._tokenize_from_raw}; if true, each subtoken gets a
        C{CharSpanLocation}
    @param add_contexts: only forwarded to C{_tokenize_from_raw}; it
        is not used when C{add_locs} is true
    """
    assert chktype(1, token, Token)
    TEXT = self.property("TEXT")
    LOC = self.property("LOC")
    SUBTOKENS = self.property("SUBTOKENS")

    # Without locations there is nothing extra to compute here.
    if not add_locs:
        self._tokenize_from_raw(token, add_locs, add_contexts)
        return

    # The split yields alternating pieces; which parity is kept depends
    # on self._negative (even-indexed pieces when it is true,
    # odd-indexed pieces when it is false).
    pieces = self._regexp.split(token[TEXT])

    # Start from the input token's own location, if it has one.
    if token.has(LOC):
        source = token[LOC].source()
        offset = token[LOC].start()
    else:
        source, offset = None, 0

    located = []
    for index, piece in enumerate(pieces):
        at_even_index = (index % 2 == 0)
        if at_even_index == self._negative and piece != "":
            span = CharSpanLocation(offset, offset + len(piece), source)
            located.append(Token({TEXT: piece, LOC: span}))
        # Every piece, kept or not, advances the character offset.
        offset += len(piece)

    token[SUBTOKENS] = located
def __init__(self, n, reverse=False, cutoff=0, **property_names):
    """
    Construct a new I{n}-th order stochastic tagger.  The new tagger
    should be trained, using the L{train()} method, before it is used
    to tag data.

    @param n: The order of the new C{NthOrderTagger}.
    @type n: int
    @param reverse: If true, then assign tags to subtokens in reverse
        sequential order (i.e., from last to first).
    @type cutoff: C{int}
    @param cutoff: A count-cutoff for the tagger's frequency
        distribution.  If the tagger saw fewer than C{cutoff} examples
        of a given context in training, then it will return a tag of
        C{None} for that context.
    @type property_names: C{dict}
    @param property_names: A dictionary that can be used to override
        the default property names.  Each entry maps from a default
        property name to a new property name.
    @raise ValueError: If C{n} is negative.
    """
    assert chktype(1, n, types.IntType)
    if n < 0:
        raise ValueError('n must be non-negative')

    SequentialTagger.__init__(self, reverse, **property_names)

    self._n = n
    self._cutoff = cutoff
    self._freqdist = ConditionalFreqDist()

    # Offsets (relative to the current index) of the slice of tags used
    # as context: the n following tags when reversed, otherwise the n
    # preceding tags.
    if self._reverse:
        self._left, self._right = 1, 1 + n
    else:
        self._left, self._right = -n, 0
def __init__(self, states=None, symbols=None, **properties):
    """
    Creates an HMM trainer to induce an HMM with the given states and
    output symbol alphabet. Only a supervised training method may be
    used.

    @param states: the state labels, or C{None}
    @type states: C{tuple}, C{list} or C{None}
    @param symbols: the observation symbols, or C{None}
    @type symbols: C{tuple}, C{list} or C{None}
    @param properties: stored unchanged as C{self._properties}
    """
    # The positions passed to chktype follow the signature order
    # (states is argument 1, symbols is argument 2).
    assert chktype(1, states, types.TupleType, types.ListType,
                   types.NoneType)
    assert chktype(2, symbols, types.TupleType, types.ListType,
                   types.NoneType)

    # A missing or empty argument becomes a fresh empty list.
    self._states = states or []
    self._symbols = symbols or []
    self._properties = properties
def raw_tokenize(self, text):
    """
    Tokenize a plain string by wrapping it in a L{Token}, running
    C{tokenize} on it, and collecting the text of every subtoken.

    @param text: the string to tokenize
    @type text: C{str}
    @return: the TEXT value of each subtoken, in order
    @rtype: C{list}
    """
    assert chktype(1, text, str)
    TEXT = self.property("TEXT")
    SUBTOKENS = self.property("SUBTOKENS")

    wrapper = Token({TEXT: text})
    self.tokenize(wrapper)

    texts = []
    for piece in wrapper[SUBTOKENS]:
        texts.append(piece[TEXT])
    return texts
def _centroid(self, cluster):
    """
    Return the mean of the vectors in a cluster.

    @param cluster: a non-empty list of vectors
    @type cluster: C{list}
    @return: the sum of the vectors divided by their count
    """
    assert chktype(1, cluster, [])
    assert len(cluster) > 0

    # Accumulate into a copy so the first vector is left untouched.
    total = copy.copy(cluster[0])
    for index in range(1, len(cluster)):
        total += cluster[index]
    return total / float(len(cluster))
def tokenize(self, token, add_locs=False, add_contexts=False):
    """
    Split the token's TEXT with C{self._regexp} and write the
    resulting subtokens to its SUBTOKENS property.

    @param token: the token to tokenize
    @type token: L{Token}
    @param add_locs: if true, attach a C{CharSpanLocation} to every
        subtoken; if false, delegate to C{self._tokenize_from_raw}
    @param add_contexts: only forwarded to C{_tokenize_from_raw}; it
        is not used when C{add_locs} is true
    """
    assert chktype(1, token, Token)
    TEXT = self.property('TEXT')
    LOC = self.property('LOC')
    SUBTOKENS = self.property('SUBTOKENS')

    if not add_locs:
        # No locations wanted: the raw path does everything.
        self._tokenize_from_raw(token, add_locs, add_contexts)
        return

    # The split yields alternating pieces; self._negative selects
    # whether even- or odd-indexed pieces become subtokens.
    words = self._regexp.split(token[TEXT])

    source = None
    pos = 0
    if token.has(LOC):
        # Offsets are relative to the input token's own location.
        source = token[LOC].source()
        pos = token[LOC].start()

    subtoks = []
    for i, word in enumerate(words):
        keep = ((i % 2 == 0) == self._negative) and word != ''
        if keep:
            where = CharSpanLocation(pos, pos + len(word), source)
            subtoks.append(Token({TEXT: word, LOC: where}))
        # Skipped pieces still consume characters.
        pos += len(word)

    token[SUBTOKENS] = subtoks
def read_token(self, s, add_contexts=False, add_locs=False, source=None):
    """
    Read a multi-sentence string into a single token whose SENTS
    property holds one token per sentence.

    Sentences are the non-overlapping, shortest spans that start at a
    non-whitespace character and end with C{/.}; each one is read with
    C{self._sent_reader.read_token}.

    @param s: the string to read
    @type s: C{str}
    @param add_contexts: forwarded to the sentence reader; if true,
        each sentence token also gets a C{SubtokenContextPointer} in
        its CONTEXT property
    @param add_locs: forwarded to the sentence reader
    @param source: the source used for each sentence's
        C{SentIndexLocation}
    @return: the token containing the sentence tokens
    @rtype: L{Token}
    """
    assert chktype(1, s, str)
    CONTEXT = self.property('CONTEXT')
    SENTS = self.property('SENTS')

    # Raw string: the pattern contains backslash escapes meant for the
    # regexp engine, not for the string literal.
    sentences = re.findall(r'(?s)\S.*?/\.', s)

    sent_toks = []
    for sent_num, sentence in enumerate(sentences):
        sent_loc = SentIndexLocation(sent_num, source)
        sent_tok = self._sent_reader.read_token(
            sentence, add_contexts=add_contexts,
            add_locs=add_locs, source=sent_loc)
        sent_toks.append(sent_tok)
    tok = Token(**{SENTS: sent_toks})

    # Add context pointers, if requested
    if add_contexts:
        for i, sent_tok in enumerate(tok[SENTS]):
            sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

    # Return the finished token.
    return tok
def probability(self, sequence):
    """
    Returns the probability of the given symbol sequence. If the
    sequence is labelled, then returns the joint probability of the
    symbol, state sequence. Otherwise, uses the forward algorithm to
    find the probability over all label sequences.

    A sequence counts as labelled when it is non-empty and its first
    subtoken has the TAG property.

    @return: the probability of the sequence
    @rtype: float
    @param sequence: the sequence of symbols which must contain the
        TEXT property, and optionally the TAG property
    @type sequence: Token
    """
    assert chktype(1, sequence, Token)
    SUBTOKENS = self._properties.get("SUBTOKENS", "SUBTOKENS")
    TEXT = self._properties.get("TEXT", "TEXT")
    TAG = self._properties.get("TAG", "TAG")

    symbols = sequence[SUBTOKENS]
    T = len(symbols)

    if T > 0 and symbols[0].has(TAG):
        # Labelled: prior and emission for the first symbol, then one
        # transition and one emission per following symbol.
        last_state = symbols[0][TAG]
        p = (self._priors.logprob(last_state) +
             self._outputs[last_state].logprob(symbols[0][TEXT]))
        for t in range(1, T):
            state = symbols[t][TAG]
            p += (self._transitions[last_state].logprob(state) +
                  self._outputs[state].logprob(symbols[t][TEXT]))
            # Advance, so the next transition is taken from this state
            # and not from the first state of the sequence.
            last_state = state
        return exp(p)
    else:
        # Unlabelled: combine the final row of forward values.
        alpha = self._forward_probability(sequence)
        p = _log_add(*alpha[T - 1, :])
        return exp(p)
def train(self, tagged_token):
    """
    Train this C{NthOrderTagger} using the given training data.  If
    this method is called multiple times, then the training data will
    be combined.

    @param tagged_token: A tagged corpus.  Each subtoken in
        C{tagged_token} should define the C{text} and C{tag}
        properties.
    @type tagged_token: L{Token}
    """
    assert chktype(1, tagged_token, Token)
    SUBTOKENS = self.property('SUBTOKENS')
    TEXT = self.property('TEXT')
    TAG = self.property('TAG')
    lo = self._left
    hi = self._right

    # All tags up front, so context windows are plain slices.
    subtokens = tagged_token[SUBTOKENS]
    tags = tuple([tok[TAG] for tok in subtokens])

    for index, subtok in enumerate(subtokens):
        start = index + lo
        # Positions whose window would begin before the sequence are
        # not recorded.
        if start < 0:
            continue

        # Context = (neighbouring tags, this subtoken's text).
        context = (tags[start:index + hi], subtok[TEXT])
        self._freqdist[context].inc(subtok[TAG])
def read_token(self, s, add_contexts=False, add_locs=False, source=None):
    """
    Read a multi-sentence string into a single token whose SENTS
    property holds one token per sentence.

    Sentences are the non-overlapping, shortest spans that start at a
    non-whitespace character and end with C{/.}; each one is read with
    C{self._sent_reader.read_token}.

    @param s: the string to read
    @type s: C{str}
    @param add_contexts: forwarded to the sentence reader; if true,
        each sentence token also gets a C{SubtokenContextPointer} in
        its CONTEXT property
    @param add_locs: forwarded to the sentence reader
    @param source: the source used for each sentence's
        C{SentIndexLocation}
    @return: the token containing the sentence tokens
    @rtype: L{Token}
    """
    assert chktype(1, s, str)
    CONTEXT = self.property('CONTEXT')
    SENTS = self.property('SENTS')

    # Raw string: the pattern contains backslash escapes meant for the
    # regexp engine, not for the string literal.
    sentences = re.findall(r'(?s)\S.*?/\.', s)

    sent_toks = []
    for sent_num, sentence in enumerate(sentences):
        sent_loc = SentIndexLocation(sent_num, source)
        sent_tok = self._sent_reader.read_token(
            sentence, add_contexts=add_contexts,
            add_locs=add_locs, source=sent_loc)
        sent_toks.append(sent_tok)
    tok = Token(**{SENTS: sent_toks})

    # Add context pointers, if requested
    if add_contexts:
        for i, sent_tok in enumerate(tok[SENTS]):
            sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

    # Return the finished token.
    return tok
def best_path(self, unlabelled_sequence):
    """
    Returns the state sequence of the optimal (most probable) path
    through the HMM. Uses the Viterbi algorithm to calculate this part
    by dynamic programming.

    @return: the state sequence
    @rtype: sequence of any
    @param unlabelled_sequence: the sequence of unlabelled symbols
    @type unlabelled_sequence: Token
    """
    assert chktype(1, unlabelled_sequence, Token)
    # This method uses the literal property names.
    SUBTOKENS = "SUBTOKENS"
    TEXT = "TEXT"

    observations = unlabelled_sequence[SUBTOKENS]
    num_steps = len(observations)
    states = self._states
    num_states = len(states)
    scores = zeros((num_steps, num_states), Float64)
    backptr = {}

    # Initial scores: prior plus emission of the first symbol.
    first_symbol = observations[0][TEXT]
    for col, state in enumerate(states):
        scores[0, col] = (self._priors.logprob(state) +
                          self._output_logprob(state, first_symbol))
        backptr[0, state] = None

    # For each later step, keep the best predecessor of every state.
    for t in range(1, num_steps):
        symbol = observations[t][TEXT]
        for j, target in enumerate(states):
            best = None
            for i, origin in enumerate(states):
                candidate = (scores[t - 1, i] +
                             self._transitions[origin].logprob(target))
                if not best or candidate > best[0]:
                    best = (candidate, origin)
            scores[t, j] = best[0] + self._output_logprob(target, symbol)
            backptr[t, target] = best[1]

    # Pick the best-scoring state at the last step.
    best = None
    for i, state in enumerate(states):
        final_score = scores[num_steps - 1, i]
        if not best or final_score > best[0]:
            best = (final_score, state)

    # Follow the back-pointers from the end, then flip the result.
    current = best[1]
    path = [current]
    for t in range(num_steps - 1, 0, -1):
        current = backptr[t, current]
        path.append(current)
    path.reverse()
    return path
def raw_tokenize(self, text):
    """
    Tokenize a plain string: wrap it in a L{Token}, run C{tokenize}
    on that token, and return the text of each resulting subtoken.

    @param text: the string to tokenize
    @type text: C{str}
    @return: the TEXT value of each subtoken, in order
    @rtype: C{list}
    """
    assert chktype(1, text, str)
    TEXT = self.property('TEXT')
    SUBTOKENS = self.property('SUBTOKENS')

    holder = Token({TEXT: text})
    self.tokenize(holder)

    result = []
    for sub in holder[SUBTOKENS]:
        result.append(sub[TEXT])
    return result
def __init__(self, num_means, distance, repeats=1, conv_test=1e-6,
             initial_means=None, normalise=False, svd_dimensions=None,
             rng=None):
    """
    @param num_means: the number of means to use (may use fewer)
    @type num_means: int
    @param distance: measure of distance between two vectors
    @type distance: function taking two vectors and returning a float
    @param repeats: number of randomised clustering trials to use
    @type repeats: int
    @param conv_test: maximum variation in mean differences before
        deemed convergent
    @type conv_test: number
    @param initial_means: set of k initial means
    @type initial_means: sequence of vectors
    @param normalise: should vectors be normalised to length 1
    @type normalise: boolean
    @param svd_dimensions: number of dimensions to use in reducing
        vector dimensionality with SVD
    @type svd_dimensions: int
    @param rng: random number generator (or None)
    @type rng: Random
    """
    # Arguments 2 (distance) and 5 (initial_means) are not type-checked.
    assert chktype(1, num_means, int)
    assert chktype(3, repeats, int)
    assert chktype(4, conv_test, int, float)
    assert chktype(6, normalise, bool)
    assert chktype(7, svd_dimensions, int, types.NoneType)

    VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

    self._num_means = num_means
    self._distance = distance
    self._max_difference = conv_test

    # Explicit starting means must match the requested count ...
    assert not initial_means or len(initial_means) == num_means
    self._means = initial_means

    # ... and cannot be combined with repeated trials.
    assert repeats >= 1
    assert not (initial_means and repeats > 1)
    self._repeats = repeats

    self._rng = rng or random.Random()
def raw_xtokenize(self, text):
    """
    Generator version of raw tokenization: wrap the string in a
    L{Token}, run C{xtokenize} on it, and yield the text of each
    subtoken in turn.

    @param text: the string to tokenize
    @type text: C{str}
    @return: an iterator over the TEXT value of each subtoken
    """
    assert chktype(1, text, str)
    TEXT = self.property('TEXT')
    SUBTOKENS = self.property('SUBTOKENS')

    wrapper = Token({TEXT: text})
    self.xtokenize(wrapper)

    pieces = wrapper[SUBTOKENS]
    for piece in pieces:
        yield piece[TEXT]
def raw_xtokenize(self, text):
    """
    Lazily tokenize a plain string: the string is wrapped in a
    L{Token}, C{xtokenize} is run on that token, and the text of each
    subtoken is yielded one at a time.

    @param text: the string to tokenize
    @type text: C{str}
    @return: an iterator over the TEXT value of each subtoken
    """
    assert chktype(1, text, str)
    TEXT = self.property("TEXT")
    SUBTOKENS = self.property("SUBTOKENS")

    holder = Token({TEXT: text})
    self.xtokenize(holder)

    subtokens = holder[SUBTOKENS]
    for sub in subtokens:
        yield sub[TEXT]
def __init__(self, states=None, symbols=None, **properties):
    """
    Creates an HMM trainer to induce an HMM with the given states and
    output symbol alphabet. Only a supervised training method may be
    used.

    @param states: the state labels, or C{None}
    @type states: C{tuple}, C{list} or C{None}
    @param symbols: the observation symbols, or C{None}
    @type symbols: C{tuple}, C{list} or C{None}
    @param properties: stored unchanged as C{self._properties}
    """
    # The positions passed to chktype follow the signature order
    # (states is argument 1, symbols is argument 2).
    assert chktype(1, states, types.TupleType, types.ListType,
                   types.NoneType)
    assert chktype(2, symbols, types.TupleType, types.ListType,
                   types.NoneType)

    # A missing or empty argument becomes a fresh empty list.
    self._states = states or []
    self._symbols = symbols or []
    self._properties = properties
def likelihood(self, labelled_token):
    """
    Return the likelihood of the token's vector belonging to the
    cluster named in its C{CLUSTER} property.

    The C{FEATURES} vector is normalised and/or projected with the
    stored SVD matrix, according to this clusterer's settings, and then
    passed to C{likelihood_vectorspace}.

    @param labelled_token: a token with C{FEATURES} and C{CLUSTER}
        properties
    @type labelled_token: L{Token}
    @return: the result of C{likelihood_vectorspace}
    """
    assert chktype(1, labelled_token, Token)
    vector = labelled_token['FEATURES']
    if self._should_normalise:
        vector = self._normalise(vector)
    # Identity test: self._Tt is either None or a matrix, and comparing
    # a matrix with != is not a reliable "is it set?" check.
    if self._Tt is not None:
        vector = Numeric.matrixmultiply(self._Tt, vector)
    return self.likelihood_vectorspace(vector, labelled_token['CLUSTER'])
def __init__(self, items=[]):
    """
    @param items: the items at the leaves of the dendogram
    @type items: sequence of (any)
    """
    # The default list is only read here, never modified.
    assert chktype(1, items, [])

    leaves = []
    for item in items:
        leaves.append(_DendogramNode(item))

    self._items = leaves
    # Shallow copy, taken before any further changes to self._items.
    self._original_items = copy.copy(leaves)
    self._merge = 1
def xtokenize(self, token, add_locs=False, add_contexts=False):
    """
    Tokenize the token with C{tokenize}, then replace its SUBTOKENS
    value with an iterator over that value.

    If the token's TEXT is itself an iterator (it has both C{__iter__}
    and C{next}), it is first joined into a single string.

    @param token: the token to tokenize
    @type token: L{Token}
    @param add_locs: forwarded to C{tokenize}
    @param add_contexts: forwarded to C{tokenize}
    """
    assert chktype(1, token, Token)
    TEXT = self.property("TEXT")
    SUBTOKENS = self.property("SUBTOKENS")

    content = token[TEXT]
    is_iterator = hasattr(content, "__iter__") and hasattr(content, "next")
    if is_iterator:
        token[TEXT] = "".join(content)

    self.tokenize(token, add_locs, add_contexts)
    token[SUBTOKENS] = iter(token[SUBTOKENS])
def xtokenize(self, token, add_locs=False, add_contexts=False):
    """
    Run C{tokenize} on the token and then turn its SUBTOKENS value
    into an iterator.

    A TEXT value that is an iterator (has both C{__iter__} and
    C{next}) is joined into one string before tokenizing.

    @param token: the token to tokenize
    @type token: L{Token}
    @param add_locs: forwarded to C{tokenize}
    @param add_contexts: forwarded to C{tokenize}
    """
    assert chktype(1, token, Token)
    TEXT = self.property('TEXT')
    SUBTOKENS = self.property('SUBTOKENS')

    raw = token[TEXT]
    if hasattr(raw, '__iter__'):
        if hasattr(raw, 'next'):
            # Materialise the stream so it can be tokenized as a whole.
            token[TEXT] = ''.join(raw)

    self.tokenize(token, add_locs, add_contexts)
    token[SUBTOKENS] = iter(token[SUBTOKENS])
def __init__(self, reference, test):
    """
    Construct a new confusion matrix from a list of reference
    values and a corresponding list of test values.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the
        corresponding reference values.
    @raise ValueError: If C{reference} and C{test} do not have
        the same length.
    """
    assert chktype(1, reference, [])
    assert chktype(2, test, [])
    if len(reference) != len(test):
        raise ValueError('Lists must have the same length.')

    # Get a list of all distinct values (explicitly a list, so it can
    # be indexed and measured).
    values = list(dict([(val, 1) for val in reference + test]).keys())

    # Construct a value->index dictionary
    indices = dict([(val, i) for (i, val) in enumerate(values)])

    # Make a confusion matrix table: rows are indexed by reference
    # value, columns by test value.
    confusion = [[0 for val in values] for val in values]
    max_conf = 0  # Maximum confusion
    for w, g in zip(reference, test):
        row = confusion[indices[w]]
        col = indices[g]
        row[col] += 1
        max_conf = max(max_conf, row[col])

    #: A list of all values in C{reference} or C{test}.
    self._values = values
    #: A dictionary mapping values in L{self._values} to their indices.
    self._indices = indices
    #: The confusion matrix itself (as a list of lists of counts).
    self._confusion = confusion
    #: The greatest count in L{self._confusion} (used for printing).
    self._max_conf = max_conf
    #: The total number of values in the confusion matrix.
    self._total = len(reference)
    #: The number of correct (on-diagonal) values in the matrix.
    self._correct = sum([confusion[i][i] for i in range(len(values))])
def __init__(self, reference, test):
    """
    Construct a new confusion matrix from a list of reference
    values and a corresponding list of test values.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the
        corresponding reference values.
    @raise ValueError: If C{reference} and C{test} do not have
        the same length.
    """
    assert chktype(1, reference, [])
    assert chktype(2, test, [])
    if len(reference) != len(test):
        raise ValueError('Lists must have the same length.')

    # Get a list of all distinct values (explicitly a list, so it can
    # be indexed and measured).
    values = list(dict([(val, 1) for val in reference + test]).keys())

    # Construct a value->index dictionary
    indices = dict([(val, i) for (i, val) in enumerate(values)])

    # Make a confusion matrix table: rows are indexed by reference
    # value, columns by test value.
    confusion = [[0 for val in values] for val in values]
    max_conf = 0  # Maximum confusion
    for w, g in zip(reference, test):
        row = confusion[indices[w]]
        col = indices[g]
        row[col] += 1
        max_conf = max(max_conf, row[col])

    #: A list of all values in C{reference} or C{test}.
    self._values = values
    #: A dictionary mapping values in L{self._values} to their indices.
    self._indices = indices
    #: The confusion matrix itself (as a list of lists of counts).
    self._confusion = confusion
    #: The greatest count in L{self._confusion} (used for printing).
    self._max_conf = max_conf
    #: The total number of values in the confusion matrix.
    self._total = len(reference)
    #: The number of correct (on-diagonal) values in the matrix.
    self._correct = sum([confusion[i][i] for i in range(len(values))])
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of reference values that appear in the test set.
    In particular, return |C{reference}S{cap}C{test}|/|C{reference}|.
    If C{reference} is empty, then return C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference
        set.
    @rtype: C{float} or C{None}
    """
    assert chktype(1, reference, sets.BaseSet)
    assert chktype(2, test, sets.BaseSet)

    # Undefined for an empty reference set.
    if len(reference) == 0:
        return None

    shared = reference.intersection(test)
    return float(len(shared)) / len(reference)
def classify(self, token):
    """
    Assign the token to a cluster and record the result in its
    C{CLUSTER} property.

    The C{FEATURES} vector is normalised and/or projected with the
    stored SVD matrix, according to this clusterer's settings, then
    classified with C{classify_vectorspace}; the name given by
    C{cluster_name} for that cluster is stored on the token.

    @param token: a token with a C{FEATURES} property
    @type token: L{Token}
    """
    assert chktype(1, token, Token)
    vector = token['FEATURES']
    if self._should_normalise:
        vector = self._normalise(vector)
    # Identity test: self._Tt is either None or a matrix, and comparing
    # a matrix with != is not a reliable "is it set?" check.
    if self._Tt is not None:
        vector = Numeric.matrixmultiply(self._Tt, vector)
    cluster = self.classify_vectorspace(vector)
    token['CLUSTER'] = self.cluster_name(cluster)
def tag(self, token):
    """
    Tag every subtoken of the given token, in sequential order, by
    calling C{tag_subtoken} for each position and storing the result
    in that subtoken's TAG property.

    @param token: the token whose subtokens should be tagged
    @type token: L{Token}
    """
    assert chktype(1, token, Token)
    SUBTOKENS = self.property('SUBTOKENS')
    TAG = self.property('TAG')

    sequence = token[SUBTOKENS]
    for position, current in enumerate(sequence):
        current[TAG] = self.tag_subtoken(sequence, position)
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of reference values that appear in the test set.
    In particular, return |C{reference}S{cap}C{test}|/|C{reference}|.
    If C{reference} is empty, then return C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference
        set.
    @rtype: C{float} or C{None}
    """
    assert chktype(1, reference, sets.BaseSet)
    assert chktype(2, test, sets.BaseSet)

    reference_size = len(reference)
    if reference_size == 0:
        # Nothing to recall.
        return None

    found = len(reference.intersection(test))
    return float(found) / len(reference)
def __init__(self, initial_means, priors=None, covariance_matrices=None,
             conv_threshold=1e-6, bias=0.1, normalise=False,
             svd_dimensions=None):
    """
    Creates an EM clusterer with the given starting parameters,
    convergence threshold and vector mangling parameters.

    @param initial_means: the means of the gaussian cluster centers
    @type initial_means: [seq of] Numeric array or seq of SparseArray
    @param priors: the prior probability for each cluster
    @type priors: Numeric array or seq of float
    @param covariance_matrices: the covariance matrix for each cluster
    @type covariance_matrices: [seq of] Numeric array
    @param conv_threshold: maximum change in likelihood before deemed
        convergent
    @type conv_threshold: int or float
    @param bias: variance bias used to ensure non-singular covariance
        matrices
    @type bias: float
    @param normalise: should vectors be normalised to length 1
    @type normalise: boolean
    @param svd_dimensions: number of dimensions to use in reducing
        vector dimensionality with SVD
    @type svd_dimensions: int
    """
    # Arguments 1-3 and 5 (bias) are not type-checked.
    assert chktype(4, conv_threshold, float, int)
    assert chktype(6, normalise, bool)
    assert chktype(7, svd_dimensions, int, types.NoneType)

    VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

    # One cluster per supplied mean.
    self._means = Numeric.array(initial_means, Numeric.Float64)
    self._num_clusters = len(initial_means)

    self._priors = priors
    self._covariance_matrices = covariance_matrices
    self._conv_threshold = conv_threshold
    self._bias = bias
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal.  In particular, return the fraction of indices
    C{0<=i<len(test)} such that C{test[i] == reference[i]}.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the corresponding
        reference values.
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    assert chktype(1, reference, [])
    assert chktype(2, test, [])
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    matches = 0
    for expected, actual in zip(reference, test):
        if expected == actual:
            matches += 1
    return float(matches) / len(reference)
def groups(self, n):
    """
    Finds the n-groups of items (leaves) reachable from a cut at depth
    n.

    @param n: number of groups
    @type n: int
    @return: the result of calling C{groups(n)} on the root node
    """
    assert chktype(1, n, int)

    # A single remaining item is already the root.
    if len(self._items) <= 1:
        return self._items[0].groups(n)

    # Otherwise join whatever is left under one temporary root.
    top = _DendogramNode(self._merge, *self._items)
    return top.groups(n)
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal.  In particular, return the fraction of indices
    C{0<=i<len(test)} such that C{test[i] == reference[i]}.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the corresponding
        reference values.
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    assert chktype(1, reference, [])
    assert chktype(2, test, [])
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    agreeing = 0
    for ref_value, test_value in zip(reference, test):
        if ref_value == test_value:
            agreeing += 1
    return float(agreeing) / len(reference)
def likelihood(self, labelled_token):
    """
    Returns the likelihood (a float) of the token having the
    corresponding cluster: C{1.0} if classifying a copy of the token
    without its C{CLUSTER} property produces a token equal to the
    original, and C{0.0} otherwise.

    @param labelled_token: a token with a C{CLUSTER} property
    @type labelled_token: L{Token}
    @rtype: C{float}
    """
    assert chktype(1, labelled_token, Token)
    assert labelled_token.has('CLUSTER')

    # Re-derive the cluster on an unlabelled version of the token.
    relabelled = labelled_token.exclude('CLUSTER')
    self.classify(relabelled)

    if relabelled == labelled_token:
        return 1.0
    return 0.0
def vector(self, token):
    """
    Returns the vector after normalisation and dimensionality reduction
    for the given token's FEATURES.

    @param token: a token with a C{FEATURES} property
    @type token: L{Token}
    @return: the (possibly normalised and projected) vector
    """
    assert chktype(1, token, Token)
    vector = token['FEATURES']
    if self._should_normalise:
        vector = self._normalise(vector)
    # Identity test: self._Tt is either None or a matrix, and comparing
    # a matrix with != is not a reliable "is it set?" check.
    if self._Tt is not None:
        vector = Numeric.matrixmultiply(self._Tt, vector)
    return vector
def copy(self, deep=True):
    """
    @rtype: L{Token}
    @return: A new copy of this token.
    @param deep: If false, then the new token will use the same
        objects to encode feature values that the original token
        did.  If true, then the new token will use deep copies of
        the original token's feature values.  The default value of
        C{True} is almost always the correct choice.
    """
    assert chktype(1, deep, bool)
    # "copy" below is the module-level name, not this method.
    if not deep:
        return copy.copy(self)
    return copy.deepcopy(self)