def Do_alpha(self):
    """The observed disagreement for the alpha coefficient.

    The alpha coefficient, unlike the other metrics, uses this rather
    than observed agreement.
    """
    total = 0.0
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)
        for j, nj in compat.iteritems(label_freqs):
            for l, nl in compat.iteritems(label_freqs):
                total += float(nj * nl) * self.distance(l, j)
    ret = (1.0 / float(len(self.I) * len(self.C) * (len(self.C) - 1))) * total
    log.debug("Observed disagreement: %f", ret)
    return ret
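# A minimal usage sketch (assumption: the coder/item/label triples below are
# illustrative, not from the source). Do_alpha is a method of
# nltk.metrics.agreement.AnnotationTask; alpha() calls it internally.
from nltk.metrics.agreement import AnnotationTask

task = AnnotationTask(data=[
    ('c1', '1', 'v1'), ('c2', '1', 'v1'),   # coders agree on item 1
    ('c1', '2', 'v1'), ('c2', '2', 'v2'),   # coders disagree on item 2
])
print(task.Do_alpha())  # observed disagreement
print(task.alpha())     # Krippendorff's alpha, built on this value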
def svm_label_name(self, label):
    """
    Searches the values of _labelmapping to resolve +1 or -1 to a string.

    :param label: the SVMlight integer label (+1 or -1) to look up
    """
    labelname = [k for k, v in compat.iteritems(self._labelmapping)
                 if v == label][0]
    return labelname
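# A tiny sketch (assumption: a classifier whose _labelmapping was built by
# train() in this module, e.g. {'play': -1, 'stay': 1}); this method simply
# reverses that mapping.
# classifier.svm_label_name(-1)   # -> 'play'
# classifier.svm_label_name(1)    # -> 'stay'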
def _apply_filter(self, fn=lambda ngram, freq: False):
    """Generic filter: removes ngrams from the frequency distribution
    if the function returns True when passed an ngram tuple.
    """
    tmp_ngram = FreqDist()
    for ngram, freq in iteritems(self.ngram_fd):
        if not fn(ngram, freq):
            tmp_ngram[ngram] = freq
    self.ngram_fd = tmp_ngram
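# A sketch of how the public filters are layered on _apply_filter (assumption:
# the surrounding class is a collocation finder such as
# BigramCollocationFinder; the word list is illustrative). apply_freq_filter
# just passes a frequency predicate through this method.
from nltk.collocations import BigramCollocationFinder

words = ['the', 'quick', 'fox', 'the', 'quick', 'dog', 'the', 'quick', 'fox']
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(2)  # i.e. _apply_filter(lambda ngram, freq: freq < 2)
print(sorted(finder.ngram_fd.items()))  # only bigrams seen at least twice remain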
def pi(self):
    """Scott 1955; here, multi-pi.
    Equivalent to K from Siegel and Castellan (1988).
    """
    total = 0.0
    label_freqs = FreqDist(x['labels'] for x in self.data)
    for k, f in compat.iteritems(label_freqs):
        total += f ** 2
    Ae = total / float((len(self.I) * len(self.C)) ** 2)
    return (self.avg_Ao() - Ae) / (1 - Ae)
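# A worked sketch of the expected-agreement term (assumption: the two-coder,
# two-item data below is illustrative). With label totals n_v1 = 3 and
# n_v2 = 1 over len(I) * len(C) = 4 judgements,
# Ae = (3**2 + 1**2) / 4**2 = 0.625, and pi = (avg_Ao - Ae) / (1 - Ae).
from nltk.metrics.agreement import AnnotationTask

task = AnnotationTask(data=[
    ('c1', '1', 'v1'), ('c2', '1', 'v1'),
    ('c1', '2', 'v1'), ('c2', '2', 'v2'),
])
print(task.pi())  # (0.5 - 0.625) / (1 - 0.625) = -0.333...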
def _featuresets_to_array(self, featuresets):
    """Convert featuresets to a dense NumPy array."""
    X = np.zeros((len(featuresets), len(self._feature_index)),
                 dtype=self._dtype)
    for i, fs in enumerate(featuresets):
        for f, v in compat.iteritems(fs):
            try:
                X[i, self._feature_index[f]] = self._dtype(v)
            except KeyError:
                # feature not seen in training
                pass
    return X
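# A minimal self-contained sketch of the dense conversion (assumption: the
# toy feature_index and featuresets below stand in for self._feature_index
# and the method's input). Features absent from the index are silently
# dropped, mirroring the KeyError branch above.
import numpy as np

feature_index = {'a': 0, 'b': 1}
featuresets = [{'a': 1, 'b': 2}, {'a': 3, 'unseen': 9}]
X = np.zeros((len(featuresets), len(feature_index)), dtype=float)
for i, fs in enumerate(featuresets):
    for f, v in fs.items():
        if f in feature_index:   # 'unseen' is skipped
            X[i, feature_index[f]] = float(v)
print(X)  # [[1. 2.], [3. 0.]]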
def train(featuresets):
    """
    Given a set of training instances in NLTK format:
    [ ( {feature:value, ..}, str(label) ) ]
    train a support vector machine.

    :param featuresets: training instances
    """
    _raise_if_svmlight_is_missing()
    # build a unique list of labels
    labels = set()
    for (features, label) in featuresets:
        labels.add(label)
    # this is a binary classifier only
    if len(labels) > 2:
        raise ValueError('Can only do boolean classification (labels: ' +
                         str(labels) + ')')
    # we need ordering, so a set's no good
    labels = list(labels)
    # next, assign -1 and 1
    labelmapping = {labels[0]: -1, labels[1]: 1}
    # now for feature conversion: iterate through the instances, building a
    # set of feature:type:str(value) triples
    svmfeatures = set()
    for (features, label) in featuresets:
        for k, v in compat.iteritems(features):
            svmfeatures.add(featurename(k, v))
    # svmfeatures is indexable by integer svm feature number;
    # svmfeatureindex is the inverse (svm feature name -> number)
    svmfeatures = list(svmfeatures)
    svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))
    # build the svm feature set case by case
    svmfeatureset = []
    for instance in featuresets:
        svmfeatureset.append(
            map_instance_to_svm(instance, labelmapping, svmfeatureindex))
    # train the svm
    # TODO: implement passing of SVMlight parameters from train() to learn()
    return SvmClassifier(
        labels, labelmapping, svmfeatures,
        svmlight.learn(svmfeatureset, type='classification'))
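# A usage sketch (assumption: the optional svmlight bindings that
# _raise_if_svmlight_is_missing() checks for are installed, and the data
# below is illustrative). train() takes NLTK-style featuresets and returns
# a binary SvmClassifier.
featuresets = [
    ({'outlook': 'sunny', 'windy': False}, 'play'),
    ({'outlook': 'rainy', 'windy': True}, 'stay'),
]
classifier = train(featuresets)
print(classifier.classify({'outlook': 'sunny', 'windy': False}))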
def map_features_to_svm(features, svmfeatureindex):
    """
    :param features: a dict of features in the format {'feature': value}
    :param svmfeatureindex: a mapping from feature:value pairs to integer
        SVMlight feature labels
    """
    instancefeatures = []
    # SVMlight supports sparse feature sets, so we simply omit any feature
    # that is not in the index.
    for k, v in compat.iteritems(features):
        # Each feature is represented as an (int, float) tuple, where the
        # int is the SVMlight feature label and the float is the value.
        # Scalar features are not supported: each value a feature may take
        # on is a discrete, independent label, so the value 1.0 simply
        # marks the presence of a feature:value couple.
        svmfeaturename = featurename(k, v)
        if svmfeaturename not in svmfeatureindex:
            # skip feature:value pairs that were not in the training data
            # and so were not included in our mappings
            continue
        instancefeatures.append((svmfeatureindex[svmfeaturename], 1.0))
    return instancefeatures
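# A worked sketch of the sparse mapping (assumption: the toy index below;
# featurename() combines a feature key and value into a single string key,
# its exact format being defined elsewhere in this module).
svmfeatureindex = {featurename('outlook', 'sunny'): 0,
                   featurename('windy', False): 1}
print(map_features_to_svm({'outlook': 'sunny', 'temp': 'hot'},
                          svmfeatureindex))
# -> [(0, 1.0)]   'temp':'hot' was never indexed, so it is omitted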
def _featuresets_to_coo(self, featuresets):
    """Convert featuresets to a sparse matrix (COO format)."""
    i_ind = []
    j_ind = []
    values = []
    for i, fs in enumerate(featuresets):
        for f, v in compat.iteritems(fs):
            try:
                j = self._feature_index[f]
                i_ind.append(i)
                j_ind.append(j)
                values.append(self._dtype(v))
            except KeyError:
                # feature not seen in training
                pass
    # len(featuresets) keeps the row count correct even when featuresets
    # is empty and the loop variable i is never bound
    shape = (len(featuresets), len(self._feature_index))
    return coo_matrix((values, (i_ind, j_ind)),
                      shape=shape, dtype=self._dtype)
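# A minimal self-contained sketch of the COO construction (assumption: the
# toy feature_index and featuresets below stand in for self._feature_index
# and the method's input). Row indices, column indices, and values are
# collected in parallel lists and handed to scipy.
from scipy.sparse import coo_matrix

feature_index = {'a': 0, 'b': 1}
featuresets = [{'a': 1}, {'b': 2, 'unseen': 9}]
i_ind, j_ind, values = [], [], []
for i, fs in enumerate(featuresets):
    for f, v in fs.items():
        if f in feature_index:
            i_ind.append(i)
            j_ind.append(feature_index[f])
            values.append(float(v))
X = coo_matrix((values, (i_ind, j_ind)),
               shape=(len(featuresets), len(feature_index)))
print(X.toarray())  # [[1. 0.], [0. 2.]]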