Пример #1
0
	def train(cls, labels, multi_label_feats, trainf, **train_kwargs):
		'''
		Train one binary classifier per label and wrap them in a cls instance.

		labels: all known labels; a feature dict is a negative example for every
			one of these labels that is not among its own labels.
		multi_label_feats: iterable of (feature dict, labels of that instance).
		trainf: called as trainf(feats, **train_kwargs), where feats is a list of
			(feature dict, bool) pairs for a single label.

		Returns cls(mapping of label -> trained classifier).
		'''
		def freeze(featdict):
			# dicts are unhashable, so use a normalized tuple of key-values
			return tuple(sorted(featdict.items()))

		known_labels = set(labels)
		examples = collections.defaultdict(list)
		positives = collections.defaultdict(set)

		for feat, multi_labels in multi_label_feats:
			for label in multi_labels:
				examples[label].append((feat, True))
				positives[label].add(freeze(feat))

			for label in known_labels - set(multi_labels):
				examples[label].append((feat, False))

		# drop every negative example whose feature dict is also a positive
		# example for the same label, so we don't create training conflicts
		for label in examples.keys():
			examples[label] = [
				(feat, is_positive)
				for feat, is_positive in examples[label]
				if is_positive or freeze(feat) not in positives[label]
			]

		return cls(dict(
			(label, trainf(feats, **train_kwargs))
			for label, feats in iteritems(examples)
		))
Пример #2
0
    def train(cls, labels, multi_label_feats, trainf, **train_kwargs):
        '''
        Train one binary classifier per label and wrap them in a cls instance.

        labels: all known labels; a feature dict is a negative example for
            every one of these labels that is not among its own labels.
        multi_label_feats: iterable of (feature dict, labels of that instance).
        trainf: called as trainf(feats, **train_kwargs), where feats is a list
            of (feature dict, bool) pairs for a single label.

        Returns cls(mapping of label -> trained classifier).
        '''
        def freeze(featdict):
            # dicts are unhashable, so use a normalized tuple of key-values
            return tuple(sorted(featdict.items()))

        known_labels = set(labels)
        examples = collections.defaultdict(list)
        positives = collections.defaultdict(set)

        for feat, multi_labels in multi_label_feats:
            for label in multi_labels:
                examples[label].append((feat, True))
                positives[label].add(freeze(feat))

            for label in known_labels - set(multi_labels):
                examples[label].append((feat, False))

        # drop every negative example whose feature dict is also a positive
        # example for the same label, so we don't create training conflicts
        for label in examples.keys():
            examples[label] = [
                (feat, is_positive)
                for feat, is_positive in examples[label]
                if is_positive or freeze(feat) not in positives[label]
            ]

        return cls(dict(
            (label, trainf(feats, **train_kwargs))
            for label, feats in iteritems(examples)
        ))
Пример #3
0
	def classify(self, feats):
		'''
		Return the set of labels whose classifier answers exactly True for feats.

		Every classifier in self._label_classifiers is asked once; any answer
		other than the True singleton leaves its label out of the result.
		'''
		return set(
			label
			for label, classifier in iteritems(self._label_classifiers)
			if classifier.classify(feats) is True
		)
Пример #4
0
    def category_words():
        '''
        Return a generator of (category, words) tuples built from train_instances.

        words is itself a generator that runs over every word of every instance
        of that category. Used if we are scoring the words for correlation to
        categories for feature selection (i.e., score_fn and max_feats are set).
        '''
        def flatten(instance_list):
            # chain the words of all instances into one lazy stream
            return (word for instance in instance_list for word in instance)

        return ((category, flatten(instance_list))
                for category, instance_list in iteritems(train_instances))
Пример #5
0
    def classify(self, feats):
        '''
        Return the set of labels whose classifier answers exactly True for feats.

        Every classifier in self._label_classifiers is asked once; any answer
        other than the True singleton leaves its label out of the result.
        '''
        return set(
            label
            for label, classifier in iteritems(self._label_classifiers)
            if classifier.classify(feats) is True
        )
Пример #6
0
	def feature_detector(self, tokens, index, history):
		'''
		Extend the ClassifierBasedPOSTagger features with the custom ones in self.funs.

		Each (key, function) entry of self.funs is applied to the token at index
		and its result is stored under key, overwriting any base feature that
		uses the same key.
		'''
		features = ClassifierBasedPOSTagger.feature_detector(self, tokens, index, history)
		token = tokens[index]

		for name, extractor in iteritems(self.funs):
			features[name] = extractor(token)

		return features
Пример #7
0
    def feature_detector(self, tokens, index, history):
        '''
        Extend the ClassifierBasedPOSTagger features with the custom ones in
        self.funs.

        Each (key, function) entry of self.funs is applied to the token at
        index and its result is stored under key, overwriting any base feature
        that uses the same key.
        '''
        features = ClassifierBasedPOSTagger.feature_detector(
            self, tokens, index, history)
        token = tokens[index]

        for name, extractor in iteritems(self.funs):
            features[name] = extractor(token)

        return features
Пример #8
0
def extract_features(label_instances, featx):
    '''
    Turn labeled instances into a list of (featx(instance), label) pairs.

    label_instances is either
      * a dict of label -> instances (the not (args.multi and args.binary) case),
        e.g. { 'spam': [ ['hello','world',...], ... ], 'ham': [ ... ] }, or
      * an iterable of (instance, label) pairs (the args.multi and args.binary
        case), e.g. [ (['hello','world',...],label1), (['lorem','ipsum'],label2) ]
    '''
    if not isinstance(label_instances, dict):
        return [(featx(instance), label)
                for instance, label in label_instances]

    return [(featx(instance), label)
            for label, instances in iteritems(label_instances)
            for instance in instances]
def extract_features(label_instances, featx):
	'''
	Turn labeled instances into a list of (featx(instance), label) pairs.

	label_instances is either
	  * a dict of label -> instances (the not (args.multi and args.binary) case),
	    e.g. { 'spam': [ ['hello','world',...], ... ], 'ham': [ ... ] }, or
	  * an iterable of (instance, label) pairs (the args.multi and args.binary
	    case), e.g. [ (['hello','world',...],label1), (['lorem','ipsum'],label2) ]
	'''
	if not isinstance(label_instances, dict):
		return [(featx(instance), label) for instance, label in label_instances]

	return [
		(featx(instance), label)
		for label, instances in iteritems(label_instances)
		for instance in instances
	]
Пример #10
0
    def category_words():
        '''
        Return an iterator of (category, words) tuples, where words is the list
        of all words in instances of that category.

        Each item of train_instances is a (words, cats) pair; cats is either a
        single category or an iterable of categories, in which case the words
        are added to every one of them. Used if we are scoring the words for
        correlation to categories for feature selection (i.e., score_fn and
        max_feats are set).
        '''
        # collections.Iterable was removed in Python 3.10; fall back to the old
        # location only where collections.abc does not exist
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        # type(u'') is unicode on Python 2 and str on Python 3
        string_types = (str, bytes, type(u''))

        # defaultdict needs a factory callable; passing a list instance raises
        # TypeError
        cat_words = defaultdict(list)
        for (words, cats) in train_instances:
            # a string is iterable, but it names one category, not one per
            # character
            if isinstance(cats, Iterable) and not isinstance(cats, string_types):
                for cat in cats:
                    cat_words[cat].extend(words)
            else:
                cat_words[cats].extend(words)
        return iteritems(cat_words)
Пример #11
0
	def category_words():
		'''
		Return an iterator of (category, words) tuples, where words is the list
		of all words in instances of that category.

		Each item of train_instances is a (words, cats) pair; cats is either a
		single category or an iterable of categories, in which case the words
		are added to every one of them. Used if we are scoring the words for
		correlation to categories for feature selection (i.e., score_fn and
		max_feats are set).
		'''
		# collections.Iterable was removed in Python 3.10; fall back to the old
		# location only where collections.abc does not exist
		try:
			from collections.abc import Iterable
		except ImportError:
			from collections import Iterable

		# type(u'') is unicode on Python 2 and str on Python 3
		string_types = (str, bytes, type(u''))

		# defaultdict needs a factory callable; passing a list instance raises
		# TypeError
		cat_words = defaultdict(list)
		for (words, cats) in train_instances:
			# a string is iterable, but it names one category, not one per
			# character
			if isinstance(cats, Iterable) and not isinstance(cats, string_types):
				for cat in cats:
					cat_words[cat].extend(words)
			else:
				cat_words[cats].extend(words)
		return iteritems(cat_words)
Пример #12
0
def sum_category_word_scores(categorized_words, score_fn):
    '''
    Score every word by summing score_fn over the categories it occurs in.

    categorized_words: iterable of (category, words) pairs; each words
        iterable is consumed exactly once.
    score_fn: called as score_fn(n_ii, (n_ix, n_xi), n_xx), where n_ii is the
        count of the word within the category, n_ix the count of the word
        over all categories, n_xi the number of words counted in the category
        and n_xx the total number of words counted.

    Returns a defaultdict(int) mapping word -> summed score.
    '''
    # plain Counters replace FreqDist/ConditionalFreqDist: FreqDist.inc() no
    # longer exists in NLTK 3, and Counters give the same counts
    word_fd = collections.Counter()
    category_word_fd = collections.defaultdict(collections.Counter)

    for category, words in categorized_words:
        for word in words:
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    # each word increments word_fd and its category counter together, so the
    # overall total is simply the sum of word_fd
    n_xx = sum(word_fd.values())

    for word_counts in category_word_fd.values():
        n_xi = sum(word_counts.values())

        for word, n_ii in word_counts.items():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def sum_category_word_scores(categorized_words, score_fn):
	'''
	Score every word by summing score_fn over the categories it occurs in.

	categorized_words: iterable of (category, words) pairs.
	score_fn: called as score_fn(n_ii, (n_ix, n_xi), n_xx), where n_ii is the
		count of the word within the category, n_ix the count of the word over
		all categories, n_xi the number of words counted in the category and
		n_xx the total number of words counted.

	Returns a defaultdict(int) mapping word -> summed score.
	'''
	overall = collections.Counter()
	by_category = collections.defaultdict(collections.Counter)

	for category, words in categorized_words:
		for word in words:
			overall[word] += 1
			by_category[category][word] += 1

	# every word is counted once overall and once under its category, so the
	# overall total equals the total over all category counters
	n_xx = sum(overall.values())
	scores = collections.defaultdict(int)

	for category in by_category:
		counts = by_category[category]
		n_xi = sum(counts.values())

		for word, n_ii in iteritems(counts):
			scores[word] += score_fn(n_ii, (overall[word], n_xi), n_xx)

	return scores
Пример #14
0
def sum_category_word_scores(categorized_words, score_fn):
	'''
	Score every word by summing score_fn over the categories it occurs in.

	categorized_words: iterable of (category, words) pairs; each words
		iterable is consumed exactly once.
	score_fn: called as score_fn(n_ii, (n_ix, n_xi), n_xx), where n_ii is the
		count of the word within the category, n_ix the count of the word over
		all categories, n_xi the number of words counted in the category and
		n_xx the total number of words counted.

	Returns a defaultdict(int) mapping word -> summed score.
	'''
	# plain Counters replace FreqDist/ConditionalFreqDist: FreqDist.inc() no
	# longer exists in NLTK 3, and Counters give the same counts
	word_fd = collections.Counter()
	category_word_fd = collections.defaultdict(collections.Counter)

	for category, words in categorized_words:
		for word in words:
			word_fd[word] += 1
			category_word_fd[category][word] += 1

	scores = collections.defaultdict(int)
	# each word increments word_fd and its category counter together, so the
	# overall total is simply the sum of word_fd
	n_xx = sum(word_fd.values())

	for word_counts in category_word_fd.values():
		n_xi = sum(word_counts.values())

		for word, n_ii in word_counts.items():
			n_ix = word_fd[word]
			scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

	return scores
Пример #15
0
def sum_category_word_scores(categorized_words, score_fn):
    '''
    Score every word by summing score_fn over the categories it occurs in.

    categorized_words: iterable of (category, words) pairs.
    score_fn: called as score_fn(n_ii, (n_ix, n_xi), n_xx), where n_ii is the
        count of the word within the category, n_ix the count of the word
        over all categories, n_xi the number of words counted in the category
        and n_xx the total number of words counted.

    Returns a defaultdict(int) mapping word -> summed score.
    '''
    overall = collections.Counter()
    by_category = collections.defaultdict(collections.Counter)

    for category, words in categorized_words:
        for word in words:
            overall[word] += 1
            by_category[category][word] += 1

    # every word is counted once overall and once under its category, so the
    # overall total equals the total over all category counters
    n_xx = sum(overall.values())
    scores = collections.defaultdict(int)

    for category in by_category:
        counts = by_category[category]
        n_xi = sum(counts.values())

        for word, n_ii in iteritems(counts):
            scores[word] += score_fn(n_ii, (overall[word], n_xi), n_xx)

    return scores
Пример #16
0
	def category_words():
		'''
		Return a generator of (category, words) tuples built from train_instances.

		words is itself a generator that runs over every word of every instance
		of that category. Used if we are scoring the words for correlation to
		categories for feature selection (i.e., score_fn and max_feats are set).
		'''
		def flatten(instance_list):
			# chain the words of all instances into one lazy stream
			return (word for instance in instance_list for word in instance)

		return (
			(category, flatten(instance_list))
			for category, instance_list in iteritems(train_instances)
		)
Пример #17
0
def cross_fold(instances,
               trainf,
               testf,
               folds=10,
               trace=1,
               metrics=True,
               informative=0):
    '''
    Run cross-validation over shuffled instances and collect per-fold results.

    instances: iterable of labeled instances; it is copied into a list and
        shuffled, so the caller's object is left untouched.
    trainf: called with the training instances of a fold, returns the trained
        object.
    testf: called as testf(obj, test_instances), returns the fold accuracy.
    folds: number of folds, at least 2.
    trace: when truthy, print progress for each fold and a final summary.
    metrics: when truthy, compute precision, recall and f-measure for every
        key of the reference/test sets returned by ref_test_sets.
    informative: when truthy (together with trace), passed to
        obj.show_most_informative_features if the object has that method.

    Returns (accuracies, precisions, recalls, f_measures): accuracies is a
    list with one entry per fold, the other three are defaultdicts mapping a
    key to the list of values collected for it.

    Raises ValueError when folds is less than 2.
    '''
    if folds < 2:
        # the guard accepts 2 folds, so the message must not claim 3
        raise ValueError('must have at least 2 folds')
    # ensure isn't an exhaustible iterable
    instances = list(instances)
    # randomize so get an even distribution, in case labeled instances are
    # ordered by label
    random.shuffle(instances)
    total = len(instances)
    # exact integer fold size; the trailing total % folds instances are only
    # ever trained on, never tested
    step = total // folds

    if trace:
        print('step %d over %d folds of %d instances' % (step, folds, total))

    accuracies = []
    precisions = collections.defaultdict(list)
    recalls = collections.defaultdict(list)
    f_measures = collections.defaultdict(list)

    for fold in range(folds):
        if trace:
            print('\nfold %d' % (fold + 1))
            print('-----%s' % ('-' * len('%s' % (fold + 1))))

        start = fold * step
        end = start + step
        train_instances = instances[:start] + instances[end:]
        test_instances = instances[start:end]

        if trace:
            print('training on %d:%d + %d:%d' % (0, start, end, total))

        obj = trainf(train_instances)

        if trace:
            print('testing on %d:%d' % (start, end))

        if metrics:
            refsets, testsets = ref_test_sets(obj, test_instances)

            for key in set(refsets.keys()) | set(testsets.keys()):
                ref = refsets[key]
                test = testsets[key]
                # the metric functions may return None; record that as 0
                p = precision(ref, test) or 0
                r = recall(ref, test) or 0
                # named f_score so it cannot clobber the fold counter
                f_score = f_measure(ref, test) or 0
                precisions[key].append(p)
                recalls[key].append(r)
                f_measures[key].append(f_score)

                if trace:
                    print('%s precision: %f' % (key, p))
                    print('%s recall: %f' % (key, r))
                    print('%s f-measure: %f' % (key, f_score))

        accuracy = testf(obj, test_instances)

        if trace:
            print('accuracy: %f' % accuracy)

        accuracies.append(accuracy)

        if trace and informative and hasattr(obj,
                                             'show_most_informative_features'):
            obj.show_most_informative_features(informative)

    if trace:
        print('\nmean and variance across folds')
        print('------------------------------')
        print('accuracy mean: %f' % (sum(accuracies) / folds))
        print('accuracy variance: %f' % array(accuracies).var())

        # NOTE: the means below divide by folds, not by the number of values
        # collected, so a key absent from a fold counts as 0 for that fold
        for key, ps in iteritems(precisions):
            print('%s precision mean: %f' % (key, sum(ps) / folds))
            print('%s precision variance: %f' % (key, array(ps).var()))

        for key, rs in iteritems(recalls):
            print('%s recall mean: %f' % (key, sum(rs) / folds))
            print('%s recall variance: %f' % (key, array(rs).var()))

        for key, fs in iteritems(f_measures):
            print('%s f_measure mean: %f' % (key, sum(fs) / folds))
            print('%s f_measure variance: %f' % (key, array(fs).var()))

    return accuracies, precisions, recalls, f_measures
Пример #18
0
def cross_fold(instances, trainf, testf, folds=10, trace=1, metrics=True, informative=0):
	'''
	Run cross-validation over shuffled instances and collect per-fold results.

	instances: iterable of labeled instances; it is copied into a list and
		shuffled, so the caller's object is left untouched.
	trainf: called with the training instances of a fold, returns the trained
		object.
	testf: called as testf(obj, test_instances), returns the fold accuracy.
	folds: number of folds, at least 2.
	trace: when truthy, print progress for each fold and a final summary.
	metrics: when truthy, compute precision, recall and f-measure for every
		key of the reference/test sets returned by ref_test_sets.
	informative: when truthy (together with trace), passed to
		obj.show_most_informative_features if the object has that method.

	Returns (accuracies, precisions, recalls, f_measures): accuracies is a
	list with one entry per fold, the other three are defaultdicts mapping a
	key to the list of values collected for it.

	Raises ValueError when folds is less than 2.
	'''
	if folds < 2:
		# the guard accepts 2 folds, so the message must not claim 3
		raise ValueError('must have at least 2 folds')
	# ensure isn't an exhaustible iterable
	instances = list(instances)
	# randomize so get an even distribution, in case labeled instances are
	# ordered by label
	random.shuffle(instances)
	total = len(instances)
	# floor division keeps step an int on Python 3 too (a float cannot be used
	# as a slice index); the trailing total % folds instances are only ever
	# trained on, never tested
	step = total // folds

	if trace:
		print('step %d over %d folds of %d instances' % (step, folds, total))

	accuracies = []
	precisions = collections.defaultdict(list)
	recalls = collections.defaultdict(list)
	f_measures = collections.defaultdict(list)

	for fold in range(folds):
		if trace:
			print('\nfold %d' % (fold+1))
			print('-----%s' % ('-'*len('%s' % (fold+1))))

		start = fold * step
		end = start + step
		train_instances = instances[:start] + instances[end:]
		test_instances = instances[start:end]

		if trace:
			print('training on %d:%d + %d:%d' % (0, start, end, total))

		obj = trainf(train_instances)

		if trace:
			print('testing on %d:%d' % (start, end))

		if metrics:
			refsets, testsets = ref_test_sets(obj, test_instances)

			# set union works for Python 2 key lists and Python 3 key views
			# alike, whereas keys() + keys() fails on views
			for key in set(refsets.keys()) | set(testsets.keys()):
				ref = refsets[key]
				test = testsets[key]
				# the metric functions may return None; record that as 0
				p = precision(ref, test) or 0
				r = recall(ref, test) or 0
				# named f_score so it cannot clobber the fold counter
				f_score = f_measure(ref, test) or 0
				precisions[key].append(p)
				recalls[key].append(r)
				f_measures[key].append(f_score)

				if trace:
					print('%s precision: %f' % (key, p))
					print('%s recall: %f' % (key, r))
					print('%s f-measure: %f' % (key, f_score))

		accuracy = testf(obj, test_instances)

		if trace:
			print('accuracy: %f' % accuracy)

		accuracies.append(accuracy)

		if trace and informative and hasattr(obj, 'show_most_informative_features'):
			obj.show_most_informative_features(informative)

	if trace:
		print('\nmean and variance across folds')
		print('------------------------------')
		print('accuracy mean: %f' % (sum(accuracies) / folds))
		print('accuracy variance: %f' % array(accuracies).var())

		# NOTE: the means below divide by folds, not by the number of values
		# collected, so a key absent from a fold counts as 0 for that fold
		for key, ps in iteritems(precisions):
			print('%s precision mean: %f' % (key, sum(ps) / folds))
			print('%s precision variance: %f' % (key, array(ps).var()))

		for key, rs in iteritems(recalls):
			print('%s recall mean: %f' % (key, sum(rs) / folds))
			print('%s recall variance: %f' % (key, array(rs).var()))

		for key, fs in iteritems(f_measures):
			print('%s f_measure mean: %f' % (key, sum(fs) / folds))
			print('%s f_measure variance: %f' % (key, array(fs).var()))

	return accuracies, precisions, recalls, f_measures