# Name: Retrieve # ------------------------------------------------------------------------------- import os import re import string from collections import defaultdict from lex.oed.projects.thesaurus.classifier.pickler.sensemanager import PickleLoader from lex.oed.projects.thesaurus.classifier.tracer import trace_class, trace_instance, trace_sense from lex.oed.projects.thesaurus.classifier.config import ThesaurusConfig config = ThesaurusConfig() parent_directories = [config.get("paths", "iteration1_dir"), config.get("paths", "iteration2_dir")] letters = string.ascii_uppercase while 1: print """ =========================================================== Enter lemma (optionally followed by '-c' or '-u' to specify classified or unclassified): """ lemma = raw_input(">>>") lemma = lemma.strip() if lemma.endswith(" -c"): dirs = ["classified"]
from __future__ import division, print_function
from collections import defaultdict
import string
import numpy
from lex.oed.projects.thesaurus.classifier.config import ThesaurusConfig
from lex.oed.projects.thesaurus.classifier.bayes.bayesclassifier import BayesClassifier
from lex.oed.projects.thesaurus.classifier.compounds.bayescompounds import BayesCompounds

# NOTE(review): this chunk was collapsed onto one physical line; the line
# breaks and indentation below are reconstructed.  find_word is truncated at
# the end of the visible chunk.

config = ThesaurusConfig()

# The plain BayesClassifier was evidently tried and commented out in favour
# of the compound-specific classifier.
#bayes = BayesClassifier(
#    resources_dir=config.get('paths', 'resources_dir'),
#)
bayes = BayesCompounds(
    resources_dir=config.get('paths', 'resources_dir'),
)


def spool():
    # Walk every initial letter, load that letter's pickled result set, and
    # display the recovered probability data for each sense.
    for letter in string.ascii_lowercase:
        bayes.load_results(letter)
        for s in bayes.results.values():
            s.recover_probabilities()
            # Earlier filtering experiments, kept commented for reference:
            #ad = s.average_delta(total_probability=.95)
            #if s.confidence() >= 7 and s.num_features() < 10:
            # NOTE(review): show_probabilities is not defined in this chunk —
            # presumably defined elsewhere in the file; verify before running.
            show_probabilities(s)


def find_word(word):
    # Load the 'bias_high' result set for the word's initial letter.
    initial = word.lower()[0]
    bayes.load_results(initial, 'bias_high')
    # NOTE(review): visible chunk ends here; the remainder of find_word (the
    # actual lookup/display) is outside this view.
# Name: CheckLevels #------------------------------------------------------------------------------- import os import re import string from collections import defaultdict import lex.oed.thesaurus.thesaurusdb as tdb from lex.oed.projects.thesaurus.classifier.pickler.sensemanager import PickleLoader from lex.oed.projects.thesaurus.classifier.config import ThesaurusConfig config = ThesaurusConfig() training_dir = config.get('paths', 'classified_dir') parent_directories=[ config.get('paths', 'iteration1_dir'), config.get('paths', 'iteration2_dir'), ] def count_training(): counts = {i: 0 for i in range(17)} pl = PickleLoader(training_dir) for sense in pl.iterate(): for n in sense.thesaurus_nodes: thesclass = tdb.get_thesclass(n) counts[thesclass.level] += 1 for i in range(17): print '%d\t%d' % (i, counts[i])
from __future__ import division import os import numpy from lex.oed.projects.thesaurus.classifier.config import ThesaurusConfig from lex.oed.projects.thesaurus.classifier.bayes.classifiers_io import load_classifiers config = ThesaurusConfig() dir = os.path.join(config.get('paths', 'resources_dir'), 'bayes', 'classifiers') def variation(scores): mean = numpy.mean(scores) max_deviation = max([abs(max(scores)-mean), abs(min(scores)-mean)]) return max_deviation / mean prior_probabilities, classifiers = load_classifiers(dir, mode='raw') keywords = [(keyword, variation(scores.values())) for keyword, scores in classifiers.items() if keyword.startswith('T')] # Look for the smallest deviation between average and max/min value keywords.sort(key=lambda k: k[1]) for k in keywords[0:300]: print repr(k[0]), repr(k[1])