def compile_stats(self):
    """
    Count senses in each triage category, and tabulate levels and
    reason codes for the classified senses.
    """
    self.sense_counts = {t: 0 for t in triage}
    self.wordclass = 0
    self.levels = defaultdict(int)
    self.reasons = defaultdict(int)
    for parent_dir in self.directories:
        for t in triage:
            # 'unclassified' output from iteration 1 is skipped, since
            # those senses are re-processed in the next iteration
            if t == 'unclassified' and 'iteration1' in parent_dir:
                continue
            subdir = os.path.join(parent_dir, t)
            for letter in letters:
                pl = PickleLoader(subdir, letters=letter)
                for sense in pl.iterate():
                    self.sense_counts[t] += 1
                    if t == 'classified':
                        self.inspect_classification(sense)

    for t in triage:
        print('%s\t%d' % (t, self.sense_counts[t]))
    for l in sorted(self.levels.keys()):
        print('level %d\t%d' % (l, self.levels[l]))
    print('classified with wordclass: %d' % self.wordclass)
    print('\nREASON CODES:')
    for r in sorted(self.reasons.keys()):
        print('\t%s\t%d' % (r, self.reasons[r]))
def make_raw_index(in_dir):
    """
    For every word forming the second element of a compound, map all
    the places in the thesaurus where its compounds occur.
    """
    store = {wordclass: defaultdict(list) for wordclass in WORDCLASSES}
    for letter in string.ascii_uppercase:
        print('\tCompiling main sense data in %s...' % letter)
        loader = PickleLoader(in_dir, letters=letter)
        for s in loader.iterate():
            if (s.wordclass in WORDCLASSES and
                    s.first_word() and
                    s.last_word()):
                if len(s.last_word()) < 3:
                    # Skip very short last elements
                    pass
                elif s.wordclass == 'JJ' and s.last_word().endswith('ed'):
                    # Skip '-ed' adjective compounds
                    pass
                else:
                    # Each thesaurus node gets a score inversely
                    # proportional to the number of nodes this
                    # sense is linked to
                    target_list = store[s.wordclass][s.last_word()]
                    score = '%0.2f' % (1 / len(s.thesaurus_nodes),)
                    for leaf in s.thesaurus_nodes:
                        target_list.append((leaf, score))

    for wordclass in WORDCLASSES:
        filepath = os.path.join(DIRECTORY, wordclass + '_raw.csv')
        with open(filepath, 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for lemma in sorted(store[wordclass].keys()):
                row = [lemma]
                for node_id, score in store[wordclass][lemma]:
                    row.extend((node_id, score))
                csvwriter.writerow(row)
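# Illustrative only: a minimal sketch (not part of the pipeline) of how
# a row in one of the *_raw.csv files written above could be read back.
# Each row is 'lemma, node1, score1, node2, score2, ...', so trailing
# cells are consumed in pairs. Assumes csv is imported at module level,
# as in the writer above; the function name is hypothetical.
def iter_raw_index(filepath):
    with open(filepath) as filehandle:
        for row in csv.reader(filehandle):
            lemma = row[0]
            # Re-pair the flat (node, score) cells
            pairs = [(row[i], float(row[i + 1]))
                     for i in range(1, len(row), 2)]
            yield lemma, pairs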
def make_raw_index(self):
    """
    Map genera and binomials to the thesaurus leaf nodes (within the
    life branches) where they occur.
    """
    store = {v: defaultdict(list) for v in ('genera', 'binomials')}
    loader = PickleLoader(self.input_dir)
    for s in loader.iterate():
        if s.wordclass == 'NN' and (s.binomials or s.genera):
            for leaf in s.thesaurus_nodes:
                thesclass = tdb.get_thesclass(leaf)
                if any(thesclass.is_descendant_of(id)
                       for id in life_branches):
                    for g in s.genera:
                        store['genera'][g].append(leaf)
                    for b in s.binomials:
                        store['binomials'][b].append(leaf)
                        # Also index the genus element of the binomial,
                        # if it's not already listed as a genus
                        genus = b.split(' ')[0]
                        if genus not in s.genera:
                            store['genera'][genus].append(leaf)

    for k in ('genera', 'binomials'):
        with open(self.raw_files[k], 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for t, vals in store[k].items():
                row = [t]
                row.extend(vals)
                csvwriter.writerow(row)
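# Illustrative only: each row written above is a taxon followed by every
# leaf node it was seen under, duplicates included. A hedged sketch of
# how a consumer might collapse those into frequency counts; the
# function name is hypothetical, and csv is assumed imported at module
# level, as in the writer above.
def count_taxon_leaves(filepath):
    from collections import Counter
    counts = {}
    with open(filepath) as filehandle:
        for row in csv.reader(filehandle):
            counts[row[0]] = Counter(row[1:])
    return counts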
def populate_senses(**kwargs):
    input_dirs = kwargs.get('input')
    out_dir = kwargs.get('out_dir')
    for letter in LETTERS:
        data = defaultdict(list)
        for parent_dir in input_dirs:
            for t in TRIAGE:
                # 'unclassified' output from iteration 1 is skipped,
                # since those senses are re-processed in the next
                # iteration
                if t == 'unclassified' and 'iteration1' in parent_dir:
                    continue
                if t == 'classified':
                    status = '1'
                elif t == 'unclassified':
                    status = '0'
                elif t == 'intractable':
                    status = 'n'
                subdir = os.path.join(parent_dir, t)
                pl = PickleLoader(subdir, letters=letter)
                for sense in pl.iterate():
                    row = _sense_to_row(sense, status)
                    signature = (sense.entry_id, sense.node_id)
                    data[signature].append(row)

        output = []
        for rows in data.values():
            # Where there are multiple rows for a single sense,
            # we compare them to decide which are worth keeping
            if len(rows) > 1:
                rows = _compare_cloned_senses(rows)
            for row in rows:
                # Change clone_num to True/False
                row[-1] = (row[-1] != 0)
                # Append to the output that's going to be committed
                # to the database
                output.append(row)

        outfile = os.path.join(out_dir, letter + '.json')
        with open(outfile, 'w') as filehandle:
            for row in output:
                record = {fieldname: value for fieldname, value in
                          zip(FIELDS['sense'], row)}
                filehandle.write(json.dumps(record))
                filehandle.write('\n')
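# Illustrative only: each letter's output file is JSON-lines (one sense
# record per line, keyed by FIELDS['sense']). A minimal sketch of how it
# could be read back; the function name is hypothetical, and json is
# assumed imported at module level, as in the writer above.
def iter_sense_records(filepath):
    with open(filepath) as filehandle:
        for line in filehandle:
            yield json.loads(line)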
def compile_index(self):
    """
    Build the superordinate index from noun and verb senses.
    """
    self.data = defaultdict(lambda: defaultdict(list))
    for letter in string.ascii_uppercase:
        print('\tIndexing superordinates in %s...' % letter)
        loader = PickleLoader(self.input_dir, letters=letter)
        for sense in loader.iterate():
            if (sense.wordclass in ('NN', 'VB') and
                    sense.superordinate is not None):
                self._process_superordinate(sense.superordinate,
                                            sense.thesaurus_nodes)
                # Index the full form of the superordinate too,
                # if it differs from the short form
                if sense.superordinate != sense.superordinate_full:
                    self._process_superordinate(sense.superordinate_full,
                                                sense.thesaurus_nodes)
    self._write_raw_index()
def update(self):
    for letter in letters:
        buffer = []
        pl = PickleLoader(self.input_dir, letters=letter)
        for sense in pl.iterate():
            if sense.definition is None:
                # Don't bother with undefined lemmas
                continue
            instances = tdb.search(refentry=sense.entry_id,
                                   refid=sense.node_id)
            try:
                instance = instances[0]
            except IndexError:
                pass
            else:
                buffer.append((instance, sense.class_id))
            # Commit links in batches, to keep each database
            # write manageable
            if len(buffer) > 1000:
                tdb.add_links(buffer)
                buffer = []
        # Commit whatever is left over for this letter
        tdb.add_links(buffer)
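# Illustrative only: the batching pattern used in update(), factored out
# as a hedged, generic sketch. Names here are hypothetical; the point is
# that items accumulate in memory and are committed in chunks, so no
# single database write grows too large.
def flush_in_batches(items, commit, batch_size=1000):
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) >= batch_size:
            commit(buffer)
            buffer = []
    if buffer:
        # Commit the final partial batch
        commit(buffer)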
def make_raw_index(input_dir):
    """
    Compile the raw compound index.
    """
    store = {wordclass: defaultdict(list) for wordclass in WORDCLASSES}
    for letter in string.ascii_uppercase:
        print('\tIndexing compound elements in %s...' % letter)
        loader = PickleLoader(input_dir, letters=letter)
        for s in loader.iterate():
            if (s.wordclass in WORDCLASSES and
                    s.first_word() is not None and
                    s.last_word() is not None):
                first = s.first_word()
                last = s.last_word()
                if first in ('non', 'anti', 'to'):
                    # Skip prefix-like first elements
                    continue
                if len(last) >= 3:
                    last = LIGHT_STEMMER.edit(last.lower())
                    for leaf in s.thesaurus_nodes:
                        store[s.wordclass][last].append(leaf)
                if len(first) >= 3:
                    first = LIGHT_STEMMER.edit(first.lower())
                    for leaf in s.thesaurus_nodes:
                        store['first'][first].append(leaf)

    for wordclass in compoundindexerconfig.WORDCLASSES:
        filepath = os.path.join(OUTPUT_DIR, wordclass + '_raw.csv')
        with open(filepath, 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for lemma, vals in sorted(store[wordclass].items()):
                # Skip first elements that occur only once
                if wordclass == 'first' and len(vals) == 1:
                    continue
                row = [lemma]
                row.extend(vals)
                csvwriter.writerow(row)
def classify(self):
    running_totals = {t: 0 for t in triage}
    for letter in letters:
        print('\tClassifying %s (Iteration #%d)...' %
              (letter, self.iteration))

        # Load Bayes evaluations for all the senses in this letter
        for name, manager in self.bayes.items():
            if name == 'main':
                manager.load_results(letter)
            else:
                manager.load_results(letter, name)

        if self.mode == 'test':
            # Open file for tracing how compounds get classified
            trace_file = os.path.join(self.resources_dir, 'compounds',
                                      'trace', letter + '.txt')
            self.compound_tracer = open(trace_file, 'w')

        self.buffer = {t: [] for t in triage}
        self.main_sense_of_entry = None
        self.previous_entry_id = 0

        loader = PickleLoader(self.input_dir, letters=letter)
        for sense in loader.iterate():
            # Determine whether this sense is considered tractable
            intractable = sense.is_intractable()

            # Plug in any results for this sense previously
            # obtained by the Bayes classifiers
            sense.bayes = BayesManager()
            for name, manager in self.bayes.items():
                result = manager.seek_sense(sense.entry_id, sense.node_id)
                sense.bayes.insert(name, result)

            # Main classification process
            # (we only bother if it's a tractable sense)
            if not intractable:
                selected_class, runners_up = self._core_classifier(sense)
            else:
                selected_class, runners_up = (None, [])

            # Store the top Bayes classification for this sense
            try:
                bayes_classification = sense.bayes.ids()[0]
            except IndexError:
                bayes_classification = None
            bayes_confidence = sense.bayes.confidence()

            # Strip out any temporary attributes added to the sense
            # as part of the classifier's work
            # (this saves space when re-pickling the sense)...
            sense.strip_attributes()
            # ...then add back the Bayes classification + confidence
            sense.bayes_classification = bayes_classification
            sense.bayes_confidence = bayes_confidence

            # Store the result in the relevant buffer, and increment
            # the running totals
            if intractable:
                self.buffer['intractable'].append(sense)
                running_totals['intractable'] += 1
            elif selected_class is None:
                self.buffer['unclassified'].append(sense)
                running_totals['unclassified'] += 1
            else:
                sense.class_id = selected_class.id
                sense.reason_text = selected_class.reason_text
                sense.reason_code = selected_class.reason_code
                sense.runners_up = runners_up
                self.buffer['classified'].append(sense)
                running_totals['classified'] += 1

            # Update previous_entry_id with the current sense's
            # entry ID, so that on the next iteration we can check
            # whether the parent entry has changed
            self.previous_entry_id = sense.entry_id

        print('\t\t%s' % self._running_score(running_totals))
        if self.mode != 'test':
            self.flush_buffer(letter)
        if self.mode == 'test':
            self.compound_tracer.close()