Example #1
    def compile_stats(self):
        self.sense_counts = {t: 0 for t in triage}
        self.wordclass = 0
        self.levels = defaultdict(int)
        self.reasons = defaultdict(int)
        for parent_dir in self.directories:
            for t in triage:
                # Skip unclassified senses from the first iteration
                if t == 'unclassified' and 'iteration1' in parent_dir:
                    continue
                subdir = os.path.join(parent_dir, t)
                for letter in letters:
                    pl = PickleLoader(subdir, letters=letter)
                    for sense in pl.iterate():
                        self.sense_counts[t] += 1
                        if t == 'classified':
                            self.inspect_classification(sense)

        for t in triage:
            print('%s\t%d' % (t, self.sense_counts[t]))
        for level in sorted(self.levels.keys()):
            print('level %d\t%d' % (level, self.levels[level]))
        print('classified with wordclass: %d' % self.wordclass)

        print('\nREASON CODES:')
        for r in sorted(self.reasons.keys()):
            print('\t%s\t%d' % (r, self.reasons[r]))
Example #2
import csv
import os
import string
from collections import defaultdict

# PickleLoader, WORDCLASSES, and DIRECTORY are project-specific names
# assumed to be imported or defined elsewhere.

def make_raw_index(in_dir):
    """
    For every word forming the second element of a compound, map all
    the places in the thesaurus where its compounds occur.
    """
    store = {wordclass: defaultdict(list) for wordclass in WORDCLASSES}
    for letter in string.ascii_uppercase:
        print('\tCompiling main sense data in %s...' % letter)
        loader = PickleLoader(in_dir, letters=letter)
        for s in loader.iterate():
            if (s.wordclass in WORDCLASSES and
                    s.first_word() and
                    s.last_word()):
                last = s.last_word()
                # Skip very short last elements, and 'JJ' senses whose
                #  last element looks like a past participle
                if len(last) < 3:
                    continue
                if s.wordclass == 'JJ' and last.endswith('ed'):
                    continue
                target_list = store[s.wordclass][last]
                # Split the score evenly across the sense's thesaurus nodes
                score = '%0.2f' % (1 / len(s.thesaurus_nodes),)
                for leaf in s.thesaurus_nodes:
                    target_list.append((leaf, score))

    for wordclass in WORDCLASSES:
        filepath = os.path.join(DIRECTORY, wordclass + '_raw.csv')
        with open(filepath, 'w', newline='') as filehandle:
            csvwriter = csv.writer(filehandle)
            for lemma in sorted(store[wordclass]):
                row = [lemma]
                for node_id, score in store[wordclass][lemma]:
                    row.extend((node_id, score))
                csvwriter.writerow(row)
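
Each row written above has the shape lemma, id1, score1, id2, score2, ...; the file can be read back into a dict keyed by lemma. A minimal sketch, assuming that row layout (load_raw_index() is an illustrative helper, not part of the original code):

import csv

def load_raw_index(filepath):
    """Rebuild {lemma: [(node_id, score), ...]} from a *_raw.csv file."""
    index = {}
    with open(filepath, newline='') as filehandle:
        for row in csv.reader(filehandle):
            # Re-pair the flattened (id, score, id, score, ...) tail
            index[row[0]] = list(zip(row[1::2], row[2::2]))
    return index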
Example #3
    def make_raw_index(self):
        store = {v: defaultdict(list) for v in ('genera', 'binomials')}
        loader = PickleLoader(self.input_dir)
        for s in loader.iterate():
            if (s.wordclass == 'NN' and
                    (s.binomials or s.genera)):
                for leaf in s.thesaurus_nodes:
                    thesclass = tdb.get_thesclass(leaf)
                    if any(thesclass.is_descendant_of(branch_id)
                           for branch_id in life_branches):
                        for g in s.genera:
                            store['genera'][g].append(leaf)
                        for b in s.binomials:
                            store['binomials'][b].append(leaf)
                            # Index the genus element of the binomial too,
                            #  if it's not already covered by s.genera
                            genus = b.split(' ')[0]
                            if genus not in s.genera:
                                store['genera'][genus].append(leaf)

        for k in ('genera', 'binomials'):
            with open(self.raw_files[k], 'w', newline='') as filehandle:
                csvwriter = csv.writer(filehandle)
                for t, vals in store[k].items():
                    row = [t]
                    row.extend(vals)
                    csvwriter.writerow(row)
Example #4
import json
import os
from collections import defaultdict

# PickleLoader, LETTERS, TRIAGE, FIELDS, _sense_to_row(), and
# _compare_cloned_senses() are project-specific names assumed to be
# imported or defined elsewhere.

def populate_senses(**kwargs):
    input_dirs = kwargs.get('input')
    out_dir = kwargs.get('out_dir')

    for letter in LETTERS:
        data = defaultdict(list)
        for parent_dir in input_dirs:
            for t in TRIAGE:
                if t == 'unclassified' and 'iteration1' in parent_dir:
                    continue

                if t == 'classified':
                    status = '1'
                elif t == 'unclassified':
                    status = '0'
                elif t == 'intractable':
                    status = 'n'

                subdir = os.path.join(parent_dir, t)
                pl = PickleLoader(subdir, letters=letter)
                for sense in pl.iterate():
                    row = _sense_to_row(sense, status)
                    signature = (sense.entry_id, sense.node_id,)
                    data[signature].append(row)

        output = []
        for rows in data.values():
            # Where there are multiple rows for a single sense,
            #  we compare them to decide which are worth keeping
            if len(rows) > 1:
                rows = _compare_cloned_senses(rows)
            # Coerce clone_num (the last field) to True/False
            for row in rows:
                row[-1] = bool(row[-1])
            # Append to the output that's going to be committed to
            #  the database
            output.extend(rows)

        outfile = os.path.join(out_dir, letter + '.json')
        with open(outfile, 'w') as filehandle:
            for row in output:
                record = {fieldname: value for fieldname, value in
                          zip(FIELDS['sense'], row)}
                filehandle.write(json.dumps(record))
                filehandle.write('\n')
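
Because populate_senses() writes one JSON object per line, the per-letter output files can be read back with a simple line-by-line loop. A minimal sketch (load_senses() is an illustrative helper, not part of the original code):

import json

def load_senses(filepath):
    """Read back a JSON-lines file written by populate_senses()."""
    with open(filepath) as filehandle:
        return [json.loads(line) for line in filehandle if line.strip()]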
Example #5
    def compile_index(self):
        self.data = defaultdict(lambda: defaultdict(list))
        letters = string.ascii_uppercase
        for letter in letters:
            print('\tIndexing superordinates in %s...' % letter)
            loader = PickleLoader(self.input_dir, letters=letter)
            for sense in loader.iterate():
                if (sense.wordclass in ('NN', 'VB') and
                        sense.superordinate is not None):
                    self._process_superordinate(sense.superordinate,
                                                sense.thesaurus_nodes)
                    if sense.superordinate != sense.superordinate_full:
                        self._process_superordinate(sense.superordinate_full,
                                                    sense.thesaurus_nodes)
        self._write_raw_index()
Example #6
    def update(self):
        for letter in letters:
            buffer = []
            pl = PickleLoader(self.input_dir, letters=letter)
            for sense in pl.iterate():
                # Don't bother with undefined lemmas
                if sense.definition is None:
                    continue
                instances = tdb.search(refentry=sense.entry_id,
                                       refid=sense.node_id)
                try:
                    instance = instances[0]
                except IndexError:
                    continue
                buffer.append((instance, sense.class_id))
                # Commit links in batches once the buffer exceeds 1000
                if len(buffer) > 1000:
                    tdb.add_links(buffer)
                    buffer = []
            # Commit anything left over for this letter
            tdb.add_links(buffer)
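
The buffer-and-flush pattern in update() keeps only around 1000 pending links in memory before each database commit. The same idea can be factored into a reusable generator; a minimal sketch (batched() is illustrative, not part of the original code):

def batched(iterable, batch_size=1000):
    """Yield successive lists of up to batch_size items."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

# Usage with the code above (pairs being an iterable of
# (instance, class_id) tuples):
#   for chunk in batched(pairs, 1000):
#       tdb.add_links(chunk)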
Example #7
import csv
import os
import string
from collections import defaultdict

# PickleLoader, LIGHT_STEMMER, WORDCLASSES, and OUTPUT_DIR are
# project-specific names assumed to be imported or defined elsewhere.

def make_raw_index(input_dir):
    """
    Compile the raw compound index.
    """
    store = {wordclass: defaultdict(list) for wordclass in WORDCLASSES}
    for letter in string.ascii_uppercase:
        print('\tIndexing compound elements in %s...' % letter)
        loader = PickleLoader(input_dir, letters=letter)
        for s in loader.iterate():
            if (s.wordclass in WORDCLASSES and
                    s.first_word() is not None and
                    s.last_word() is not None):
                first = s.first_word()
                last = s.last_word()
                # Skip prefix-like first elements
                if first in ('non', 'anti', 'to'):
                    continue
                if len(last) >= 3:
                    last = LIGHT_STEMMER.edit(last.lower())
                    for leaf in s.thesaurus_nodes:
                        store[s.wordclass][last].append(leaf)
                if len(first) >= 3:
                    first = LIGHT_STEMMER.edit(first.lower())
                    for leaf in s.thesaurus_nodes:
                        store['first'][first].append(leaf)

    for wordclass in WORDCLASSES:
        filepath = os.path.join(OUTPUT_DIR, wordclass + '_raw.csv')
        with open(filepath, 'w', newline='') as filehandle:
            csvwriter = csv.writer(filehandle)
            for lemma, vals in sorted(store[wordclass].items()):
                # Skip 'first' elements that occur only once
                if wordclass == 'first' and len(vals) == 1:
                    continue
                row = [lemma]
                row.extend(vals)
                csvwriter.writerow(row)
Example #8
    def classify(self):
        running_totals = {t: 0 for t in triage}
        for letter in letters:
            print('\tClassifying %s (Iteration #%d)...' % (letter, self.iteration))

            # Load Bayes evaluations for all the senses in this letter
            for name, manager in self.bayes.items():
                if name == 'main':
                    manager.load_results(letter)
                else:
                    manager.load_results(letter, name)

            if self.mode == 'test':
                # Open file for tracing how compounds get classified
                trace_file = os.path.join(self.resources_dir,
                                          'compounds',
                                          'trace',
                                          letter + '.txt')
                self.compound_tracer = open(trace_file, 'w')

            self.buffer = {t: [] for t in triage}
            self.main_sense_of_entry = None
            self.previous_entry_id = 0
            loader = PickleLoader(self.input_dir, letters=letter)

            for sense in loader.iterate():
                # Determine whether this sense is considered tractable
                intractable = sense.is_intractable()

                # Plug in any results for this sense previously
                #  obtained by the Bayes classifiers
                sense.bayes = BayesManager()
                for name, manager in self.bayes.items():
                    result = manager.seek_sense(sense.entry_id, sense.node_id)
                    sense.bayes.insert(name, result)

                # Main classification process
                #  (We only bother if it's a tractable sense)
                if not intractable:
                    selected_class, runners_up = self._core_classifier(sense)
                else:
                    selected_class, runners_up = (None, [])

                # Store the top Bayes classification for this sense
                try:
                    bayes_classification = sense.bayes.ids()[0]
                except IndexError:
                    bayes_classification = None
                bayes_confidence = sense.bayes.confidence()

                # Strip out any temporary attributes added to the sense
                #  as part of the classifier's work.
                #  (This saves space when re-pickling the sense)
                sense.strip_attributes()
                # ...then add back Bayes classification + confidence
                sense.bayes_classification = bayes_classification
                sense.bayes_confidence = bayes_confidence

                # Store result in the relevant buffer, and increment
                #  running totals
                if intractable:
                    self.buffer['intractable'].append(sense)
                    running_totals['intractable'] += 1
                elif selected_class is None:
                    self.buffer['unclassified'].append(sense)
                    running_totals['unclassified'] += 1
                else:
                    sense.class_id = selected_class.id
                    sense.reason_text = selected_class.reason_text
                    sense.reason_code = selected_class.reason_code
                    sense.runners_up = runners_up
                    self.buffer['classified'].append(sense)
                    running_totals['classified'] += 1

                # Update previous_entry with the current sense's
                #  entry ID - so that on the next iteration we can check
                #  if the parent entry has changed.
                self.previous_entry_id = sense.entry_id

            print('\t\t%s' % self._running_score(running_totals))
            if self.mode != 'test':
                self.flush_buffer(letter)

            if self.mode == 'test':
                self.compound_tracer.close()