Example #1
def refine_by_binomial(candidate_classifications, qbinomials):
    """
    Refine classification using binomials found in quotations.
    """
    def test_descent(current_class, possible_refinements):
        new_class = None
        for binomial_class in possible_refinements:
            if (binomial_class != current_class and
                    binomial_class.is_descendant_of(current_class)):
                new_class = binomial_class
                break
        if new_class is not None:
            new_class.reason_code = current_class.reason_code
            new_class.reason_text = current_class.reason_text
            return new_class
        else:
            return current_class

    binomial_ids = {binomial_checker.find_class(b) for b in qbinomials}
    binomials = [tdb.get_thesclass(class_id) for class_id in binomial_ids
                 if class_id is not None]
    if binomials:
        candidate_classifications = [test_descent(t, binomials) for t in
                                     candidate_classifications]
    return candidate_classifications
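A minimal usage sketch for refine_by_binomial() (illustrative only: it assumes
the candidate class already carries reason_code/reason_text attributes, and
that 'Quercus robur' is indexed by binomial_checker):

# Hypothetical call; 22501 is the 'plant' branch used elsewhere in this module
candidate = tdb.get_thesclass(22501)
candidate.reason_code, candidate.reason_text = 'txny', 'seed classification'
refined = refine_by_binomial([candidate], {'Quercus robur'})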
Example #2
    def make_raw_index(self):
        store = {v: defaultdict(list) for v in ('genera', 'binomials')}
        loader = PickleLoader(self.input_dir)
        for s in loader.iterate():
            if (s.wordclass == 'NN' and
                    (s.binomials or s.genera)):
                for leaf in s.thesaurus_nodes:
                    thesclass = tdb.get_thesclass(leaf)
                    if any([thesclass.is_descendant_of(id) for id in
                            life_branches]):
                        for g in s.genera:
                            store['genera'][g].append(leaf)
                        for b in s.binomials:
                            store['binomials'][b].append(leaf)
                            genus = b.split(' ')[0]
                            if genus not in s.genera:
                                store['genera'][genus].append(leaf)

        for k in ('genera', 'binomials'):
            with open(self.raw_files[k], 'w') as filehandle:
                csvwriter = csv.writer(filehandle)
                for t, vals in store[k].items():
                    row = [t]
                    row.extend(vals)
                    csvwriter.writerow(row)
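Example #3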
def winnow(superordinate, values):
    # Convert thesaurus IDs stored in the raw files to actual thesaurus classes
    thesclasses = [(tdb.get_thesclass(v[0]), v[1]) for v in values]
    if superordinate.endswith('VB'):
        thesclasses = [t for t in thesclasses if t[0].penn_wordclass() == 'VB']
    else:
        thesclasses = [t for t in thesclasses if t[0].penn_wordclass() == 'NN']

    # If there's only one sense, we can short-cut the winnowing process
    if len(thesclasses) <= 1:
        return [(t[0], 1) for t in thesclasses]

    # Use the hopper function to winnow out classes that contain less than
    #  25% of the total number of senses
    total = sum([t[1] for t in thesclasses])
    winnowed = _hopper(thesclasses, total, 0.25)
    winnowed.sort(key=lambda a: a[1], reverse=True)
    winnowed.sort(key=lambda a: a[2], reverse=True)

    # Strip out classes that are just parents of other classes
    parents = {t[0].parent.id: 0 for t in winnowed}
    # Keep a tally of how many of the parent's senses are covered
    #   by its children
    for t in winnowed:
        parents[t[0].parent.id] += t[1]
    # Remove a class if it's a parent of other classes and its total tally
    #  of senses is not much more than the sum of its children's (we use
    #  a margin of 1.3 to allow for some wastage through child classes that
    #  have been skipped)
    parent_stripped = [t for t in winnowed if t[0].id not in parents or
                       t[1] > parents[t[0].id] * 1.3]

    return [(t[0], t[1] / total) for t in parent_stripped]
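Example #4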
    def sample_to_csv(self, name, size, function):
        self.collect_sample(name, size, function)
        out_file = os.path.join(self.out_dir, name + '.csv')
        with open(out_file, 'wb') as fh:
            csvwriter = csv.writer(fh)
            csvwriter.writerow(columns)
            for sense in self.sample:
                if sense.definition is not None:
                    definition = sense.definition[:200]
                    if definition.startswith('='):
                        definition = '.' + definition
                else:
                    definition = '[undefined]'
                thesclass = tdb.get_thesclass(sense.class_id)
                if thesclass.wordclass is None:
                    wordclass_level = 'n'
                else:
                    wordclass_level = 'y'
                row = (
                    sense.lemma.encode('utf8'),
                    sense.wordclass,
                    definition.encode('utf8'),
                    thesclass.id,
                    thesclass.breadcrumb().encode('utf8'),
                    wordclass_level,
                    '',
                    sense.oed_url(),
                    thesclass.oed_url(),
                    sense.reason_code,
                )
                csvwriter.writerow(row)
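Example #5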
def refine_index():
    """
    For the lists of thesaurus nodes generated by make_raw_index(), try to
    pin down to branches where there are particular clusters (throwing
    away outliers, etc.).
    """
    for wordclass in WORDCLASSES:
        lemmas = []
        filepath = os.path.join(DIRECTORY, wordclass + '_raw.csv')
        with open(filepath, 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                lemma = row[0]
                values = row[1:]
                ids = [int(id) for id in values[::2]]
                scores = [float(s) for s in values[1::2]]
                if sum(scores) >= 4:
                    idmap = defaultdict(int)
                    for id, score in zip(ids, scores):
                        idmap[id] += score
                    lemmas.append((lemma, Counter(idmap).most_common()))

        store = []
        for lemma, idcounter in lemmas:
            classes = [(tdb.get_thesclass(id), score)
                       for id, score in idcounter]
            total_score = sum([c[1] for c in classes])
            ancestors = defaultdict(int)
            for thesclass, score in classes:
                a = thesclass.ancestor(level=3)
                if a:
                    ancestors[a] += score
            l3_ancestors = Counter(ancestors).most_common()

            if (l3_ancestors and
                    l3_ancestors[0][1] > total_score * 0.3 and
                    (len(l3_ancestors) == 1 or
                    l3_ancestors[0][1] > l3_ancestors[1][1] * 1.3)):
                parent_branch = l3_ancestors[0][0]
                ancestors = defaultdict(int)
                for thesclass, score in classes:
                    a = thesclass.ancestor(level=4)
                    if a and a.is_descendant_of(parent_branch):
                        ancestors[a] += score
                l4_ancestors = Counter(ancestors).most_common()
                if l4_ancestors and l4_ancestors[0][1] > total_score * 0.3:
                    target = l4_ancestors[0][0]
                else:
                    target = parent_branch
                store.append((lemma, target))

        outfile = os.path.join(DIRECTORY, '%s_compounds.csv' % wordclass)
        with open(outfile, 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for lemma, thesclass in store:
                row = (lemma, thesclass.id, thesclass.breadcrumb())
                csvwriter.writerow(row)
Example #6
def count_training():
    counts = {i: 0 for i in range(17)}
    pl = PickleLoader(training_dir)
    for sense in pl.iterate():
        for n in sense.thesaurus_nodes:
            thesclass = tdb.get_thesclass(n)
            counts[thesclass.level] += 1
    for i in range(17):
        print '%d\t%d' % (i, counts[i])
Example #7
def compare_binomials(sense):
    if sense.wordclass != 'NN':
        return None
    match = None
    for b in sense.binomials:
        class_id = binomial_checker.find_class(b)
        if class_id is not None:
            match = tdb.get_thesclass(class_id)
    if match is None:
        for g in sense.genera:
            class_id = binomial_checker.find_class(g)
            if class_id is not None:
                match = tdb.get_thesclass(class_id)
    if match is not None:
        match.reason_text = 'Taxonomic name: %s' % ', '.join(
            sense.binomials.union(sense.genera))
        match.reason_code = 'txny'
        return match
    else:
        return None
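Example #8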
def finalize():
    """
    Use the 'manual' file to override where necessary
    """
    for wordclass in WORDCLASSES:
        lemmas = {}
        infile1 = os.path.join(DIRECTORY, '%s_compounds.csv' % wordclass)
        infile2 = os.path.join(DIRECTORY, '%s_manual.csv' % wordclass)
        with open(infile1, 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                lemmas[row[0]] = int(row[1])
        # Do the manual file second, so that it overrides the
        #  automatically-generated file
        with open(infile2, 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                lemmas[row[0]] = int(row[1])

        output = []
        for lemma, class_id in lemmas.items():
            # Retrieve the branch that the majority of compounds are on
            compound_branch = tdb.get_thesclass(class_id)

            # Get the highest-rated senses for the lemma
            ranked_senses = tdb.ranked_search(lemma=lemma, wordclass=wordclass)
            if ranked_senses:
                max_rating = ranked_senses[0].rating()
                ranked_senses = [s for s in ranked_senses if
                                 max_rating > 0 and
                                 s.rating() > max_rating * 0.3]

            # Try filtering to just those senses that are on
            #   the same branch as the compounds
            ranked_filtered = [s for i, s in enumerate(ranked_senses) if
                               (i == 0 and s.thesclass is None) or
                               s.is_descendant_of(compound_branch)]
            # ... or else stick with original ranking
            if not ranked_filtered:
                ranked_filtered = ranked_senses

            if ranked_filtered:
                output.append(ranked_filtered[0])

        outfile = os.path.join(DIRECTORY, '%s.csv' % wordclass)
        output.sort(key=lambda s: s.lemma)
        with open(outfile, 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for s in output:
                row = (s.lemma, s.refentry, s.refid,
                       s.entry_size, s.breadcrumb())
                csvwriter.writerow(row)
Example #9
def _bayes_mismatch(sense):
    try:
        sense.bayes_classification
    except AttributeError:
        return False
    try:
        sense.class_id
    except AttributeError:
        return False

    if (sense.class_id is None or
            sense.bayes_classification is None or
            sense.bayes_confidence <= 3):
        return False

    selected_class = tdb.get_thesclass(sense.class_id)
    bayes_class = tdb.get_thesclass(sense.bayes_classification)
    if bayes_class.level > 3:
        bayes_class = bayes_class.ancestor(level=3)
    if selected_class.is_descendant_of(bayes_class):
        return False
    else:
        return True
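The try/except probing of optional sense attributes recurs throughout these
examples; an equivalent formulation using getattr() defaults (a sketch, not
the project's code) would be:

def _bayes_mismatch_alt(sense):
    # Probe optional attributes with getattr() defaults instead of
    # try/except AttributeError (behaviour matches the version above,
    # except that a missing bayes_confidence defaults to 0)
    class_id = getattr(sense, 'class_id', None)
    bayes_id = getattr(sense, 'bayes_classification', None)
    if (class_id is None or bayes_id is None or
            getattr(sense, 'bayes_confidence', 0) <= 3):
        return False
    selected_class = tdb.get_thesclass(class_id)
    bayes_class = tdb.get_thesclass(bayes_id)
    if bayes_class.level > 3:
        bayes_class = bayes_class.ancestor(level=3)
    return not selected_class.is_descendant_of(bayes_class)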
Example #10
def count_classified():
    counts = {i: 0 for i in range(17)}
    for p in parent_directories:
        subdir = os.path.join(p, 'classified')
        pl = PickleLoader(subdir)
        for sense in pl.iterate():
            try:
                sense.class_id
            except AttributeError:
                pass
            else:
                thesclass = tdb.get_thesclass(sense.class_id)
                counts[thesclass.level] += 1
    for i in range(17):
        print '%d\t%d' % (i, counts[i])
Example #11
    def inspect_classification(self, sense):
        if sense.class_id not in Statistics.cache:
            thesclass = tdb.get_thesclass(sense.class_id)
            Statistics.cache[sense.class_id] = {'l': thesclass.level, 'w': False}
            if thesclass.wordclass is not None:
                Statistics.cache[sense.class_id]['w'] = True
        self.levels[Statistics.cache[sense.class_id]['l']] += 1
        if Statistics.cache[sense.class_id]['w']:
            self.wordclass += 1

        try:
            sense.reason_code
        except AttributeError:
            pass
        else:
            self.reasons[sense.reason_code] += 1
Example #12
def drilldown(vals):
    thesclasses = [tdb.get_thesclass(v) for v in vals]
    thesclasses = [t for t in thesclasses if t.wordclass == 'NN' or
                   t.wordclass == 'noun']
    branch = living_world_node
    for lev in (4, 5, 6, 7, 8, 9):
        level_ancestors = [t.ancestor(level=lev) for t in thesclasses]
        level_ancestors = [a for a in level_ancestors if a is not None and
                           a.is_descendant_of(branch)]
        if level_ancestors:
            histogram = Counter(level_ancestors).most_common()
            most_common = [t[0] for t in histogram if t[1] == histogram[0][1]]
            if len(most_common) > 1:
                break
            branch = most_common[0]
            if branch.wordclass is not None:
                break
        else:
            break
    return branch
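A sketch of how drilldown() narrows its result (the class IDs below are
placeholders for noun classes under the living-world branch): with two senses
on one branch and one on another, the majority branch wins at each level, and
the descent stops at the first tie or at a wordclass-level node.

branch = drilldown([22501, 22501, 29205])  # placeholder class IDs
print branch.breadcrumb()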
Example #13
    def refine_binomial_index(self):
        # Load the genus terms and their branch
        genera = {}
        with open(self.clean_files['genera'], 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                genera[row[0]] = int(row[1])

        # load the raw binomials data
        binomials = []
        with open(self.raw_files['binomials'], 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                b = row.pop(0)
                ids = [int(id) for id in row]
                binomials.append((b, ids))

        # Trim down to just those thesaurus classes that are inside the genus
        # term's branch
        binomials2 = []
        for b in binomials:
            binomial = b[0]
            genus = binomial.split(' ')[0]
            thesclasses = [tdb.get_thesclass(v) for v in b[1]]
            if genus in genera:
                thesclasses = [t for t in thesclasses
                               if t.is_descendant_of(genera[genus])]
            # Of the remainder, pick the largest branch
            if thesclasses:
                histogram = Counter(thesclasses).most_common()
                most_common = [t[0] for t in histogram if t[1] == histogram[0][1]]
                most_common.sort(key=lambda t: t.branch_size, reverse=True)
                binomials2.append((binomial, most_common[0]))

        with open(self.clean_files['binomials'], 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for t, v in binomials2:
                row = [t, v.id, v.breadcrumb()]
                csvwriter.writerow(row)
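Example #14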
import lex.oed.thesaurus.thesaurusdb as tdb

from resources.mainsense.mainsense import MainSense
from ..indexer.compoundindexretriever import retrieve_from_compound_index
from ..bayes.computebayesconsensus import compute_bayes_consensus
from .computebestguesses import compute_best_guesses
from classifyengine.rankedsensesummary import ranked_sense_summary

WORDCLASSES = ('NN', 'JJ', 'RB', 'first')
MAIN_SENSE_FINDER = MainSense()

# Living world, abstract properties, relative properties - dangerous
#  classes since very vague and miscellaneous
DANGER_BRANCHES = {8835, 82596, 111290}

PARASYN_ENDINGS = {word: tdb.get_thesclass(class_id)
                   for word, class_id in (('shaped', 98385),
                                          ('colour', 81487),
                                          ('coloured', 81487))}

SIMILATIVE = {'like', 'wise', 'based', 'containing', 'form', 'formed', 'free'}

# Don't attempt compounds where either word is one of these:
STOPWORDS = {'of', 'a', 'an', 'in', 'to', 'the', 'by', 'for', 'less'}


def formal_compound_analysis(sense, entry_main_sense):
    """
    Figure out a likely thesaurus class based on the form of a
    two-part compound lemma.
    """
Example #15
    def node(self):
        try:
            return self._node
        except AttributeError:
            self._node = tdb.get_thesclass(self.id)
            return self._node
Example #16
    def exact_node(self):
        try:
            return self._exact_node
        except AttributeError:
            self._exact_node = tdb.get_thesclass(self.exact_id)
            return self._exact_node
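node() and exact_node() share the same lazy-caching idiom: compute on first
access, store the result on the instance, and return the stored value on every
later call. The idiom generalizes to a small decorator (a sketch; Python 3.8+
offers functools.cached_property for the same purpose):

def cached(fn):
    # Memoize a zero-argument method on its instance (sketch only)
    attr = '_' + fn.__name__
    def wrapper(self):
        try:
            return getattr(self, attr)
        except AttributeError:
            setattr(self, attr, fn(self))
            return getattr(self, attr)
    return wrapper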
Example #17
def trace_sense(sense):
    lines = ['--------------------------------------------------',]
    lines.append('"%s" %s  (%d#eid%d)' % (sense.lemma, sense.wordclass,
        sense.entry_id, sense.node_id))
    lines.append('"%s"' % sense.definition)
    lines.append('"%s"' % sense.gloss)
    if sense.subjects:
        lines.append('subjects: ' + ', '.join(['"%s"' % s for s in sense.subjects]))
    if sense.etyma:
        lines.append('etyma: ' + ', '.join(['"%s"' % e[0] for e in sense.etyma]))

    try:
        sense.superordinate
    except AttributeError:
        pass
    else:
        if sense.superordinate:
            lines.append('superordinate: %s  (%s)' % (
                sense.superordinate, sense.superordinate_full))

    try:
        sense.synonyms
    except AttributeError:
        pass
    else:
        if sense.synonyms:
            lines.append('synonyms: ' + ', '.join(['"%s"' % s
                for s in sense.synonyms]))

    try:
        sense.noun_phrases
    except AttributeError:
        pass
    else:
        if sense.noun_phrases:
            lines.append('NPs: ' + ', '.join(['"%s"' % np for np in
                sense.noun_phrases]))

    try:
        sense.bayes
    except AttributeError:
        pass
    else:
        for thesclass in sense.bayes.branches(max_delta=0.3):
            lines.append('Bayes: %s' % thesclass.breadcrumb())

    try:
        sense.class_id
    except AttributeError:
        pass
    else:
        thesclass = tdb.get_thesclass(sense.class_id)
        lines.append('>>>')
        lines.append(trace_class(thesclass))

        try:
            sense.reason_code
        except AttributeError:
            pass
        else:
            if sense.reason_code is not None:
                lines.append('Reason code: %s' % sense.reason_code)

        try:
            sense.reason_text
        except AttributeError:
            pass
        else:
            if sense.reason_text is not None:
                lines.append('Reason: %s' % sense.reason_text)

    lines = [simples.sub('?', line) for line in lines]
    return '\n\t'.join(lines)
Example #18
def _sense_to_row(sense, status):
    if sense.definition is None:
        undefined = True
        definition = None
    else:
        undefined = False
        definition = sense.definition[:200]

    if sense.definition_supplement:
        definition_supplement = sense.definition_supplement[:150]
    else:
        definition_supplement = None

    try:
        reasoncode = sense.reason_code
    except AttributeError:
        reasoncode = None
    try:
        reasontext = sense.reason_text[:200]
    except (AttributeError, TypeError):
        reasontext = None

    try:
        thesclass1_id = sense.class_id
    except AttributeError:
        thesclass1_id = None
    try:
        thesclass2_id = sense.runners_up[0]
    except (AttributeError, IndexError):
        thesclass2_id = None
    try:
        thesclass3_id = sense.runners_up[1]
    except (AttributeError, IndexError):
        thesclass3_id = None

    if thesclass1_id is not None:
        thesclass = tdb.get_thesclass(thesclass1_id)
        level2branch = thesclass.ancestor(level=2)
        checkstatus = 'u'
    else:
        level2branch = None
        checkstatus = 'n'

    if level2branch is not None:
        level2branch_id = level2branch.id
    else:
        level2branch_id = None

    try:
        bayes = sense.bayes_classification
        bayes_confidence = sense.bayes_confidence
    except AttributeError:
        bayes = None
        bayes_confidence = 0

    row = [
        status,
        sense.lemma[:100],
        lexical_sort(sense.lemma)[:100],
        sense.wordclass or 'NN',
        definition,
        definition_supplement,
        sense.entry_id,
        sense.node_id,
        sense.entry_lemma[:50],
        lexical_sort(sense.entry_lemma)[:50],
        sense.subentry_type or 'main sense',
        undefined,
        random.randint(0, 10000),  # sample order
        bayes,
        bayes_confidence,
        _bayes_mismatch(sense),
        thesclass1_id,
        thesclass2_id,
        thesclass3_id,
        'u',  # checkbox for thesclass1 (unset)
        'i',  # checkbox for thesclass2 (incorrect)
        'i',  # checkbox for thesclass3 (incorrect)
        checkstatus,
        level2branch_id,
        reasontext,
        reasoncode,
        sense.clone_num,  # Gets changed to True/False before committing to DB
    ]
    return row
Example #19
def winnow(class_ids, wordclass):
    # Convert thesaurus IDs stored in the raw files to actual thesaurus classes
    thesclasses = [tdb.get_thesclass(id) for id in class_ids]
    if wordclass == 'NN':
        thesclasses = [t for t in thesclasses if t.wordclass == 'noun']
    elif wordclass == 'JJ':
        thesclasses = [t for t in thesclasses if t.wordclass == 'adjective']
    elif wordclass == 'RB':
        thesclasses = [t for t in thesclasses if t.wordclass == 'adverb']

    # Keep a note of the total number of instances of this word in compounds
    # (before we start winnowing out stuff)
    total = len(thesclasses)

    # Group into wordclass-level parent classes
    wordclass_groups = {}
    for t in thesclasses:
        p = t.wordclass_parent() or t
        if p.id not in wordclass_groups:
            wordclass_groups[p.id] = (p, [])
        wordclass_groups[p.id][1].append(t)
    # Reduce to a list of (parent_node, child_nodes) tuples
    wordclass_groups = list(wordclass_groups.values())
    # Sort so that the most common is first
    wordclass_groups.sort(key=lambda row: row[0].level)
    wordclass_groups.sort(key=lambda row: len(row[1]), reverse=True)

    # For each wordclass group, find the best child node to use
    #  (which may often be the wordclass node itself)
    wordclass_groups2 = []
    for parent_node, child_nodes in wordclass_groups:
        # If there's only one child node, or if any of the child nodes
        #  are at wordclass level, then we'll just use the wordclass level
        if (len(child_nodes) == 1 or
                any([t.id == parent_node.id for t in child_nodes])):
            best_child = parent_node
        # If all the children are on the same node, then we'll use that node
        elif len(set([t.id for t in child_nodes])) == 1:
            best_child = child_nodes[0]
        # ... Otherwise, poll to find the leading one out of the classes
        #  below wordclass level
        else:
            best_child = None
            for depth in (2, 1):
                # Try the level two below the parent wordclass level first,
                #  then fall back to the level immediately below it
                sub_parent_level = parent_node.level + depth
                # ... and count how many children are on each branch
                # at this level
                counts = Counter([t.ancestor(level=sub_parent_level)
                                  for t in child_nodes]).most_common()
                max_count = counts[0][1]
                if max_count >= len(child_nodes) * 0.8:
                    best_child = counts[0][0]
                elif depth == 1:
                    best = [c[0] for c in counts if c[1] == max_count]
                    # If there's a clear winner, we use that; otherwise, we
                    #  revert to using the parent node as a fallback
                    if len(best) == 1:
                        best_child = best[0]
                    else:
                        best_child = parent_node
                if best_child is not None:
                    break
        wordclass_groups2.append((parent_node, len(child_nodes), best_child))

    # Group into level-3 classes
    level3_groups = {}
    for g in wordclass_groups2:
        wordclass_parent = g[0]
        p = wordclass_parent.ancestor(level=3) or wordclass_parent
        if p.id not in level3_groups:
            level3_groups[p.id] = (p, [])
        level3_groups[p.id][1].append(g)
    # Reduce to a list of (parent_node, count, child_groups) tuples
    level3_groups = list(level3_groups.values())
    level3_groups = [(row[0], sum([g[1] for g in row[1]]), row[1])
                     for row in level3_groups]
    # Sort so that the most common is first
    level3_groups.sort(key=lambda row: row[1], reverse=True)

    # Drop the long tail of comparatively low-frequency branches
    level3_groups2 = []
    if level3_groups:
        max_count = level3_groups[0][1]
        level3_groups = [row for row in level3_groups
                         if row[1] > max_count * 0.1]
        for parent, count, child_groups in level3_groups:
            max_child_count = child_groups[0][1]
            child_groups = [g for g in child_groups
                            if g[1] > max_child_count * 0.1]
            level3_groups2.append((parent, count, child_groups))

    return total, level3_groups2
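A hypothetical call, showing the shape of winnow()'s return value (the class
IDs are placeholders): it yields the pre-winnowing total plus a list of
(level3_parent, count, child_groups) tuples, where each child group is itself
a (wordclass_parent, count, best_child) tuple.

total, branches = winnow([22501, 22501, 29205], 'NN')
for parent, count, child_groups in branches:
    print '%s\t%d' % (parent.breadcrumb(), count)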
Example #20
"""
Binomials - Manages indexing and lookup of binomial terms (animals and plants)
"""

import os
import csv
from collections import defaultdict, Counter

import lex.oed.thesaurus.thesaurusdb as tdb
from pickler.sensemanager import PickleLoader

living_world_id = 8835
living_world_node = tdb.get_thesclass(living_world_id)
life_branches = (22501, 29205, 17709)  # plant, animal, microorganism


class Binomials(object):
    index = {'binomials': {}, 'genera': {}}

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            self.__dict__[k] = v
        try:
            self.resources_dir
        except AttributeError:
            pass
        else:
            dir = os.path.join(self.resources_dir, 'taxonomy')
            self.raw_files = {
                'binomials': os.path.join(dir, 'binomials_raw.csv'),