Example #1
    def _np_adjuster(self, model, ratio_set):
        """
        Where a set consists of NN and NP, make sure that the NN is
        not scoring artificially high.

        If the NN's calculated score (based on the ratio already derived)
        is higher than its predicted frequency (based on size), then the
        ratio is recalculated from the NN's predicted frequency.
        """
        ratio_set = adjust_to_unity(ratio_set)
        ngram_total = self.ngram.frequency('1970-2000')
        nn_freq_calculated = ngram_total * ratio_set['NN']
        nn_freq_predicted = model['NN'].predicted_frequency()

        if nn_freq_predicted < nn_freq_calculated:
            nn_revised_ratio = nn_freq_predicted / ngram_total
            ratio_set = {'NN': nn_revised_ratio, 'NP': 1 - nn_revised_ratio}
        return ratio_set
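Every snippet in this section leans on an adjust_to_unity helper that is never shown. A minimal sketch of what it presumably does, assuming it simply rescales a dict of non-negative values so they sum to 1 (the helper's name comes from the snippets; the zero-total fallback is a guess):

def adjust_to_unity(ratios):
    """Rescale a dict of non-negative values so they sum to 1.

    Sketch only: the even-split fallback for an all-zero dict is an
    assumption, not taken from the original code.
    """
    if not ratios:
        return ratios
    total = sum(ratios.values())
    if total == 0:
        # Assumed fallback: split evenly when nothing carries any weight
        return {key: 1.0 / len(ratios) for key in ratios}
    return {key: value / total for key, value in ratios.items()}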
Example #2
    def find_ratios(self, wordclasses, year):
        """
        Derive the appropriate set of ratios for each decade.
        """
        ratios = {}
        # Shortcut in case of just a single wordclass
        if len(self.wordclass_model.full_set_of_wordclasses()) <= 1:
            for w in wordclasses:
                ratios[w] = 1.0 / len(wordclasses)
            for lex_item in self.lex_items:
                lex_item.wordclass_method = 'singleton'
            return ratios

        if self.calibrator:
            calibrated_value = self.calibrator.calibrate(year)
            self.wordclass_model.inject_calibration(calibrated_value)
        for w in wordclasses:
            ratios[w] = self.wordclass_model.pos_ratio(w)
        if 'NP' not in ratios and self.wordclass_model.pos_ratio('NP') > 0:
            ratios['NP'] = self.wordclass_model.pos_ratio('NP')
        return adjust_to_unity(ratios)
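For illustration, the arithmetic of the singleton shortcut above, with made-up wordclasses (this is exactly what the first loop computes when the model holds only one wordclass):

# Illustrative only: two requested wordclasses, one-wordclass model
wordclasses = ('NN', 'NP')
ratios = {w: 1.0 / len(wordclasses) for w in wordclasses}
assert ratios == {'NN': 0.5, 'NP': 0.5}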
Example #3
    def _set_partofspeech_ratios(self):
        """
        Establish ratios for specific parts of speech (lowest
        level of the wordclass model, below base wordclasses)

        Note that OEC lempos probabilities can't be used here, since
        the OEC lempos tables are not granular enough (they only give
        probabilities for base wordclasses, not for specific parts
        of speech)
        """
        for group in self.wordclass_model.model().values():
            for base in group.model().values():
                method_type = None
                ratio_set = {}
                if len(base.model()) == 1:
                    for pos in base.model().keys():
                        ratio_set[pos] = 1.0
                    method_type = 'singleton'

                if not method_type:
                    for corpus in ('oec', 'bnc'):
                        probability_set = self.corpus_probability_sets[corpus]
                        if (probability_set and
                                probability_set.covers(base.full_set_of_wordclasses())):
                            for pos in base.model().keys():
                                ratio_set[pos] = probability_set.ratio(pos)
                            method_type = corpus
                            break

                if not method_type:
                    for pos, item in base.model().items():
                        ratio_set[pos] = item.predicted_frequency()
                    method_type = 'predictions'
                    if ('NP' in base.model() and
                            'NN' in base.model() and
                            len(base.model().keys()) == 2):
                        ratio_set = self._np_adjuster(base.model(), ratio_set)

                ratio_set = adjust_to_unity(ratio_set)
                base.set_ratios(ratio_set, method_type)
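The corpus branch above assumes each probability set exposes at least covers() and ratio(). A minimal sketch of that interface as inferred from the calls here; the class name and internals are assumptions, not the original implementation:

class CorpusProbabilitySet:
    """Assumed shape of the per-corpus probability sets used above."""

    def __init__(self, ratios):
        # e.g. {'NN': 0.7, 'NP': 0.3} for one lemma in one corpus
        self.ratios = ratios

    def covers(self, wordclasses):
        # Inferred meaning: the corpus has a ratio for every
        # wordclass in the block being scored
        return set(wordclasses) <= set(self.ratios)

    def ratio(self, pos):
        return self.ratios.get(pos, 0)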
Example #4
    def _set_group_ratios(self):
        """
        Set ratios for main groups
        Based on measured or predicted frequencies.

        Groups are either 'core' (NN + VB + JJ) or 'other' (everything else)
        """
        method_type = None
        if len(self.wordclass_model.model()) == 1:
            ratio_set = {grp: 1.0 for grp in self.wordclass_model.model().keys()}
            method_type = 'singleton'

        if not method_type:
            probability_set = self.corpus_probability_sets['bnc']
            if (probability_set and
                    self.wordclass_model.groupset() == probability_set.groupset()):
                ratio_set = {grp: probability_set.group_ratios()[grp]
                             for grp in self.wordclass_model.groupset()}
                method_type = 'bnc'

        if not method_type:
            probability_set = self.corpus_probability_sets['oec_lempos']
            if (probability_set and
                    self.wordclass_model.groupset() == probability_set.groupset() and
                    probability_set.covers(self.wordclass_model.base_set_of_wordclasses(), base=True) and
                    probability_set.sum_ratios(self.wordclass_model.base_set_of_wordclasses()) > 0.9):
                ratio_set = {grp: probability_set.group_ratios()[grp]
                             for grp in self.wordclass_model.groupset()}
                method_type = 'oeclempos'

        if not method_type:
            ratio_set = {pos: item.predicted_frequency() for pos, item
                         in self.wordclass_model.model().items()}
            ratio_set = _crosscheck(ratio_set, self.wordclass_model.model())
            method_type = 'predictions'

        ratio_set = adjust_to_unity(ratio_set)
        self.wordclass_model.set_ratios(ratio_set, method_type)
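The sum_ratios(...) > 0.9 guard in the oec_lempos branch keeps the lempos table from being used when it accounts for only a sliver of the lemma's usage. A worked illustration with invented numbers, assuming sum_ratios simply totals the table's ratios for the listed wordclasses:

# Hypothetical lempos ratios for one lemma (illustrative numbers)
lempos_ratios = {'NN': 0.55, 'VB': 0.40, 'JJ': 0.05}

def sum_ratios(wordclasses):
    # Assumed behaviour: the probability mass the lempos table
    # assigns to the given wordclasses
    return sum(lempos_ratios.get(wc, 0) for wc in wordclasses)

assert sum_ratios(('NN', 'VB')) > 0.9    # table trusted for NN + VB
assert not sum_ratios(('JJ',)) > 0.9     # covers too little to trust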
Example #5
    def __init__(self, line):
        columns = line.strip().split('\t')
        self.word = columns[0]
        self.fpm = float(columns[1])
        self.parts = defaultdict(float)
        for p in columns[2:]:
            pos, percentage = p.split('=')
            self.parts[pos] += float(percentage)/100

        # Special handling of interjections: we take the interjection out
        # of the equation, but keep a separate note of the ratio originally
        # given to the interjection. This supports adjustments made for
        # interjections further down the line.
        if 'UH' in self.parts:
            self.interjection_ratio = self.parts['UH']
            if self.interjection_ratio > 0.99:
                # Safeguard - make the ratio slightly less than 1, so that
                # there'll at least be *something* left for other
                # parts of speech
                self.interjection_ratio = 0.99
            del self.parts['UH']
            self.parts = adjust_to_unity(self.parts)
        else:
            self.interjection_ratio = 0
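For illustration, parsing one hypothetical input row (the data is invented; the format - word, frequency per million, then pos=percentage pairs - follows the code above):

from collections import defaultdict  # required by the snippet above

line = 'ouch\t5.2\tUH=95\tNN=5'      # made-up input row
columns = line.strip().split('\t')
parts = defaultdict(float)
for p in columns[2:]:
    pos, percentage = p.split('=')
    parts[pos] += float(percentage) / 100
# parts == {'UH': 0.95, 'NN': 0.05}; the interjection handling above
# would then record interjection_ratio = 0.95 and renormalize the
# remainder to {'NN': 1.0}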
Example #6
def _crosscheck(ratios, model_dict, trace=False):
    """
    Check that the predicted ratios are not completely out of line with
    the naive ratios we'd derive by looking at the weighted size of the
    blocks being compared.

    If they *are* out of line, we switch to a more naive approach, which
    just predicts frequency in proportion to weighted size.

    This is only done in cases where there's a significant difference
    in size, or where one of the sizes is very small (c.2 quotations);
    this is the area where the reliability of predicted frequency
    tends to break down.
    """

    # Bug out if there's only a single category
    if len(ratios) == 1:
        return ratios

    # Bug out if there are awkward wordclasses involved - since,
    # for example, we don't necessarily expect VBZ to be more frequent
    # than NNS just because it's in a larger entry.
    wordclasses = set()
    for group in model_dict.values():
        wordclasses.update(group.full_set_of_wordclasses())
    if wordclasses.intersection(AWKWARD_CLASSES):
        return ratios

    ratios = adjust_to_unity(ratios)
    sizes = [(pos, item.summed_weighted_size())
             for pos, item in model_dict.items()]
    sizes.sort(key=lambda i: i[1], reverse=True)
    try:
        sizes_ratio = sizes[0][1] / sizes[1][1]
    except ZeroDivisionError:
        sizes_ratio = sizes[0][1] / 0.5
    largest = sizes[0][0]
    next_largest = sizes[1][0]

    # Bug out if the size ratio of largest to next largest is not big
    # enough to be indicative
    if sizes_ratio > 3:
        pass
    elif sizes[1][1] < 2 and sizes_ratio > 2:
        pass
    else:
        return ratios

    # Bug out if predicted ratios are already different enough
    if ratios[largest] / ratios[next_largest] > sizes_ratio:
        return ratios

    # Switch to naive ratios, based on weighted size rather than
    #  predicted frequency
    naive_ratios = {pos: item.summed_weighted_size() for pos, item
                    in model_dict.items()}
    naive_ratios = adjust_to_unity(naive_ratios)

    if trace:
        print('------------------------------------------------------')
        try:
            print(list(model_dict.values())[0].form())
        except IndexError:
            print('UNKNOWN')
        for group in model_dict.values():
            print(group.full_set_of_wordclasses())
        print('-------------------------------------------------------')
        print(sizes)
        print(ratios)
        print('---->')
        print(naive_ratios)

    return naive_ratios
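A worked numeric case with invented sizes: NN's weighted size is 30 and VB's is 5, so sizes_ratio is 6 (indicative, being over 3), while the predictions put NN at only 0.55 against VB's 0.45. Since 0.55 / 0.45 is about 1.2, well under 6, the function discards the predictions and falls back to size-proportional ratios:

# Illustrative numbers only, not from the original data
sizes = [('NN', 30), ('VB', 5)]
sizes_ratio = sizes[0][1] / sizes[1][1]          # 6.0 > 3, indicative
predicted = {'NN': 0.55, 'VB': 0.45}
assert predicted['NN'] / predicted['VB'] < sizes_ratio

# Naive fallback: frequency in proportion to weighted size
total = sum(size for _, size in sizes)
naive = {pos: size / total for pos, size in sizes}
# naive == {'NN': 30/35, 'VB': 5/35}, i.e. roughly 0.86 / 0.14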
Example #7
    def _set_base_ratios(self):
        """
        Set ratios for base wordclasses within each group.
        Based on measured or predicted frequencies.
        """
        for group in self.wordclass_model.model().values():
            method_type = None
            ratio_set = {}

            # No need to bother when there is only one wordclass (singleton)
            if len(group.model()) == 1:
                for wc in group.model().keys():
                    ratio_set[wc] = 1.0
                method_type = 'singleton'

            if not method_type:
                # Take ratios from OEC/BNC pos, if it's available and covers
                #  the right set of wordclasses
                for corpus in ('bnc', 'oec'):
                    probability_set = self.corpus_probability_sets[corpus]
                    if (probability_set and
                            probability_set.covers(group.base_set_of_wordclasses(), base=True)):
                        for wc in group.model().keys():
                            ratio_set[wc] = probability_set.base_ratios()[wc]
                        method_type = corpus
                        break

            if not method_type:
                # Take ratios from OEC/BNC pos, if it's available and covers
                #  *nearly* the right set of wordclasses. If a minor wordclass
                #  is not covered, use an estimate for this.
                for corpus in ('bnc', 'oec'):
                    probability_set = self.corpus_probability_sets[corpus]
                    if not probability_set:
                        continue
                    missing = probability_set.almost_covers(
                        group.base_set_of_wordclasses())
                    if not missing:
                        continue
                    est = self._estimate_missing(missing=missing,
                                                 corpus=corpus,
                                                 model=group.model())
                    if est is not None:
                        # Set the ratios of the wordclasses that *are* covered
                        for wc in group.base_set_of_wordclasses():
                            if wc != missing:
                                ratio_set[wc] = probability_set.base_ratios()[wc]
                        # Use estimate as the ratio of the missing wordclass
                        ratio_set[missing] = est
                        method_type = corpus
                        break

            if not method_type:
                # Take ratios from OEC lempos, if it's available and covers
                #  the right set of wordclasses
                probability_set = self.corpus_probability_sets['oec_lempos']
                if (probability_set and
                        probability_set.covers(group.base_set_of_wordclasses(), base=True) and
                        not group.is_verblike()):
                    for item in group.model().values():
                        ratio_set[item.wordclass] = probability_set.sum_subcategories(
                            list(item.model().keys()))
                    method_type = 'oeclempos'

            if not method_type:
                # Take ratios from OEC lempos, if it's available and covers
                #  *nearly* the right set of wordclasses. If a minor wordclass
                #  is not covered, use an estimate for this.
                probability_set = self.corpus_probability_sets['oec_lempos']
                if probability_set and not group.is_verblike():
                    missing = probability_set.almost_covers(
                        group.base_set_of_wordclasses())
                    if missing:
                        est = self._estimate_missing(missing=missing,
                                                     trace=False,
                                                     corpus='oec',
                                                     model=group.model())
                        if est is not None:
                            # Set the ratios of the wordclasses that *are* covered
                            for wc in group.base_set_of_wordclasses():
                                if wc != missing:
                                    ratio_set[wc] = probability_set.base_ratios()[wc]
                            # Use estimate as the ratio of the missing wordclass
                            ratio_set[missing] = est
                            method_type = 'oeclempos'

            # Fall back on predictions
            if not method_type:
                for wc, item in group.model().items():
                    ratio_set[wc] = item.predicted_frequency()
                ratio_set = _crosscheck(ratio_set, group.model())
                method_type = 'predictions'

            ratio_set = adjust_to_unity(ratio_set)
            group.set_ratios(ratio_set, method_type)
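almost_covers() appears in two branches above and its return value doubles as the missing wordclass, so it presumably returns the single uncovered wordclass when exactly one is absent, and something falsy otherwise. A standalone sketch under that assumption (in the original it is a method on the probability set):

def almost_covers(covered, wordclasses):
    """Assumed semantics: return the one wordclass in wordclasses
    that covered lacks, or None when zero or several are missing."""
    missing = [wc for wc in wordclasses if wc not in covered]
    if len(missing) == 1:
        return missing[0]
    return None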