def _np_adjuster(self, model, ratio_set):
    """
    Where a set consists of NN and NP, make sure that the NN is not
    scoring artificially high.

    If the NN's calculated score (based on the ratio already derived)
    is higher than its predicted frequency (based on size), then the
    ratio is recalculated from the NN's predicted frequency.
    """
    ratio_set = adjust_to_unity(ratio_set)
    ngram_total = self.ngram.frequency('1970-2000')
    nn_freq_calculated = ngram_total * ratio_set['NN']
    nn_freq_predicted = model['NN'].predicted_frequency()
    if nn_freq_predicted < nn_freq_calculated:
        nn_revised_ratio = nn_freq_predicted / ngram_total
        ratio_set = {'NN': nn_revised_ratio, 'NP': 1 - nn_revised_ratio}
    return ratio_set
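# `adjust_to_unity` is used throughout this section but defined elsewhere
# in the codebase. The sketch below is a hypothetical reconstruction of
# the behaviour its callers appear to rely on - normalizing a dict of
# non-negative scores so that the values sum to 1 - not the real helper:
def adjust_to_unity_sketch(ratios):
    total = sum(ratios.values())
    if not total:
        # Avoid division by zero; leave an all-zero dict unchanged
        return ratios
    return {key: value / total for key, value in ratios.items()}

# e.g. adjust_to_unity_sketch({'NN': 3.0, 'NP': 1.0})
#      -> {'NN': 0.75, 'NP': 0.25}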
def find_ratios(self, wordclasses, year):
    """
    Derive the appropriate set of ratios for each decade.
    """
    ratios = {}
    # Shortcut in case of just a single wordclass
    if len(self.wordclass_model.full_set_of_wordclasses()) <= 1:
        for w in wordclasses:
            ratios[w] = 1.0 / len(wordclasses)
        for lex_item in self.lex_items:
            lex_item.wordclass_method = 'singleton'
        return ratios
    else:
        if self.calibrator:
            calibrated_value = self.calibrator.calibrate(year)
            self.wordclass_model.inject_calibration(calibrated_value)
        for w in wordclasses:
            ratios[w] = self.wordclass_model.pos_ratio(w)
        if 'NP' not in ratios and self.wordclass_model.pos_ratio('NP') > 0:
            ratios['NP'] = self.wordclass_model.pos_ratio('NP')
        return adjust_to_unity(ratios)
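# Behaviour sketch for the singleton shortcut above (invented data): if
# the underlying model covers at most one wordclass, the requested
# wordclasses simply share the probability mass evenly, e.g.
#   find_ratios(('NN',), 1990)        -> {'NN': 1.0}
#   find_ratios(('NN', 'NNS'), 1990)  -> {'NN': 0.5, 'NNS': 0.5}
# Otherwise each requested wordclass gets its modelled pos_ratio (plus
# 'NP', if the model assigns NP any probability), re-normalized to sum
# to 1 by adjust_to_unity.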
def _set_partofspeech_ratios(self):
    """
    Establish ratios for specific parts of speech (the lowest level of
    the wordclass model, below base wordclasses).

    Note that OEC lempos probabilities can't be used here, since the
    OEC lempos tables are not granular enough (they only give
    probabilities for base wordclasses, not for specific parts of
    speech).
    """
    for group in self.wordclass_model.model().values():
        for base in group.model().values():
            method_type = None
            ratio_set = {}

            # No need to bother when there is only one part of speech
            if len(base.model()) == 1:
                for pos in base.model().keys():
                    ratio_set[pos] = 1.0
                method_type = 'singleton'

            if not method_type:
                # Take ratios from OEC/BNC pos, if it's available and
                # covers the right set of wordclasses
                for corpus in ('oec', 'bnc'):
                    probability_set = self.corpus_probability_sets[corpus]
                    if (probability_set and
                            probability_set.covers(base.full_set_of_wordclasses())):
                        for pos in base.model().keys():
                            ratio_set[pos] = probability_set.ratio(pos)
                        method_type = corpus
                        break

            if not method_type:
                # Fall back on predicted frequencies
                for pos, item in base.model().items():
                    ratio_set[pos] = item.predicted_frequency()
                method_type = 'predictions'

            # Guard against NN scoring artificially high in NN + NP pairs
            if ('NP' in base.model() and 'NN' in base.model() and
                    len(base.model()) == 2):
                ratio_set = self._np_adjuster(base.model(), ratio_set)

            ratio_set = adjust_to_unity(ratio_set)
            base.set_ratios(ratio_set, method_type)
def _set_group_ratios(self):
    """
    Set ratios for main groups, based on measured or predicted
    frequencies.

    Groups are either 'core' (NN + VB + JJ) or 'other' (everything else).
    """
    method_type = None

    # No need to bother when there is only one group (singleton)
    if len(self.wordclass_model.model()) == 1:
        ratio_set = {grp: 1.0 for grp in self.wordclass_model.model().keys()}
        method_type = 'singleton'

    if not method_type:
        # Take group ratios from BNC pos, if it's available and covers
        # the right set of groups
        probability_set = self.corpus_probability_sets['bnc']
        if (probability_set and
                self.wordclass_model.groupset() == probability_set.groupset()):
            ratio_set = {grp: probability_set.group_ratios()[grp]
                         for grp in self.wordclass_model.groupset()}
            method_type = 'bnc'

    if not method_type:
        # Take group ratios from OEC lempos, if it's available, covers
        # the right set of groups and base wordclasses, and accounts
        # for most of the probability mass
        probability_set = self.corpus_probability_sets['oec_lempos']
        if (probability_set and
                self.wordclass_model.groupset() == probability_set.groupset() and
                probability_set.covers(
                    self.wordclass_model.base_set_of_wordclasses(),
                    base=True) and
                probability_set.sum_ratios(
                    self.wordclass_model.base_set_of_wordclasses()) > 0.9):
            ratio_set = {grp: probability_set.group_ratios()[grp]
                         for grp in self.wordclass_model.groupset()}
            method_type = 'oeclempos'

    if not method_type:
        # Fall back on predicted frequencies, cross-checked against
        # naive size-based ratios
        ratio_set = {pos: item.predicted_frequency()
                     for pos, item in self.wordclass_model.model().items()}
        ratio_set = _crosscheck(ratio_set, self.wordclass_model.model())
        method_type = 'predictions'

    ratio_set = adjust_to_unity(ratio_set)
    self.wordclass_model.set_ratios(ratio_set, method_type)
def __init__(self, line):
    columns = line.strip().split('\t')
    self.word = columns[0]
    self.fpm = float(columns[1])
    self.parts = defaultdict(lambda: 0)
    for p in columns[2:]:
        pos, percentage = p.split('=')
        self.parts[pos] += float(percentage) / 100

    # Special handling of interjections - we take the interjection out
    # of the equation, but keep separate note of the ratio given to the
    # interjection originally. This supports adjustments made for
    # interjections further down the line.
    if 'UH' in self.parts:
        self.interjection_ratio = self.parts['UH']
        if self.interjection_ratio > 0.99:
            # Safeguard - make the ratio slightly less than 1, so that
            # there'll at least be *something* left for other
            # parts of speech
            self.interjection_ratio = 0.99
        del self.parts['UH']
        self.parts = adjust_to_unity(self.parts)
    else:
        self.interjection_ratio = 0
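# Illustrative input for the constructor above (invented figures; the
# name of the class this __init__ belongs to is not shown in this
# excerpt). Each line is tab-separated: word, frequency per million,
# then a series of pos=percentage fields. Note that the enclosing
# module needs `from collections import defaultdict`.
#
#   line = 'spell\t12.3\tVB=62.5\tNN=37.0\tUH=0.5'
#   -> self.word = 'spell'
#   -> self.fpm = 12.3
#   -> self.interjection_ratio = 0.005 (UH removed from self.parts)
#   -> self.parts re-normalized: VB ≈ 0.628, NN ≈ 0.372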
def _crosscheck(ratios, model_dict):
    """
    Check that the predicted ratios are not completely out of line with
    the naive ratios we'd derive by looking at the weighted size of the
    blocks being compared.

    If they *are* out of line, we switch to a more naive approach, which
    just predicts frequency in proportion to weighted size. This is only
    done in cases where there's a significant difference in size, or
    where one of the sizes is very small (c.2 quotations); this is the
    area where the reliability of predicted frequency tends to break down.
    """
    trace = False

    # Bug out if there's only a single category
    if len(ratios) == 1:
        return ratios

    # Bug out if there are awkward wordclasses involved - since,
    # for example, we don't necessarily expect VBZ to be more frequent
    # than NNS just because it's in a larger entry.
    wordclasses = set()
    for group in model_dict.values():
        wordclasses.update(group.full_set_of_wordclasses())
    if wordclasses.intersection(AWKWARD_CLASSES):
        return ratios

    ratios = adjust_to_unity(ratios)
    sizes = [(pos, item.summed_weighted_size())
             for pos, item in model_dict.items()]
    sizes.sort(key=lambda i: i[1], reverse=True)
    try:
        sizes_ratio = sizes[0][1] / sizes[1][1]
    except ZeroDivisionError:
        sizes_ratio = sizes[0][1] / 0.5
    largest = sizes[0][0]
    next_largest = sizes[1][0]

    # Bug out if the size ratio of largest to next largest is not big
    # enough to be indicative
    if sizes_ratio > 3:
        pass
    elif sizes[1][1] < 2 and sizes_ratio > 2:
        pass
    else:
        return ratios

    # Bug out if predicted ratios are already different enough
    if ratios[largest] / ratios[next_largest] > sizes_ratio:
        return ratios

    # Switch to naive ratios, based on weighted size rather than
    # predicted frequency
    naive_ratios = {pos: item.summed_weighted_size()
                    for pos, item in model_dict.items()}
    naive_ratios = adjust_to_unity(naive_ratios)

    if trace:
        print('------------------------------------------------------')
        try:
            print(list(model_dict.values())[0].form())
        except IndexError:
            print('UNKNOWN')
        for group in model_dict.values():
            print(group.full_set_of_wordclasses())
        print('-------------------------------------------------------')
        print(sizes)
        print(ratios)
        print('---->')
        print(naive_ratios)

    return naive_ratios
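# Worked example of the _crosscheck thresholds (invented numbers,
# assuming none of the wordclasses involved is in AWKWARD_CLASSES):
# suppose the weighted sizes are NN=9.0 and VB=2.0, and the predicted
# ratios are NN=0.5, VB=0.5. Then sizes_ratio = 9.0 / 2.0 = 4.5 > 3,
# so the size gap counts as indicative; and since
# ratios['NN'] / ratios['VB'] = 1.0 is not greater than 4.5, the
# predicted ratios are judged out of line and replaced by the naive
# size-based ones: NN = 9/11 ≈ 0.82, VB = 2/11 ≈ 0.18.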
def _set_base_ratios(self):
    """
    Set ratios for base wordclasses within each group, based on
    measured or predicted frequencies.
    """
    for group in self.wordclass_model.model().values():
        method_type = None
        ratio_set = {}

        # No need to bother when there is only one wordclass (singleton)
        if len(group.model()) == 1:
            for wc in group.model().keys():
                ratio_set[wc] = 1.0
            method_type = 'singleton'

        if not method_type:
            # Take ratios from OEC/BNC pos, if it's available and covers
            # the right set of wordclasses
            for corpus in ('bnc', 'oec'):
                probability_set = self.corpus_probability_sets[corpus]
                if (probability_set and
                        probability_set.covers(group.base_set_of_wordclasses(),
                                               base=True)):
                    for wc in group.model().keys():
                        ratio_set[wc] = probability_set.base_ratios()[wc]
                    method_type = corpus
                    break

        if not method_type:
            # Take ratios from OEC/BNC pos, if it's available and covers
            # *nearly* the right set of wordclasses. If a minor wordclass
            # is not covered, use an estimate for this.
            for corpus in ('bnc', 'oec'):
                probability_set = self.corpus_probability_sets[corpus]
                missing = (probability_set.almost_covers(group.base_set_of_wordclasses())
                           if probability_set else None)
                if missing:
                    est = self._estimate_missing(missing=missing,
                                                 corpus=corpus,
                                                 model=group.model())
                    if est is not None:
                        # Set the ratios of the wordclasses that *are* covered
                        for wc in group.base_set_of_wordclasses():
                            if wc != missing:
                                ratio_set[wc] = probability_set.base_ratios()[wc]
                        # Use estimate as the ratio of the missing wordclass
                        ratio_set[missing] = est
                        method_type = corpus
                        break

        if not method_type:
            # Take ratios from OEC lempos, if it's available and covers
            # the right set of wordclasses
            probability_set = self.corpus_probability_sets['oec_lempos']
            if (probability_set and
                    probability_set.covers(group.base_set_of_wordclasses(),
                                           base=True) and
                    not group.is_verblike()):
                for item in group.model().values():
                    ratio_set[item.wordclass] = probability_set.sum_subcategories(
                        list(item.model().keys()))
                method_type = 'oeclempos'

        if not method_type:
            # Take ratios from OEC lempos, if it's available and covers
            # *nearly* the right set of wordclasses. If a minor wordclass
            # is not covered, use an estimate for this.
            probability_set = self.corpus_probability_sets['oec_lempos']
            if probability_set and not group.is_verblike():
                missing = probability_set.almost_covers(group.base_set_of_wordclasses())
                if missing:
                    est = self._estimate_missing(missing=missing,
                                                 trace=False,
                                                 corpus='oec',
                                                 model=group.model())
                    if est is not None:
                        # Set the ratios of the wordclasses that *are* covered
                        for wc in group.base_set_of_wordclasses():
                            if wc != missing:
                                ratio_set[wc] = probability_set.base_ratios()[wc]
                        # Use estimate as the ratio of the missing wordclass
                        ratio_set[missing] = est
                        method_type = 'oeclempos'

        # Fall back on predictions
        if not method_type:
            for wc, item in group.model().items():
                ratio_set[wc] = item.predicted_frequency()
            ratio_set = _crosscheck(ratio_set, group.model())
            method_type = 'predictions'

        ratio_set = adjust_to_unity(ratio_set)
        group.set_ratios(ratio_set, method_type)