def _combined_analyses(self,
                       word_dediac,
                       prefix_analyses,
                       stem_analyses,
                       suffix_analyses):
    """Merge every mutually compatible prefix/stem/suffix combination.

    Each entry of ``prefix_analyses`` and ``stem_analyses`` is indexed as
    ``entry[0]`` (category) and ``entry[1]`` (feature dict); entries of
    ``suffix_analyses`` are unpacked as ``(category, features)`` pairs.
    A combination is kept only when the database compatibility tables
    accept the prefix/stem, stem/suffix and prefix/suffix category pairs.

    Args:
        word_dediac: Word text that each merged analysis' dediacritized
            'diac' value is compared against.
        prefix_analyses: Candidate prefix entries.
        stem_analyses: Candidate stem entries.
        suffix_analyses: Candidate suffix entries; iterated once per
            compatible prefix/stem pair.

    Returns:
        deque: Feature dicts produced by ``merge_features``, each extended
        with 'stem' and 'stemcat', and with 'source' set to 'spvar' when
        the merged form differs from the input word.
    """
    results = deque()

    for pair in itertools.product(prefix_analyses, stem_analyses):
        prefix_entry, stem_entry = pair
        prefix_cat = prefix_entry[0]
        prefix_feats = prefix_entry[1]
        stem_cat = stem_entry[0]
        stem_feats = stem_entry[1]

        if stem_cat not in self._db.prefix_stem_compat[prefix_cat]:
            continue

        for suffix_cat, suffix_feats in suffix_analyses:
            # Membership in the outer tables is tested before indexing
            # them, so a missing category is skipped rather than looked up.
            compatible = (
                stem_cat in self._db.stem_suffix_compat and
                prefix_cat in self._db.prefix_suffix_compat and
                suffix_cat in self._db.stem_suffix_compat[stem_cat] and
                suffix_cat in self._db.prefix_suffix_compat[prefix_cat])
            if not compatible:
                continue

            analysis = merge_features(self._db, prefix_feats, stem_feats,
                                      suffix_feats)
            analysis['stem'] = stem_feats['diac']
            analysis['stemcat'] = stem_cat

            # U+0640 (tatweel) is dropped from the word before comparing.
            generated_form = dediac_ar(analysis['diac'])
            input_form = word_dediac.replace(u'\u0640', '')
            if input_form != generated_form:
                analysis['source'] = 'spvar'

            results.append(analysis)

    return results
def _combined_backoff_analyses(self,
                               stem,
                               word_dediac,
                               prefix_analyses,
                               stem_analyses,
                               suffix_analyses):
    """Build back-off analyses by filling template stems with ``stem``.

    For every prefix/stem pair accepted by the database compatibility
    tables, the text matched by ``_NOAN_RE`` in the stem entry's 'bw',
    'diac' and 'lex' values is replaced with ``stem`` (on a copy of the
    stem features), and the result is merged with each compatible suffix.

    Args:
        stem: Text substituted into the stem feature templates. It is
            inserted literally, even if it contains backslashes.
        word_dediac: Not used by this method.
        prefix_analyses: Candidate prefix entries, indexed as
            ``entry[0]`` (category) and ``entry[1]`` (feature dict).
        stem_analyses: Candidate back-off stem entries, indexed the same
            way.
        suffix_analyses: Candidate ``(category, features)`` suffix pairs;
            iterated once per compatible prefix/stem pair.

    Returns:
        deque: Feature dicts produced by ``merge_features``, each extended
        with 'stem', 'stemcat', 'gloss' and 'source' set to 'backoff'.
    """
    db = self._db

    def fill_template(template):
        # A callable replacement inserts ``stem`` verbatim. Passing it as
        # a replacement *string* would make ``re`` interpret backslashes
        # in the word as escapes or group references.
        return _NOAN_RE.sub(lambda _match: stem, template)

    combined = deque()

    for prefix_entry, stem_entry in itertools.product(prefix_analyses,
                                                      stem_analyses):
        prefix_cat = prefix_entry[0]
        prefix_feats = prefix_entry[1]
        stem_cat = stem_entry[0]

        if stem_cat not in db.prefix_stem_compat[prefix_cat]:
            continue

        # Same guards as _combined_analyses: skip categories that have no
        # row in a compatibility table instead of indexing a missing key.
        if (stem_cat not in db.stem_suffix_compat or
                prefix_cat not in db.prefix_suffix_compat):
            continue

        stem_suffix_cats = db.stem_suffix_compat[stem_cat]
        prefix_suffix_cats = db.prefix_suffix_compat[prefix_cat]
        usable_suffixes = [
            suffix_feats
            for suffix_cat, suffix_feats in suffix_analyses
            if (suffix_cat in stem_suffix_cats and
                suffix_cat in prefix_suffix_cats)
        ]
        if not usable_suffixes:
            continue

        # Work on a copy so the database's template entry is not modified.
        stem_feats = copy.copy(stem_entry[1])

        # The 'PROP' filter looks at the template's own 'bw' value, before
        # any substitution, and is decided once per stem entry.
        if (self._backoff_action == 'PROP' and
                'NOUN_PROP' not in stem_feats['bw']):
            continue

        # Fill the templates once per prefix/stem pair rather than once
        # per suffix; repeating it would re-substitute already-filled text.
        for key in ('bw', 'diac', 'lex'):
            stem_feats[key] = fill_template(stem_feats[key])

        for suffix_feats in usable_suffixes:
            merged = merge_features(db, prefix_feats, stem_feats,
                                    suffix_feats)
            merged['stem'] = stem_feats['diac']
            merged['stemcat'] = stem_cat
            merged['source'] = 'backoff'
            merged['gloss'] = stem_feats['gloss']
            combined.append(merged)

    return combined
def generate(self, lemma, feats):
    """Produce all analyses of a lemma that agree with the given features.

    Args:
        lemma (:obj:`str`): Lemma to generate from.
        feats (:obj:`dict`): Required feature values, keyed by feature
            name. The 'pos' feature is mandatory. The dictionary is not
            modified. See :doc:`/reference/calima_star_features` for more
            information on features and their values.

    Returns:
        :obj:`list` of :obj:`dict`: The generated analyses. The list is
        empty when the lemma is not in the database, when a requested
        feature is not among the defaults of the requested 'pos', or when
        no combination matches. See
        :doc:`/reference/calima_star_features` for more information on
        features and their values.

    Raises:
        :obj:`~camel_tools.calima_star.errors.InvalidGeneratorFeature`:
            If a feature is given that is not defined in database.
        :obj:`~camel_tools.calima_star.errors.InvalidGeneratorFeatureValue`:
            If an invalid value is given to a feature or if 'pos' feature
            is not defined.
    """
    db = self._db
    proclitics = ('prc0', 'prc1', 'prc2', 'prc3')
    enclitics = ('enc0',)
    clitics = proclitics + enclitics

    # Unknown lemmas short-circuit before any feature validation.
    if lemma not in db.lemma_hash:
        return []

    for name, value in feats.items():
        if name not in db.defines:
            raise InvalidGeneratorFeature(name)
        allowed = db.defines[name]
        # A feature defined as None accepts any value.
        if allowed is not None and value not in allowed:
            raise InvalidGeneratorFeatureValue(name, value)

    if not ('pos' in feats and feats['pos'] in db.defines['pos']):
        raise InvalidGeneratorFeatureValue('pos', feats.get('pos', None))

    # Defaults are filled in on a copy so the caller's dict stays intact.
    wanted = copy.copy(feats)
    pos_defaults = db.defaults[wanted['pos']]

    if not frozenset(wanted.keys()) <= frozenset(pos_defaults.keys()):
        return []

    # Set default values for undefined clitic feats
    for name in clitics:
        if name not in wanted and name in pos_defaults:
            wanted[name] = pos_defaults[name]

    def affix_clashes(affix_feats, stem_feats, names):
        # True when the affix cannot realise one of the requested clitics.
        for name in names:
            if name not in wanted:
                continue
            target = wanted[name]
            if name in affix_feats:
                if target != affix_feats[name]:
                    return True
            elif target != '0' and stem_feats.get(name, '0') != target:
                # The affix is silent on this clitic, so a requested
                # non-'0' value must already be carried by the stem.
                return True
        return False

    results = []

    for stem_feats in db.lemma_hash[lemma]:
        if any(name in wanted and stem_feats[name] != wanted[name]
               for name in ('vox', 'rat', 'pos')):
            continue

        # A stem that fixes a clitic to a non-'0' value must match it.
        if any(name in wanted and
               name in stem_feats and
               stem_feats[name] != '0' and
               wanted[name] != stem_feats[name]
               for name in clitics):
            continue

        stem_cat = stem_feats['stemcat']
        prefix_cats = db.stem_prefix_compat[stem_cat]
        suffix_cats = db.stem_suffix_compat[stem_cat]

        for prefix_cat in prefix_cats:
            if prefix_cat not in db.prefix_cat_hash:
                continue

            for prefix_feats in db.prefix_cat_hash[prefix_cat]:
                if affix_clashes(prefix_feats, stem_feats, proclitics):
                    continue

                for suffix_cat in suffix_cats:
                    if suffix_cat not in db.suffix_cat_hash:
                        continue

                    for suffix_feats in db.suffix_cat_hash[suffix_cat]:
                        compatible = (
                            prefix_cat in db.prefix_suffix_compat and
                            suffix_cat in
                            db.prefix_suffix_compat[prefix_cat])
                        if not compatible:
                            continue

                        if affix_clashes(suffix_feats, stem_feats,
                                         enclitics):
                            continue

                        merged = merge_features(db, prefix_feats,
                                                stem_feats, suffix_feats)

                        # Final filter: every requested feature present
                        # in the merged analysis must have the requested
                        # value.
                        mismatch = any(
                            name in merged and
                            merged[name] != wanted[name]
                            for name in wanted.keys())
                        if not mismatch:
                            results.append(merged)

    return results