def get_grammeme_classes(parses):
    """
    Given a list of ``ParseInfo`` structures, return a dict with its
    grammemes, classified::

        {
            ParseInfo.UNIVOCAL: {set of univocal grammemes},
            ParseInfo.AMBIG: {set of possible grammemes},
            ParseInfo.DISCARDED: {set of discarded grammemes},
        }

    :param parses: iterable of ``ParseInfo`` structures (each with
        ``tag`` and ``state`` attributes).
    :returns: dict mapping parse state to a set of grammemes.
    """
    all_grammemes = defaultdict(set)
    tag_grammemes = defaultdict(list)

    for p in parses:
        gr = tag2grammemes(p.tag)
        tag_grammemes[p.state].append((p.tag, gr))
        all_grammemes[p.state] |= gr

    if not all_grammemes[ParseInfo.UNIVOCAL]:
        # No parse was univocal: fall back to treating grammemes shared
        # by *every* ambiguous parse as univocal (start from the union,
        # intersect with each parse's grammemes).
        all_grammemes[ParseInfo.UNIVOCAL] = all_grammemes[ParseInfo.AMBIG].copy()
        for _tag, gr in tag_grammemes[ParseInfo.AMBIG]:
            all_grammemes[ParseInfo.UNIVOCAL] &= gr

    # A grammeme only counts as discarded if it is not still possible.
    all_grammemes[ParseInfo.DISCARDED] -= all_grammemes[ParseInfo.UNIVOCAL]
    all_grammemes[ParseInfo.DISCARDED] -= all_grammemes[ParseInfo.AMBIG]
    # Ambiguous grammemes exclude those already known univocally.
    all_grammemes[ParseInfo.AMBIG] -= all_grammemes[ParseInfo.UNIVOCAL]
    return dict(all_grammemes)
def get_grammeme_classes(parses):
    """
    Classify the grammemes of a list of ``ParseInfo`` structures.

    Returns a dict of grammeme sets keyed by parse state::

        {
            ParseInfo.UNIVOCAL: {set of univocal grammemes},
            ParseInfo.AMBIG: {set of possible grammemes},
            ParseInfo.DISCARDED: {set of discarded grammemes},
        }
    """
    by_state = defaultdict(set)
    tagged_by_state = defaultdict(list)

    for parse in parses:
        grammemes = tag2grammemes(parse.tag)
        tagged_by_state[parse.state].append((parse.tag, grammemes))
        by_state[parse.state].update(grammemes)

    if not by_state[ParseInfo.UNIVOCAL]:
        # Fallback: grammemes common to every ambiguous parse count as
        # univocal - begin with the union and intersect per parse.
        common = by_state[ParseInfo.AMBIG].copy()
        for _tag, grammemes in tagged_by_state[ParseInfo.AMBIG]:
            common &= grammemes
        by_state[ParseInfo.UNIVOCAL] = common

    by_state[ParseInfo.DISCARDED].difference_update(by_state[ParseInfo.UNIVOCAL])
    by_state[ParseInfo.DISCARDED].difference_update(by_state[ParseInfo.AMBIG])
    by_state[ParseInfo.AMBIG].difference_update(by_state[ParseInfo.UNIVOCAL])
    return dict(by_state)
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) based on OpenCorpora xml dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.

    :param morph: morphological analyzer providing a ``tag(word)`` method.
    :param corpus_filename: path to the OpenCorpora xml dump.
    :param logger: optional logger; defaults to this module's logger.
    :returns: a ``(cpd, cfd)`` pair - the conditional probability
        distribution and the underlying conditional frequency counts.
    """
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        This ConditionalProbDist subclass passes 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition],
                                                   condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    # Materialize so the second pass (filtering) can also show progress.
    disambig_words = list(
        with_progress(
            _disambiguated_words(reader),
            "Reading disambiguated words from corpus"
        )
    )

    disambig_words = with_progress(disambig_words,
                                   "Filtering out non-ambiguous words")
    ambiguous_words = [
        (w, gr) for (w, gr) in (
            (w.lower(), tag2grammemes(t))
            for (w, t) in disambig_words
            if len(morph.tag(w)) > 1
        )
        if gr != {'UNKN'}  # drop entries the corpus left unknown
    ]

    logger.info("Computing P(t|w)")

    def probdist_factory(fd, condition):
        # Laplace smoothing needs a bin count; ensure every tag the
        # analyzer can produce for this word gets a bin, even if it was
        # never observed in the corpus.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) from an OpenCorpora xml dump.

    Counts of disambiguated ambiguous words are collected and smoothed
    with simple Laplace smoothing.
    """
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    reader = opencorpora.CorpusReader(corpus_filename)

    # Read everything up front so the filtering pass can report progress.
    words = list(
        with_progress(_disambiguated_words(reader),
                      "Reading disambiguated words from corpus"))

    word_grammemes = []
    for word, tag in with_progress(words,
                                   "Filtering out non-ambiguous words"):
        if len(morph.tag(word)) <= 1:
            continue  # not ambiguous for this analyzer
        grammemes = tag2grammemes(tag)
        if grammemes == set(['UNKN']):
            continue  # corpus itself marked the word unknown
        word_grammemes.append((word.lower(), grammemes))

    logger.info("Computing P(t|w)")

    def _make_probdist(freqdist, condition):
        # One bin per tag the analyzer knows for this word, at minimum.
        n_bins = max(len(morph.tag(condition)), freqdist.B())
        return nltk.LaplaceProbDist(freqdist, bins=n_bins)

    class _PerConditionProbDist(nltk.ConditionalProbDist):
        """
        ConditionalProbDist subclass that forwards the condition to the
        probdist factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition],
                                                   condition)

    cfd = nltk.ConditionalFreqDist(word_grammemes)
    cpd = _PerConditionProbDist(cfd, _make_probdist)
    return cpd, cfd
def estimate_conditional_tag_probability(morph, corpus_filename):
    """
    Estimate P(t|w) based on OpenCorpora xml dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.

    :param morph: morphological analyzer providing a ``tag(word)`` method.
    :param corpus_filename: path to the OpenCorpora xml dump.
    :returns: a ``(cpd, cfd)`` pair - the conditional probability
        distribution and the underlying conditional frequency counts.
    """
    import nltk
    import opencorpora

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        This ConditionalProbDist subclass passes 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition],
                                                   condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    # Lazy pipeline: words are read and filtered as the frequency
    # distribution consumes them.
    ambiguous_words = (
        (w.lower(), tag2grammemes(t))
        for (w, t) in _disambiguated_words(reader)
        if len(morph.tag(w)) > 1
    )
    ambiguous_words = ((w, gr) for (w, gr) in ambiguous_words
                       if gr != {'UNKN'})  # drop unknown-tag entries

    def probdist_factory(fd, condition):
        # Laplace smoothing needs a bin count; ensure every tag the
        # analyzer can produce for this word gets a bin.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
def estimate_conditional_tag_probability(morph, corpus_filename):
    """
    Estimate P(t|w) from an OpenCorpora xml dump.

    Counts of disambiguated ambiguous words are used, smoothed with
    simple Laplace smoothing.
    """
    import nltk
    import opencorpora

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        ConditionalProbDist subclass that forwards the condition to the
        probdist factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition],
                                                   condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    def _ambiguous_word_grammemes():
        # Lazily yield (lowercased word, grammeme set) pairs for words
        # the analyzer considers ambiguous, skipping unknown-tag entries.
        for word, tag in _disambiguated_words(reader):
            if len(morph.tag(word)) <= 1:
                continue
            grammemes = tag2grammemes(tag)
            if grammemes != set(['UNKN']):
                yield (word.lower(), grammemes)

    def _probdist_factory(freqdist, condition):
        # One bin per tag the analyzer knows for this word, at minimum.
        n_bins = max(len(morph.tag(condition)), freqdist.B())
        return nltk.LaplaceProbDist(freqdist, bins=n_bins)

    cfd = nltk.ConditionalFreqDist(_ambiguous_word_grammemes())
    cpd = _ConditionalProbDist(cfd, _probdist_factory)
    return cpd, cfd
def grammemes(self):
    # Grammemes for this item's ``tag``, as produced by ``tag2grammemes``
    # (presumably a set - it is combined with set operators by callers;
    # TODO confirm).
    return tag2grammemes(self.tag)
def grammemes(self):
    """Grammemes extracted from this item's ``tag`` via ``tag2grammemes``."""
    tag = self.tag
    return tag2grammemes(tag)