def global_pmi(self, ngram):
    ngram_probability = self.global_probability(ngram)
    # use iterable also for one element
    members_probability = product(
        self.global_probability([s]) for s in ngram if s.has_meaning_alone())
    return math.log(ngram_probability / members_probability)
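# Note: these confidence methods assume that ``math`` is imported at module
# level and that a ``product`` helper is available for multiplying an
# iterable of probabilities (the stdlib only gained ``math.prod`` in Python
# 3.8). A minimal sketch of such a helper, assuming it simply folds the
# iterable with multiplication, could look like:
#
#     import operator
#     from functools import reduce
#
#     def product(iterable):
#         """Return the product of all items in ``iterable`` (empty -> 1)."""
#         return reduce(operator.mul, iterable, 1)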
def statistical_mutual_information_confidence(self):
    """
    Number of occurrences of the ngram / number of possible ngram
    positions / probability of each member of the ngram.
    """
    if self._confidences["statistical_mutual_information"] is None:
        if len(self) == 1:
            return 1.0  # TODO: find a better way for 1-grams...
        ngram_possible = len(self.text) - len(self) + 1
        members_probability = product(
            [1.0 * s.count / len(self.text) for s in self])
        self._confidences["statistical_mutual_information"] = \
            math.log(1.0 * self.count / ngram_possible / members_probability)
    return self._confidences["statistical_mutual_information"]
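# Illustrative worked example (made-up numbers, not from any real corpus):
# a bigram seen 5 times in a text of 100 stemms, whose members occur
# 10 and 8 times respectively, would give:
#
#     ngram_possible      = 100 - 2 + 1                 # = 99
#     members_probability = (10 / 100.0) * (8 / 100.0)  # = 0.008
#     confidence          = math.log(5.0 / 99 / 0.008)  # ~= 1.84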
def heuristical_mutual_information_confidence(self):
    """
    Return the probability that all the terms of the ngram appear together.

    The point is to measure the dependence or independence of the terms.
    If only some terms appear outside this context, that may be normal
    (for example a name, which sometimes appears with both first name and
    last name and sometimes with just the last name). And if those terms
    appear many, many times while some others appear only in this context,
    the raw count doesn't matter.
    If NO term appears outside this context, we have a good probability of
    a collocation.
    If every term appears outside this context, and especially if this
    occurs often, we can doubt this collocation candidate.
    Should we take the stop words into account? This may affect the main
    confidence both negatively and positively.
    """
    if self._confidences["heuristical_mutual_information"] is None:
        # We test just the interesting stemms, but we keep the original position
        candidates = [(k, v) for k, v in enumerate(self) if v.has_meaning()]
        alone_count = {}
        if len(self) == 1:
            return 1  # Just one word, PMI doesn't make sense
        if len(candidates) == 0:
            return 0.1
        for position, stemm in candidates:
            alone_count[position] = 0
            neighbours = [(s, p - position) for p, s in enumerate(self)
                          if s is not stemm]
            for tkn in stemm.occurrences:
                if not tkn.is_neighbor(neighbours):
                    alone_count[position] += 1
        res = [v for k, v in alone_count.items()]
        if sum(res) == 0:
            return 3 * len(self)  # We trust this collocation
        elif 0 in res:
            # At least one important term appears only in this context
            return 2
        else:
            # We don't know, so we are not so confident...
            # The more the terms appear alone, the less confident we are,
            # so the smaller the coefficient.
            return product(
                [2.0 * len(self) / (len(self) + v) for v in res])
    return self._confidences["heuristical_mutual_information"]
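# Illustrative behaviour of the fallback coefficient (made-up counts): for a
# bigram (len(self) == 2), each factor is 2.0 * 2 / (2 + v), so:
#
#     each term alone 2 times -> (4/4) * (4/4) = 1.0
#     each term alone 6 times -> (4/8) * (4/8) = 0.25
#
# i.e. the more the terms are seen outside the ngram, the smaller the
# confidence coefficient.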