def _get_feature_vectors(
    cls,
    doc: Document,
    gamma: float,
    tf: Optional[Mapping[Word, float]] = None,
) -> List[np.ndarray]:
    word_fdist = FreqDist(doc.words)
    word_pdist = LidstoneProbDist(word_fdist, gamma)
    vecs = []
    for para in doc:
        for i, sent in enumerate(para):
            vec = []
            # Sentence position in paragraph
            if i == 0:
                vec.append(1.)
            elif i == len(para) - 1:
                vec.append(2. if len(para) == 2 else 3.)
            else:
                vec.append(2.)
            # Number of terms
            vec.append(math.log(len(sent) + 1))
            # Probability of terms in document
            vec.append(sum(math.log(word_pdist.prob(w)) for w in sent))
            # Probability of terms in a baseline document
            if tf is not None:
                vec.append(sum(math.log(tf[w]) for w in sent if w in tf))
            vecs.append(np.array(vec))
    return vecs
def train(self, labeled_featuresets):
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count occurrences of each label and of each (label, fname, fval) triple
    for featureset, label in labeled_featuresets:
        label_freqdist[label] += 1
        for fname, fval in featureset.items():
            feature_freqdist[label, fname][fval] += 1
            feature_values[fname].add(fval)
            fnames.add(fname)

    # Record a None value for features that are missing from some samples
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            if num_samples - count > 0:
                feature_freqdist[label, fname][None] += num_samples - count
                feature_values[fname].add(None)

    # With gamma=0, LidstoneProbDist reduces to maximum-likelihood estimation
    label_probdist = LidstoneProbDist(label_freqdist, 0, bins=None)

    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = LidstoneProbDist(freqdist, 0, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return self(label_probdist, feature_probdist)
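A minimal usage sketch of the trainer above, assuming it is installed as the `train` classmethod of an `nltk.NaiveBayesClassifier` subclass; the class name `LidstoneNB` is hypothetical:

# Hypothetical subclass wiring the train() above into NLTK's Naive Bayes.
train_set = [
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'k'}, 'male'),
    ({'last_letter': 'a'}, 'female'),
]
classifier = LidstoneNB.train(train_set)
print(classifier.classify({'last_letter': 'a'}))  # -> 'female'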
def __init__(self, fd, *args, **kwargs):
    # Delegate to LidstoneProbDist with gamma=0.01; the last positional
    # argument is expected to be the bin count.
    LidstoneProbDist.__init__(self, fd, 0.01, args[-1])
    # Pre-compute and cache the probability and log-probability of every
    # observed sample so later lookups are plain dictionary reads.
    self._probs = {}
    self._logprobs = {}
    for sample in fd.samples():
        self._logprobs[sample] = LidstoneProbDist.logprob(self, sample)
        self._probs[sample] = LidstoneProbDist.prob(self, sample)
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data: flatten the list of tagged sentences into a list of
    # (tag, word) tuples, which we use to count the frequency of each word
    # given its tag for the emission probability estimates.
    # Don't forget to lowercase the observation, otherwise it mismatches the test data.
    train_data = [item for sublist in train_data for item in sublist]
    data = [(tag, word.lower()) for (word, tag) in train_data]

    # Compute a Conditional Frequency Distribution for words given their tags
    emission_FD = ConditionalFreqDist(data)

    # Compute the Conditional Probability Distribution using a
    # LidstoneProbDist estimator with gamma 0.01 and one extra bin
    # reserved for unseen observations.
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
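A sketch of how the returned model might be queried; `tagger` is a hypothetical instance of the class above, and `train_data` a tagged corpus such as `nltk.corpus.brown.tagged_sents()`:

emission_PD, states = tagger.emission_model(train_data)
p = emission_PD['NN'].prob('dog')      # smoothed P('dog' | 'NN')
lp = emission_PD['NN'].logprob('dog')  # log2 of the same quantity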
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data
    # Don't forget to lowercase the observation otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    data = [(tag, word.lower()) for pairs in train_data for (word, tag) in pairs]

    # Compute the emission model
    emission_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)

    for tag, word in data:
        if tag not in self.states:
            self.states.append(tag)

    return self.emission_PD, self.states
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Prepare the data: tuples of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> AND THE END SYMBOL </s>
    data = []
    for s in train_data:
        tags = ["<s>"] + [tag for (word, tag) in s] + ["</s>"]
        # Collect bigrams within each sentence only, so that no spurious
        # (</s>, <s>) transitions are counted across sentence boundaries
        data.extend(zip(tags[:-1], tags[1:]))

    # Compute the transition model
    transition_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, lidstone_estimator)

    return self.transition_PD
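The transition model can be queried the same way; again `tagger` is a hypothetical instance of the class above:

transition_PD = tagger.transition_model(train_data)
p_first = transition_PD['<s>'].prob('DT')  # P(a sentence starts with DT)
p_next = transition_PD['DT'].prob('NN')    # P(NN follows DT)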
def train_unsupervised(labeled_sents, unlabeled_sents, max_iterations=3):
    symbols = unique_list(word for sent in labeled_sents for word, tag in sent)
    # Extend symbols with those in the unlabeled set
    symbols = unique_list(symbols + unique_list(word for sent in unlabeled_sents
                                                for word in sent))
    tag_set = unique_list(tag for sent in labeled_sents for word, tag in sent)
    trainer = HiddenMarkovModelTrainer(tag_set, symbols)

    print("Supervised training for initialization ({} sentences)".format(
        len(labeled_sents)))
    hmm = trainer.train_supervised(
        labeled_sents,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    # The unlabeled sentences are expected to have tags, which are ignored
    unlabeled_sents = [[(word, None) for word in sent] for sent in unlabeled_sents]
    print("Unsupervised training ({} sentences) for up to {} iterations".format(
        len(unlabeled_sents), max_iterations))
    hmm = trainer.train_unsupervised(unlabeled_sents, model=hmm,
                                     max_iterations=max_iterations,
                                     verbose=True)
    return hmm
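A sketch of calling the helper above on a real corpus; the corpus choice and split sizes are illustrative only:

from nltk.corpus import brown

labeled = brown.tagged_sents(categories='news')[:100]
unlabeled = brown.sents(categories='news')[100:200]
hmm = train_unsupervised(labeled, unlabeled, max_iterations=3)
print(hmm.tag(['the', 'dog', 'barked']))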
def generate(stars, n):
    print('Parsing reviews json into frequency distributions.')
    # input_dir, update_freq, num_training, words and models are assumed to
    # be defined at module level
    raw_reviews = open(os.path.join(input_dir, "yelp_academic_dataset_review.json"))
    count = 0
    # Parse each review object
    for raw_review in raw_reviews:
        count += 1
        if count % update_freq == 0:
            print('Parsing review number:', count)
        review_json = json.loads(raw_review)
        stars = int(review_json['stars'])
        text = review_json['text'].lower()
        if num_training > 0:
            if count < num_training:
                tokens = nltk.tokenize.word_tokenize(text)
                words[stars].append(tokens)
            else:
                break
        else:
            tokens = nltk.tokenize.word_tokenize(text)
            words[stars].append(tokens)
    print('Training ngram models')
    for x in range(1, 6):
        est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        models[x].append(NgramModel(3, words[x], estimator=est))
def lidstone_cond_freq(processed_freq, norm_len, k=.1):
    """Apply Lidstone smoothing to a ConditionalFreqDist object."""
    factory = lambda fd: LidstoneProbDist(fd, k, norm_len)
    return ConditionalProbDist(processed_freq, factory)
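A sketch of `lidstone_cond_freq` in use; the vocabulary size passed as `norm_len` is illustrative:

from nltk.probability import ConditionalFreqDist

cfd = ConditionalFreqDist([('NN', 'dog'), ('NN', 'cat'), ('VB', 'run')])
cpd = lidstone_cond_freq(cfd, norm_len=1000, k=0.1)
print(cpd['NN'].prob('dog'))   # smoothed P('dog' | 'NN')
print(cpd['NN'].prob('fish'))  # non-zero even though 'fish' was never seen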
def train(self):
    for sequence in self.labeled_sequence:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if symbol not in self.symbols:
                self.symbols.append(symbol)
            if lasts is None:
                self.init[state] += 1
            else:
                self.transition_bigram[lasts][state] += 1
            self.transition_unigram[state] += 1
            self.emission[state][symbol] += 1
            lasts = state

    N = len(self.states)
    st = LidstoneProbDist(self.init, gamma=self.gammaPrior)
    # We've modified the emission labeled data by replacing low-frequency
    # words with special classes in handle_lowfreq_words. We smooth the zero
    # probabilities of p[state][symbol] with add-k smoothing.
    em = ConditionalProbDist(self.emission, LidstoneProbDist,
                             gamma=self.gammaEmission,
                             bins=len(self.symbols))
    tr = ConditionalProbDist(self.transition_bigram, InterpolatedProbDist,
                             alpha1=self.alpha1, alpha2=self.alpha2,
                             unigram_freq=self.transition_unigram)
    return st, em, tr
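A minimal check of the `ConditionalProbDist` call pattern used above: extra keyword arguments are forwarded to the estimator for every condition:

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

cfd = ConditionalFreqDist([('NN', 'dog'), ('NN', 'cat'), ('VB', 'run')])
cpd = ConditionalProbDist(cfd, LidstoneProbDist, gamma=0.1, bins=50)
print(cpd['NN'].prob('dog'))  # each condition gets its own smoothed distribution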
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Pad each sentence with a start symbol <s> and an end symbol </s>.
    # Note: append the end symbol; insert(-1, ...) would place it before
    # the last real tag.
    for idx, s in enumerate(train_data):
        train_data[idx].insert(0, ('<s>', '<s>'))
        train_data[idx].append(('</s>', '</s>'))

    tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                     for s in train_data)
    data = itertools.chain.from_iterable(tagGenerators)

    transition_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, lidstone_estimator)
    return self.transition_PD
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    data = []
    for sent in train_data:
        for (word, tag) in sent:
            data.append((tag, word.lower()))
            self.states.append(tag)

    emission_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(set(self.states))
    return self.emission_PD, self.states
def generateNgramModel(corpusPath, corpusName):
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Bigrams offer a reasonable trade-off between context and data sparsity
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False, estimator)
    return ngrammodel
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Don't forget to lowercase the observation otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences

    # Repack the train_data to list(tuple(tag, lowercase_word)) format.
    # Note: chain.from_iterable (not chain) is needed to flatten the list
    # of sentences into a single stream of (word, tag) pairs.
    tagged_words = chain.from_iterable(train_data)
    data = [(tag, word.lower()) for (word, tag) in tagged_words]

    # Train the emission probability model
    emission_FD = ConditionalFreqDist(data)
    # Wrap the Lidstone estimator with gamma 0.01 and a proper bin number
    lidstone_PD = lambda FD: LidstoneProbDist(FD, gamma=0.01, bins=FD.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_PD)

    # Store the tags as states
    self.states = emission_FD.conditions()
    return self.emission_PD, self.states
def demo_pos_bw(test=10, supervised=20, unsupervised=10,
                verbose=True, max_iterations=5):
    # demonstrates the Baum-Welch algorithm in POS tagging
    print()
    print("Baum-Welch demo for POS tagging")
    print()

    print('Training HMM (supervised, %d sentences)...' % supervised)
    sentences, tag_set, symbols = load_pos(test + supervised + unsupervised)
    symbols = set()
    for sentence in sentences:
        for token in sentence:
            symbols.add(token[_TEXT])

    trainer = HiddenMarkovModelTrainer(tag_set, list(symbols))
    hmm = trainer.train_supervised(
        sentences[test:test + supervised],
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    hmm.test(sentences[:test], verbose=verbose)

    print('Training (unsupervised, %d sentences)...' % unsupervised)
    # it's rather slow - so only use 10 samples by default
    unlabeled = _untag(sentences[test + supervised:])
    hmm = trainer.train_unsupervised(unlabeled, model=hmm,
                                     max_iterations=max_iterations)
    hmm.test(sentences[:test], verbose=verbose)
def train():
    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
            break
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))

    # build a trigram language model over the words array,
    # smoothed with a Lidstone estimator (gamma=0.2)
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    # cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # The data object should be a list of tuples of conditions and
    # observations, in our case of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> AND THE END SYMBOL </s>
    data = []
    padded = [[("<s>", "<s>")] + s + [("</s>", "</s>")] for s in train_data]
    # Each pair is (word, tag), so keep the second element of each tuple
    tag_lists = [[tag for (word, tag) in sent] for sent in padded]
    for tags in tag_lists:
        data.extend(zip(tags[:-1], tags[1:]))

    # ConditionalProbDist with a LidstoneProbDist estimator
    transition_FD = ConditionalFreqDist(data)
    lidstone = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, lidstone)
    return self.transition_PD
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data
    # Don't forget to lowercase the observation otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    data = []
    for sent in train_data:
        data.extend((tag, word.lower()) for (word, tag) in sent)

    # ConditionalProbDist with a LidstoneProbDist estimator
    emission_FD = ConditionalFreqDist(data)
    lidstone = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone)

    # Get the set of states (tags), sorted for reproducibility
    self.states = sorted(set(tag for (tag, word) in data))
    return self.emission_PD, self.states
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # The data object should be a list of tuples of conditions and
    # observations, in our case of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> AND THE END SYMBOL </s>

    # Pad sentences with <s> at the beginning and </s> at the end
    padded = [[('<s>', '<s>')] + s + [('</s>', '</s>')] for s in train_data]
    # Reform the data into list[tuple(tag_(i), tag_(i+1))].
    # Note: chain.from_iterable (not chain) is needed here, otherwise
    # ConditionalFreqDist would receive per-sentence lists instead of pairs.
    data = chain.from_iterable([(s[i][1], s[i + 1][1])
                                for i in range(len(s) - 1)] for s in padded)

    # Compute the transition model: wrap the Lidstone estimator with
    # gamma 0.01 and a proper bin number
    lidstone_PD = lambda FD: LidstoneProbDist(FD, gamma=0.01, bins=FD.B() + 1)
    transition_FD = ConditionalFreqDist(data)
    # Store the trained conditional probability distribution
    self.transition_PD = ConditionalProbDist(transition_FD, lidstone_PD)
    return self.transition_PD
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Don't forget to lowercase the observation otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    new_data = []
    for x in range(len(train_data)):
        new_data += train_data[x]
    data = [(tag, word.lower()) for (word, tag) in new_data]

    # Compute the emission model
    emission_FD = ConditionalFreqDist(data)
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, est)
    # Materialise the states as a list (the docstring promises list(str))
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # The data object should be a list of tuples of conditions and
    # observations, in our case of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> AND THE END SYMBOL </s>
    data = []
    for sent in train_data:
        data.append(("<s>", sent[0][1]))  # start symbol
        for i in range(len(sent) - 1):
            data.append((sent[i][1], sent[i + 1][1]))
        data.append((sent[len(sent) - 1][1], "</s>"))  # end symbol

    transition_FD = ConditionalFreqDist(data)
    # same estimator as used for emission_model
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, est)
    return self.transition_PD
def _train(cls, labeled_sequence, test_sequence=None,
           unlabeled_sequence=None, **kwargs):
    transform = kwargs.get('transform', IdentityTransform())
    if isinstance(transform, types.FunctionType):
        transform = LambdaTransform(transform)
    elif not isinstance(transform, HiddenMarkovModelTaggerTransformI):
        raise TypeError('transform must be a function or a '
                        'HiddenMarkovModelTaggerTransformI instance')

    estimator = kwargs.get('estimator',
                           lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    labeled_sequence = LazyMap(transform.transform, labeled_sequence)
    symbols = list(set(word for sent in labeled_sequence
                       for word, tag in sent))
    tag_set = list(set(tag for sent in labeled_sequence
                       for word, tag in sent))

    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
    hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs,
              hmm._priors, transform=transform)

    if test_sequence:
        hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

    if unlabeled_sequence:
        max_iterations = kwargs.get('max_iterations', 5)
        hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm,
                                         max_iterations=max_iterations)
        if test_sequence:
            hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

    return hmm
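This `_train` appears to be the backend of the public `HiddenMarkovModelTagger.train` classmethod (older NLTK API with transforms), so typical use might look like this; corpus and split sizes are illustrative:

from nltk.corpus import treebank
from nltk.tag import HiddenMarkovModelTagger

train_sents = treebank.tagged_sents()[:100]
test_sents = treebank.tagged_sents()[100:110]
tagger = HiddenMarkovModelTagger.train(train_sents, test_sequence=test_sents)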
def build_lm(self, corpus, order=2):
    """
    Create a reasonable English language model on your training data.
    """
    self._lm_order = order
    if order > 0:
        tokens = []
        sentence_count = 0
        for e_sent, f_sent in corpus:
            if sentence_count % 100 == 0:
                print("LM Sentence %i" % sentence_count)
            sentence_count += 1
            # Each sentence starts with an empty string
            tokens += [''] + e_sent

        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
        self._lm = NgramModel(order, tokens, pad_left=False,
                              pad_right=False, estimator=estimator)
    else:
        self._lm = StubLanguageModel()
def HMM(data, symbols, tag_set, verbose=True):
    '''
    HMM(data, symbols, tag_set, verbose) -> model, prediction, report(dict).

    Keyword arguments:
    data: see preprocessing.py
    symbols: list of the input class labels
    tag_set: list of the output class labels
    for data structure see preprocessing.py
    '''
    trainer = hmm.HiddenMarkovModelTrainer(tag_set, symbols)
    tagger = trainer.train_supervised(
        data.y_train,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
    )
    y_pred = []
    for sentence in data.x_test:
        y_pred.append(tagger.tag(sentence))
    # unlike the test or evaluate functions from the same suite, this requires
    # a list of symbols, not tuples of symbols and tags
    y_pred = [[tup[1] for tup in sentence] for sentence in y_pred]
    print('HMM Results:')
    print(gen_rep_flat(data, y_pred, False))
    return tagger, y_pred, gen_rep_flat(data, y_pred, True)
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data: flatten train_data into one list of (tag, word)
    # pairs, lowercasing each word so it matches the test data
    data = []
    for x in train_data:
        data += [(tag, word.lower()) for (word, tag) in x]

    # Compute the emission model; Lidstone needs a bin parameter
    emission_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def _estimator(fdist, **estimator_kwargs):
    """
    Default estimator function using a LidstoneProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return LidstoneProbDist(fdist, 0.001, **estimator_kwargs)
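A quick check of what this default estimator does: with `gamma=0.001` the counts are barely perturbed, but unseen samples still receive a small non-zero probability:

from nltk.probability import FreqDist, LidstoneProbDist

fd = FreqDist('abracadabra')
pd = LidstoneProbDist(fd, 0.001, bins=26)
print(pd.prob('a'))  # ~0.4536, close to the MLE 5/11
print(pd.prob('z'))  # ~9.1e-05, small but non-zero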
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Don't forget to lowercase the observation otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    data = []
    for sent in train_data:  # for each sentence
        for word, tag in sent:  # for each (word, tag) pair in the sentence
            data.append((tag, word.lower()))  # list of tuples (tag, word)

    emission_FD = ConditionalFreqDist(data)
    # this is the estimator used for the probability distribution
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, est)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def _estimator(fdist, *estimator_args, **estimator_kwargs):
    """
    Default estimator function using a LidstoneProbDist
    (hard-wired to gamma=1, i.e. add-one smoothing, with 10 bins).
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    # return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
    return LidstoneProbDist(fdist, 1, 10)
def __init__(self, in_text):
    self.est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    self.tokenized_text = self.tokenizer.tokenize(in_text)
    self.content_model = nltk.model.ngram.NgramModel(3, self.tokenized_text,
                                                     estimator=self.est)
    self.text = ''
def build_model(word_string):
    # Normalise whitespace, then split into alphabetic words and
    # punctuation runs
    words = word_string.replace('\n', ' ').replace('\t', ' ')
    words = re.findall('[a-zA-Z]+|[%s]+' % string.punctuation, words)
    words = [w.strip() for w in words]
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    model = NgramModel(6, words, estimator=est)
    return model
def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist

    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator)
    print(lm)
    # print(lm.entropy(sent))
    text = lm.generate(100)
    import textwrap
    print('\n'.join(textwrap.wrap(' '.join(text))))
def lidstoneProbDist(olddf):
    """Use nltk to create a Lidstone probability distribution per document."""
    # http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
    # https://github.com/tuzzeg/detect_insults/blob/master/README.md
    print("Creating Lidstone ProbDist...", nltk.__version__)
    tutto = []
    olddf = pd.DataFrame(olddf['body'])
    print(type(olddf))
    for ind in olddf.index:
        print(ind)
        text = olddf.ix[ind, 'body']
        tokens = word_tokenize(text)
        t_fd = FreqDist(tokens)
        pdist = LidstoneProbDist(t_fd, 0.1)
        print(pdist.samples())
        input("HITKEY")
        # Only the token list is kept per document
        tutto.append(tokens)
    newdf = pd.DataFrame(tutto).set_index(0)
    newdf.columns = taglist  # taglist is assumed to be defined at module level
    print(newdf.head(20))
    print(newdf.describe())
    newdf.to_csv("../stumbled_upon/data/lidstone.csv")
def __init__(self, fd, bins, *factory_args):
    LidstoneProbDist.__init__(self, fd, 0.1, bins)
def prob(self, sample):
    # Lazily cache each probability so repeated lookups skip recomputation
    if sample not in self._probs:
        self._probs[sample] = LidstoneProbDist.prob(self, sample)
    return self._probs[sample]
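A sketch of the caching wrapper in use; `CachedLidstoneProbDist` is a hypothetical name for the subclass the `__init__` and `prob` above belong to:

from nltk.probability import FreqDist

fd = FreqDist(['the', 'cat', 'sat', 'the'])
pd = CachedLidstoneProbDist(fd, fd.B() + 1)  # hypothetical subclass
print(pd.prob('the'))  # computed via LidstoneProbDist.prob once...
print(pd.prob('the'))  # ...then served from the cache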