Example #1
    def _get_feature_vectors(
        cls,
        doc: Document,
        gamma: float,
        tf: Optional[Mapping[Word, float]] = None,
    ) -> List[np.ndarray]:
        word_fdist = FreqDist(doc.words)
        word_pdist = LidstoneProbDist(word_fdist, gamma)

        vecs = []
        for para in doc:
            for i, sent in enumerate(para):
                vec = []
                # Sentence position in paragraph
                if i == 0:
                    vec.append(1.)
                elif i == len(para) - 1:
                    vec.append(2. if len(para) == 2 else 3.)
                else:
                    vec.append(2.)
                # Number of terms
                vec.append(math.log(len(sent) + 1))
                # Probability of terms in document
                vec.append(sum(math.log(word_pdist.prob(w)) for w in sent))
                # Probability of terms in a baseline document
                if tf is not None:
                    vec.append(sum(math.log(tf[w]) for w in sent if w in tf))
                vecs.append(np.array(vec))
        return vecs
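The document-level distribution used in Example #1 is just a Lidstone-smoothed unigram model over the document's words. A minimal self-contained sketch (the toy word list is hypothetical) of what LidstoneProbDist returns for seen and unseen words:

from nltk.probability import FreqDist, LidstoneProbDist
import math

word_fdist = FreqDist(["the", "cat", "sat", "on", "the", "mat"])
word_pdist = LidstoneProbDist(word_fdist, 0.1)    # add-0.1 (Lidstone) smoothing
print(word_pdist.prob("the"))                     # smoothed unigram probability
print(math.log(word_pdist.prob("unicorn")))       # finite; unseen words keep non-zero mass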
Example #2
    def train(self, labeled_featuresets):
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                feature_freqdist[label, fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)

        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        label_probdist = LidstoneProbDist(label_freqdist, 0, bins=None)

        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = LidstoneProbDist(freqdist,
                                        0,
                                        bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return self(label_probdist, feature_probdist)
Example #3
    def __init__(self, fd, *args, **kwargs):
        LidstoneProbDist.__init__(self, fd, 0.01, args[-1])
        samples = fd.samples()
        # Pre-fill the caches with zeros, keyed by sample.
        self._probs = dict(zip(samples, [0] * len(samples)))
        self._logprobs = dict(zip(samples, [0] * len(samples)))
        for sample in samples:
            self._logprobs[sample] = LidstoneProbDist.logprob(self, sample)
            self._probs[sample] = LidstoneProbDist.prob(self, sample)
Example #4
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """

        # prepare data
        # transform a list of lists of tuples to list of tuples
        train_data = [item for sublist in train_data for item in sublist]

        # The data will be a list of (tag, word) pairs; we use it to count the frequency of
        # each word given its tag, which is then used to estimate the emission probabilities.
        # Don't forget to lowercase the observation otherwise it mismatches the test data.
        # The data object should be a list of (condition, observation) tuples,
        # in our case (tag, word) where the words are lowercased.
        data = [(tag, word.lower()) for (word, tag) in train_data]

        # compute the emission model
        # compute a Conditional Frequency Distribution for words given their tags using our data
        emission_FD = ConditionalFreqDist(data)

        # Compute the Conditional Probability Distribution using the above Conditional Frequency Distribution.
        # Use LidstoneProbDist estimator.
        #self.emission_PD = ConditionalProbDist(emission_FD, LidstoneProbDist, 0.01, bin)
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
        self.states = list(emission_FD.keys())

        return self.emission_PD, self.states
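The same emission-model recipe can be exercised outside the class. A minimal hedged sketch with toy tagged sentences (the tags and words are hypothetical):

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

toy_train_data = [[("The", "DT"), ("dog", "NN")], [("a", "DT"), ("cat", "NN")]]
toy_data = [(tag, word.lower()) for sent in toy_train_data for (word, tag) in sent]
emission_FD = ConditionalFreqDist(toy_data)
lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
print(emission_PD["NN"].prob("dog"))     # smoothed P(word | tag)
print(emission_PD["NN"].prob("horse"))   # unseen word still gets non-zero mass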
Example #5
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # Prepare the data

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences
        data = [(tag, word.lower()) for pairs in train_data
                for (word, tag) in pairs]

        # Compute the emission model
        emission_FD = ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)

        for tag, word in data:
            if tag not in self.states:
                self.states.append(tag)

        return self.emission_PD, self.states
Example #6
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        # Prepare the data
        data = []
        tags = []

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
        for s in train_data:
            start = ["<s>"]
            start.extend([tag for (word, tag) in s])
            start.extend(["</s>"])
            tags.extend(start)

        for i in range(len(tags) - 1):
            data.append((tags[i], tags[i + 1]))

        # Compute the transition model
        transition_FD = ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.transition_PD = ConditionalProbDist(transition_FD,
                                                 lidstone_estimator)

        return self.transition_PD
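A corresponding hedged sketch of the transition-model recipe with toy data, showing the <s>/</s> padding and how the resulting ConditionalProbDist is queried:

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

toy_train_data = [[("The", "DT"), ("dog", "NN")], [("a", "DT"), ("cat", "NN")]]
tag_seqs = [["<s>"] + [tag for (_, tag) in sent] + ["</s>"] for sent in toy_train_data]
toy_data = [(seq[i], seq[i + 1]) for seq in tag_seqs for i in range(len(seq) - 1)]
transition_FD = ConditionalFreqDist(toy_data)
lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
transition_PD = ConditionalProbDist(transition_FD, lidstone_estimator)
print(transition_PD["<s>"].prob("DT"))   # smoothed P(first tag | sentence start)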
Example #7
def train_unsupervised(labeled_sents, unlabeled_sents, max_iterations=3):
    symbols = unique_list(word for sent in labeled_sents for word, tag in sent)
    # Extend symbols with those in the unlabelled set
    symbols = unique_list(symbols + unique_list(word
                                                for sent in unlabeled_sents
                                                for word in sent))
    tag_set = unique_list(tag for sent in labeled_sents for word, tag in sent)

    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    print("Supervised training for initialization ({} sentences)".format(
        len(labeled_sents)))
    hmm = trainer.train_supervised(
        labeled_sents,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    # The unlabeled sentences are expected to have tags, which are ignored
    unlabeled_sents = [[(word, None) for word in sent]
                       for sent in unlabeled_sents]
    print(
        "Unsupervised training ({} sentences) for up to {} iterations".format(
            len(unlabeled_sents), max_iterations))
    hmm = trainer.train_unsupervised(unlabeled_sents,
                                     model=hmm,
                                     max_iterations=max_iterations,
                                     verbose=True)
    return hmm
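A hypothetical toy invocation of the train_unsupervised function above (assumes the function and its unique_list helper are in scope; Baum-Welch prints per-iteration log-likelihoods because verbose=True):

toy_labeled = [[("the", "DT"), ("dog", "NN")], [("a", "DT"), ("cat", "NN")]]
toy_unlabeled = [["the", "cat"], ["a", "dog"]]
toy_hmm = train_unsupervised(toy_labeled, toy_unlabeled, max_iterations=2)
print(toy_hmm.tag(["the", "dog"]))   # e.g. [('the', 'DT'), ('dog', 'NN')]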
Example #8
def generate(stars, n):
  ...  # body omitted in the original snippet

print 'Parsing reviews json into frequency distributions.'
raw_reviews = open(os.path.join(input_dir, "yelp_academic_dataset_review.json"))
# Parses a review object
for raw_review in raw_reviews:
  count += 1
  if count % update_freq == 0:
    print 'Parsing review number:',  count

  review_json = json.loads(raw_review)
  stars = int(review_json['stars'])
  text = review_json['text'].lower()

  if num_training > 0: 
    if count < num_training:
      tokens = nltk.tokenize.word_tokenize(text)
      words[stars].append(tokens)
    else:
      break
  else:
    tokens = nltk.tokenize.word_tokenize(text)
    words[stars].append(tokens)
    
print 'Training ngram models'
for x in xrange(1, 6):
  est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
  models[x].append(NgramModel(3, words[x], estimator=est))
Example #9
def lidstone_cond_freq(processed_freq, norm_len, k=.1):
    """
    Apply Lidstone to a ConditionalFreq() object
    """

    factory = lambda fd: LidstoneProbDist(fd, k, norm_len)
    return ConditionalProbDist(processed_freq, factory)
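A hypothetical usage of lidstone_cond_freq, assuming the helper and its NLTK imports are in scope; norm_len is passed as the bins parameter, here the vocabulary size plus one to reserve mass for unseen words:

from nltk.probability import ConditionalFreqDist

tagged = [("NN", "dog"), ("NN", "cat"), ("DT", "the"), ("DT", "a"), ("DT", "the")]
cfd = ConditionalFreqDist(tagged)
vocab_size = len(set(word for _, word in tagged))
cpd = lidstone_cond_freq(cfd, norm_len=vocab_size + 1, k=0.1)
print(cpd["NN"].prob("dog"))      # smoothed P(word | tag='NN')
print(cpd["DT"].prob("mouse"))    # small but non-zero for an unseen word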
Example #10
    def train(self):
        for sequence in self.labeled_sequence:
            lasts = None
            for token in sequence:
                state = token[1]
                symbol = token[0]
                if symbol not in self.symbols:
                    self.symbols.append(symbol)
                if lasts is None:
                    self.init[state] += 1
                else:
                    self.transition_bigram[lasts][state] += 1
                    self.transition_unigram[state] += 1
                self.emission[state][symbol] += 1
                lasts = state

        N = len(self.states)
        st = LidstoneProbDist(self.init, gamma=self.gammaPrior)
        # We've modified the emission labeled data by replacing low frequency words
        # with ones in hanlde_lowfreq_words. We smooth the zero probabilities of
        # p[state][symbol] with add-K smoothing.
        em = ConditionalProbDist(self.emission,
                                 LidstoneProbDist,
                                 gamma=self.gammaEmission,
                                 bins=len(self.symbols))
        tr = ConditionalProbDist(self.transition_bigram,
                                 InterpolatedProbDist,
                                 alpha1=self.alpha1,
                                 alpha2=self.alpha2,
                                 unigram_freq=self.transition_unigram)
        return st, em, tr
Example #11
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        for idx, s in enumerate(train_data):
            train_data[idx].insert(0, ('<s>', '<s>'))
            train_data[idx].append(('</s>', '</s>'))

        tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                         for s in train_data)
        data = itertools.chain.from_iterable(tagGenerators)

        transition_FD = ConditionalFreqDist(data)
        lidstone_estimator = lambda emission_FD: LidstoneProbDist(
            emission_FD, 0.01,
            emission_FD.B() + 1)

        self.transition_PD = ConditionalProbDist(transition_FD,
                                                 lidstone_estimator)

        return self.transition_PD
Example #12
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        data = []
        #[[(tag, word.lower()) for (word, tag) in sent]for sent in train_data]
        for sent in train_data:
            for (word, tag) in sent:
                data.append((tag, word.lower()))
                self.states.append(tag)

        emission_FD = ConditionalFreqDist(data)

        lidstone_estimator = lambda emission_FD: LidstoneProbDist(
            emission_FD, 0.01,
            emission_FD.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
        self.states = list(set(self.states))

        return self.emission_PD, self.states
Example #13
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus.
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False,
                            estimator)  #uses bigrams just cause they BETTER
    return ngrammodel
Example #14
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # raise NotImplementedError('HMM.emission_model')

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences

        # Repack the train_data to list(tuple(tag, lowercase_word)) format
        tagged_words = chain.from_iterable(train_data)
        data = [(tag, word.lower()) for (word, tag) in tagged_words]

        # Train the emission probabilistic model
        emission_FD = ConditionalFreqDist(data)
        # Wrap the Lidstone estimator with gamma 0.01 and a proper bin number
        lidstone_PD = lambda FD: LidstoneProbDist(
            FD, gamma=0.01, bins=FD.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_PD)
        # Store the tags as states
        self.states = emission_FD.conditions()

        return self.emission_PD, self.states
Example #15
def demo_pos_bw(test=10,
                supervised=20,
                unsupervised=10,
                verbose=True,
                max_iterations=5):
    # demonstrates the Baum-Welch algorithm in POS tagging

    print()
    print("Baum-Welch demo for POS tagging")
    print()

    print('Training HMM (supervised, %d sentences)...' % supervised)

    sentences, tag_set, symbols = load_pos(test + supervised + unsupervised)

    symbols = set()
    for sentence in sentences:
        for token in sentence:
            symbols.add(token[_TEXT])

    trainer = HiddenMarkovModelTrainer(tag_set, list(symbols))
    hmm = trainer.train_supervised(
        sentences[test:test + supervised],
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    hmm.test(sentences[:test], verbose=verbose)

    print('Training (unsupervised, %d sentences)...' % unsupervised)
    # it's rather slow - so only use 10 samples by default
    unlabeled = _untag(sentences[test + supervised:])
    hmm = trainer.train_unsupervised(unlabeled,
                                     model=hmm,
                                     max_iterations=max_iterations)
    hmm.test(sentences[:test], verbose=verbose)
Example #16
def train():
    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
            break
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    # build a trigram Language Model (using a Lidstone-smoothed
    # estimator) with the words array
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    #  langModel = NgramModel(3, words)
    #  cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
Example #17
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
        data = []
        data_start = [[("<s>", "<s>")] + w + [("</s>", "</s>")]
                      for w in train_data]
        # Each pair in a sentence is (word, tag), so unpacking as (tag, word) below selects the tag.
        words = [[word for (tag, word) in wordlist] for wordlist in data_start]

        for word in words:
            new_data = list(zip(word[:-1], word[1:]))
            data.append(new_data)

        data_new = []
        for a in data:
            for b in a:
                data_new.append(b)

        # ConditionalProbDist with a LidstoneProbDist estimator
        transition_FD = ConditionalFreqDist(data_new)
        lidstone = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.transition_PD = ConditionalProbDist(transition_FD, lidstone)
        return self.transition_PD
Example #18
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # TODO prepare data
        data = []
        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences
        for i in train_data:
            data_p = list(map(lambda a: (a[1], a[0].lower()), i))
            data.extend(data_p)

        # ConditionalProbDist with a LidstoneProbDist estimator
        emission_FD = ConditionalFreqDist(data)
        lidstone = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone)
        # Get states for each word
        self.states = list(set([tag for (tag, word) in data]))
        self.states.sort()

        return self.emission_PD, self.states
Example #19
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        # raise NotImplementedError('HMM.transition_model')

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>

        # Padding sentences with <s> at beginning and </s> at end
        padded = [[('<s>', '<s>')] + s + [('</s>', '</s>')]
                  for s in train_data]

        # Reform the data into list[tuple(tag_(i), tag_(i+1))]
        data = chain.from_iterable([(s[i][1], s[i + 1][1])
                                    for i in range(len(s) - 1)]
                                   for s in padded)

        # Compute the transition model
        # Wrap the Lidstone estimator with gamma 0.01 and a proper bin number
        lidstone_PD = lambda FD: LidstoneProbDist(
            FD, gamma=0.01, bins=FD.B() + 1)
        transition_FD = ConditionalFreqDist(data)

        # Store the trained conditional probability distribution model
        self.transition_PD = ConditionalProbDist(transition_FD, lidstone_PD)

        return self.transition_PD
Example #20
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        #raise NotImplementedError('HMM.emission_model')

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences

        new_data = []
        for x in range(len(train_data)):
            new_data += train_data[x]

        data = [(tag, word.lower()) for (word, tag) in new_data]
        # print(data[:20])
        # COMPLETED compute the emission model
        emission_FD = ConditionalFreqDist(data)
        est = lambda emission_FD: LidstoneProbDist(emission_FD, 0.01,
                                                   emission_FD.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, est)
        self.states = emission_FD.keys()
        #print(self.states[0])

        return self.emission_PD, self.states
Example #21
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        #raise NotImplementedError('HMM.transition_model')

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
        data = []
        for sent in train_data:
            data.append(("<s>", sent[0][1]))  #start symbol
            for i in range(len(sent) - 1):
                data.append((sent[i][1], sent[i + 1][1]))
            data.append((sent[len(sent) - 1][1], "</s>"))  #end symbol

        transition_FD = ConditionalFreqDist(data)
        #same estimator used for emission_model
        est = lambda transition_FD: LidstoneProbDist(transition_FD, 0.01,
                                                     transition_FD.B() + 1)
        self.transition_PD = ConditionalProbDist(transition_FD, est)

        return self.transition_PD
Example #22
    def _train(cls, labeled_sequence, test_sequence=None,
               unlabeled_sequence=None, **kwargs):
        transform = kwargs.get('transform', IdentityTransform())
        if isinstance(transform, types.FunctionType):
            transform = LambdaTransform(transform)
        elif not isinstance(transform, HiddenMarkovModelTaggerTransformI):
            raise TypeError('transform must be a function or a '
                            'HiddenMarkovModelTaggerTransformI instance')

        estimator = kwargs.get('estimator',
                               lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

        labeled_sequence = LazyMap(transform.transform, labeled_sequence)
        symbols = list(set(word for sent in labeled_sequence
            for word, tag in sent))
        tag_set = list(set(tag for sent in labeled_sequence
            for word, tag in sent))

        trainer = HiddenMarkovModelTrainer(tag_set, symbols)
        hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
        hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs,
                  hmm._priors, transform=transform)

        if test_sequence:
            hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

        if unlabeled_sequence:
            max_iterations = kwargs.get('max_iterations', 5)
            hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm,
                max_iterations=max_iterations)
            if test_sequence:
                hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

        return hmm
Example #23
    def build_lm(self, corpus, order=2):
        """
        Create a reasonable English language model on your training data.
        """

        self._lm_order = order
        if order > 0:
            tokens = []
            sentence_count = 0
            for e_sent, f_sent in corpus:
                if sentence_count % 100 == 0:
                    print("LM Sentence %i" % sentence_count)
                sentence_count += 1

                # Each sentence starts with an empty string
                tokens += [''] + e_sent

            estimator = lambda fdist, bins: \
                LidstoneProbDist(fdist, 0.1)
            self._lm = NgramModel(order,
                                  tokens,
                                  pad_left=False,
                                  pad_right=False,
                                  estimator=estimator)
        else:
            self._lm = StubLanguageModel()
Example #24
def HMM(data, symbols, tag_set, verbose=True):
    '''
    HMM(data, symbols, tag_set, verbose) -> model, prediction, report(dict).
    Keyword arguments:
        data: see preprocessing.py
        symbols: list of the input class labels
        tag_set: list of the output class labels
    for data structure see preprocessing.py
    '''
    trainer = hmm.HiddenMarkovModelTrainer(tag_set, symbols)
    tagger = trainer.train_supervised(
        data.y_train,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
    )

    y_pred = []
    for sentence in data.x_test:
        y_pred.append(tagger.tag(sentence))

    # unlike the test or evaluate functions from the same suite, this requires
    # a list of symbols, not tuples of symbols and tags
    y_pred = [[tup[1] for tup in sentence] for sentence in y_pred]

    print('HMM Results:')
    print(gen_rep_flat(data, y_pred, False))
    return tagger, y_pred, gen_rep_flat(data, y_pred, True)
Example #25
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # print(train_data)
        # TODO prepare data

        # Don't forget to lowercase the observation otherwise it mismatches the test data

        # I want to make train_data into one list of tagged words of type tuple(str, str)
        data = []
        for x in train_data:
            # data += [(tag, word.lower() if word.isalpha() else (tag, word)) for (word, tag) in x]  # lower case and check word
            data += [(tag, word.lower()) for (word, tag) in x]  # lower case

        # TODO compute the emission model
        emission_FD = ConditionalFreqDist(data)
        # need Lidstone bin parameter
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
        
        self.states = emission_FD.keys()

        return self.emission_PD, self.states
Example #26
def _estimator(fdist, **estimator_kwargs):
    """
    Default estimator function using a LidstoneProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return LidstoneProbDist(fdist, 0.001, **estimator_kwargs)
Example #27
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.
        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        #raise NotImplementedError('HMM.emission_model')

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences

        data = []
        for sent in train_data:  #for each sentence
            for tuples in sent:  #for each pair of (word,tag) in every sentence
                data.append(
                    (tuples[1], tuples[0].lower()))  #list of tuples(tag,word)

        emission_FD = ConditionalFreqDist(data)
        # this is the estimator used for the probability distribution
        est = lambda emission_FD: LidstoneProbDist(emission_FD, 0.01,
                                                   emission_FD.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, est)
        self.states = list(emission_FD.keys())
        #print(self.states)

        return self.emission_PD, self.states
Example #28
def _estimator(fdist, *estimator_args, **estimator_kwargs):
    """
    Default estimator function using a SimpleGoodTuringProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    #    return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
    return LidstoneProbDist(fdist, 1, 10)
Example #29
    def __init__(self, in_text):
        self.est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')

        self.tokenized_text = self.tokenizer.tokenize(in_text)
        self.content_model = nltk.model.ngram.NgramModel(3,
                                                         self.tokenized_text,
                                                         estimator=self.est)
        self.text = ''
Example #30
def build_model(word_string):
	words = word_string.replace('\n',' ').replace('\t',' ')
	#split_delim = "|".join(["\%s" % s for s in string.punctuation + " "])
	#words = re.split(split_delim,words)
	words = re.findall('[a-zA-Z]+|[%s]+' % string.punctuation, words)
	words = [w.strip() for w in words]
	est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
	model = NgramModel(6, words, estimator=est)
	return model
Example #31
def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator)
    print lm
    #print lm.entropy(sent)
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))
Example #32
def lidstoneProbDist(olddf):
    """
    Use nltk to create probdist
    """
    #http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
    #https://github.com/tuzzeg/detect_insults/blob/master/README.md
    print "Creating LidStone Probdist...",nltk.__version__
    tutto=[]
    
    #olddf = olddf.ix[random.sample(olddf.index, 10)]
    olddf=pd.DataFrame(olddf['body'])
    
    print type(olddf)
    for ind in olddf.index:
        print ind
        row=[]
        row.append(ind)
        text=olddf.ix[ind,'body']
        tokens=word_tokenize(text)
        #print tokens

        t_fd = FreqDist(tokens)
        pdist = LidstoneProbDist(t_fd,0.1)
        print pdist.samples()
        #for tok in tokens:
        #    print pdist[3][tok]
        #t_fd.plot(cumulative=False)
        raw_input("HITKEY")
        row=tokens
        #print tagged
        #print len(tagged)

        tutto.append(row)
    newdf=pd.DataFrame(tutto).set_index(0)
    newdf.columns=taglist
    print newdf.head(20)
    print newdf.describe()
    newdf.to_csv("../stumbled_upon/data/lidstone.csv")
Example #33
    def __init__(self, fd, bins, *factory_args):
        LidstoneProbDist.__init__(self, fd, 0.1, bins)
Example #34
    def prob(self, sample):
        if sample not in self._probs:
            self._probs[sample] = LidstoneProbDist.prob(self, sample)
        return self._probs.get(sample)
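Examples #33 and #34 are fragments of the same idea as Example #3: subclass LidstoneProbDist and cache prob()/logprob() lookups. A minimal self-contained sketch of that pattern, assuming only NLTK's FreqDist and LidstoneProbDist API (the class name and gamma are hypothetical):

from nltk.probability import FreqDist, LidstoneProbDist

class CachedLidstoneProbDist(LidstoneProbDist):
    """LidstoneProbDist that memoizes prob() and logprob() results."""

    def __init__(self, fd, gamma=0.1, bins=None):
        LidstoneProbDist.__init__(self, fd, gamma, bins)
        self._prob_cache = {}
        self._logprob_cache = {}

    def prob(self, sample):
        if sample not in self._prob_cache:
            self._prob_cache[sample] = LidstoneProbDist.prob(self, sample)
        return self._prob_cache[sample]

    def logprob(self, sample):
        if sample not in self._logprob_cache:
            self._logprob_cache[sample] = LidstoneProbDist.logprob(self, sample)
        return self._logprob_cache[sample]

fd = FreqDist("abracadabra")
pd = CachedLidstoneProbDist(fd, gamma=0.1, bins=fd.B() + 1)
print(pd.prob("a"), pd.prob("z"))   # a second call for either sample hits the cache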