def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff,
           support_cutoff, binary=False, feature_values=None, verbose=False):
    if len(labeled_featuresets) <= support_cutoff:
        return
    if self._fname is None:
        return
    if depth_cutoff <= 0:
        return
    for fval in self._decisions:
        fval_featuresets = [(featureset, label)
                            for (featureset, label) in labeled_featuresets
                            if featureset.get(self._fname) == fval]

        label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
        if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
            self._decisions[fval] = DecisionTreeClassifier.train(
                fval_featuresets, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)
    if self._default is not None:
        default_featuresets = [(featureset, label)
                               for (featureset, label) in labeled_featuresets
                               if featureset.get(self._fname) not in self._decisions]
        label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
        if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
            self._default = DecisionTreeClassifier.train(
                default_featuresets, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)
def __init__(self, freqdist, bins=None):
    MLEProbDist.__init__(self, freqdist, bins)
    self._probarray = np.zeros((len(freqdist),))
    self._probmap = {}
    for i, item in enumerate(freqdist.keys()):
        self._probarray[i] = freqdist.freq(item)
        self._probmap[i] = item
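A minimal, self-contained sketch of how such a cached-array wrapper could be used for sampling; the class name CachedMLEProbDist and the generate override below are assumptions for illustration, not part of the original code.

# Sketch (assumed API): a hypothetical MLEProbDist subclass that caches
# relative frequencies in a numpy array so samples can be drawn with numpy.
import numpy as np
from nltk.probability import FreqDist, MLEProbDist

class CachedMLEProbDist(MLEProbDist):  # hypothetical name
    def __init__(self, freqdist, bins=None):
        MLEProbDist.__init__(self, freqdist, bins)
        self._probarray = np.zeros((len(freqdist),))
        self._probmap = {}
        for i, item in enumerate(freqdist.keys()):
            self._probarray[i] = freqdist.freq(item)
            self._probmap[i] = item

    def generate(self):
        # draw an index according to the cached probabilities
        idx = np.random.choice(len(self._probarray), p=self._probarray)
        return self._probmap[idx]

fd = FreqDist('abracadabra')
pd = CachedMLEProbDist(fd)
print(pd.prob('a'), pd.generate())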
def generate_alternative(self, n):
    """ Generate n words using a more complicated algorithm """
    generated_tags = []
    generated_lemmas = []
    generated_words = []

    # Incrementally generate (tag, lemma) pairs
    for i in range(n):
        tag_choice = None  # Start with nothing
        # Loop through n-grams of grammar
        size = 2 * self._n
        while size > 2:
            tag_choices = self._tags_ngram.backoff_search(
                generated_tags, backoff_limit=2,
                predicate=lambda tag: True, start_n=size)
            # Determine valid lemmas in context with these tag choices
            tag_to_lemma = {}
            if tag_choices is not None:
                for tag, _ in tag_choices.items():
                    # For each tag, find valid lemmas in context with that tag
                    lemma = self._lemmas_ngram.choose_word(
                        generated_lemmas, backoff_limit=2,
                        predicate=lambda lemma: lemma in self._tag_lemmas[tag])
                    if lemma is not None:
                        tag_to_lemma[tag] = lemma
            if len(tag_to_lemma) > 1:
                # We have found valid (tag, lemma) pairs
                tag_probdist = MLEProbDist(FreqDist(
                    {tag: freq for tag, freq in tag_choices.items()
                     if tag in tag_to_lemma}))
                tag_choice = tag_probdist.generate()     # Randomly select the tag
                lemma_choice = tag_to_lemma[tag_choice]  # Set the lemma
                break
            size -= 1  # Lower to smaller n-gram for more tag choices
        if tag_choice is None:
            # We still didn't find a valid (tag, lemma) pair, fallback
            tag_choice = MLEProbDist(tag_choices).generate()
            lemma_choice = MLEProbDist(self._tag_lemmas[tag_choice]).generate()
        generated_tags.append(tag_choice)
        generated_lemmas.append(lemma_choice)

    # Generate all words based on (tag, lemma) pairs
    for (tag, lemma) in zip(generated_tags, generated_lemmas):
        # Search for and choose word with correct lemma/tag
        choices = self._words_ngram.backoff_search(
            generated_words, backoff_limit=2,
            predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
        if choices is None:
            # Could not find a good word, choose from list
            choices = self._tag_lemma_words[(tag, lemma)]
        generated_words.append(MLEProbDist(choices).generate())

    return list(self._word_ids.transform_ids(generated_words))
def add_individual(number_individuals, res_address, diagnosis):
    total_individuals = []
    new_address = res_address.sample(number_individuals).to_dict('records')
    for idx in xrange(number_individuals):
        diagnosis_freq_dist = FreqDist(diagnosis)
        diagnosis_prob_dist = MLEProbDist(diagnosis_freq_dist)
        diagnosis_random = diagnosis_prob_dist.generate()
        full_address = (new_address[idx]['ADDR_FULL'] + '|' +
                        new_address[idx]['CTYNAME'] + '|' +
                        new_address[idx]['ZIP5'])
        gender, age = get_gender_age(new_address[idx])
        new_individual = {'Date_Inf': current_date,
                          'Gender': gender,
                          'Age': age,
                          'Census_Tract': new_address[idx]['GEOID'],
                          'Address': full_address,
                          'LON': new_address[idx]['LON'],
                          'LAT': new_address[idx]['LAT'],
                          'Diagnosis': diagnosis_random}
        total_individuals.append(new_individual)
    return pd.DataFrame.from_records(total_individuals)
def get_gender_age(full_address):
    GEOID = full_address['GEOID']
    try:
        age_gender_dist = KC_age_gender.loc[[GEOID]].loc[:, 'M0-4':'F85-120']
        age_gender_freq_dist = FreqDist(age_gender_dist)
        age_gender_prob_dist_age_gender = MLEProbDist(age_gender_freq_dist)
        age_gender_random = age_gender_prob_dist_age_gender.generate()
        gender = age_gender_random[0]
        age = age_gender_random[1:]
        return gender, age
    except:
        return np.nan, np.nan
def gen_sent(ngram):
    global lis
    # "n" contains the ngram number
    n = lis[1]
    # number of required sentences is stored in sent_num
    sent_num = lis[2]
    i = 0
    for i in range(sent_num):
        j = True
        # we are using this window to go through the sentence with n-1 previous
        # words stored in the window
        window = []
        sent = ""
        for size in range(n - 1):
            window.append('<start>')
        while j == True:
            tup_win = tuple(window)
            if tup_win not in ngram.keys():
                sys.exit("We don't have a start line")
            # FreqDist and MLEProbDist transform the frequencies into probabilities
            # by computing (item freq / sum of frequencies)
            freq_dist = FreqDist(ngram[tup_win])
            # prob_dist.generate() takes the frequency distribution and generates
            # a random token according to the distribution
            prob_dist = MLEProbDist(freq_dist)
            next_w = prob_dist.generate()
            # the following condition is used to detect the end of line
            if next_w == "." or next_w == "?" or next_w == "!":
                j = False
                sent += next_w
                continue
            # We'd like to make sure the apostrophe token has no space before or after it,
            # as well as the beginning of the line
            elif (next_w == "m" or next_w == "s" or next_w == "re" or next_w == ","
                  or next_w == "’" or next_w == "ve" or next_w == "t"
                  or tup_win[-1] == '<start>'):
                sent += next_w
            else:
                sent += " %s" % next_w
            # moving the window forward by popping and appending
            window.pop(0)
            window.append(next_w)
        print("\nSentence %s:\n%s" % (i + 1, sent))
def _estimator(fdist, bins):
    """
    Default estimator function using an MLEProbDist.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return MLEProbDist(fdist)
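For context, a minimal sketch (assuming NLTK 3.x) of the basic FreqDist-to-MLEProbDist pattern these estimator helpers wrap, and the two-argument (fdist, bins) form expected by ConditionalProbDist; the toy data below is invented for illustration.

# Minimal sketch, assuming NLTK 3.x: MLEProbDist assigns each sample its
# relative frequency, and ConditionalProbDist accepts a (fdist, bins)
# callable as its estimator, with bins passed as an extra argument.
from nltk.probability import (FreqDist, MLEProbDist,
                              ConditionalFreqDist, ConditionalProbDist)

fd = FreqDist(['the', 'cat', 'the', 'dog'])
pd = MLEProbDist(fd)
print(pd.prob('the'))   # 0.5 = 2/4
print(pd.max())         # 'the'

cfd = ConditionalFreqDist([('DET', 'the'), ('DET', 'a'), ('NOUN', 'cat')])
cpd = ConditionalProbDist(cfd, lambda fdist, bins: MLEProbDist(fdist), len(cfd))
print(cpd['DET'].prob('the'))   # 0.5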
def TransitionsGenerate(AddCorpus, train_p, tagger, estimator):
    # recalculate the transition matrix using train plain text + additional corpus
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)
    data = train_p
    data.extend(AddCorpus)
    print(type(data))
    for s in data:
        s = list(s)
    N = len(tagger._states)
    transitions = ConditionalFreqDist()
    for sentence in data:
        lasts = None
        sentence = list(sentence.strip('\n'))
        for character in sentence:
            state = character
            if lasts is not None:
                transitions[lasts][state] += 1
            lasts = state
    A = ConditionalProbDist(transitions, estimator, N)
    return A
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be
    smoothed if desired.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param labelled_sequences: the training data, a set of
        labelled sequences of observations
    :type labelled_sequences: list
    :param kwargs: may include an 'estimator' parameter, a function taking
        a FreqDist and a number of bins and returning a CProbDistI;
        otherwise a MLE estimate is used
    """
    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
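Note that the .inc() calls above are the old NLTK 2.x counting API. A hedged sketch of the same counting step under NLTK 3.x, where FreqDist and ConditionalFreqDist behave like Counters (assumption: only the counting calls change, the rest of the training procedure is unchanged):

# Sketch of the equivalent counting step in NLTK 3.x (toy labelled data,
# invented for illustration).
from nltk.probability import FreqDist, ConditionalFreqDist

starting = FreqDist()
transitions = ConditionalFreqDist()
outputs = ConditionalFreqDist()

for sequence in [[('the', 'DET'), ('cat', 'NOUN')]]:
    lasts = None
    for symbol, state in sequence:
        if lasts is None:
            starting[state] += 1            # was: starting.inc(state)
        else:
            transitions[lasts][state] += 1  # was: transitions[lasts].inc(state)
        outputs[state][symbol] += 1         # was: outputs[state].inc(symbol)
        lasts = state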
def __init__(self, n, train, estimator=None):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text. An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{list} of C{string}
    @param estimator: a function for generating a probability distribution
    @type estimator: a function that takes a C{ConditionalFreqDist} and
        returns a C{ConditionalProbDist}
    """
    self._n = n

    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    cfd = ConditionalFreqDist()
    self._ngrams = set()
    self._prefix = ('',) * (n - 1)

    for ngram in ingrams(chain(self._prefix, train), n):
        self._ngrams.add(ngram)
        context = tuple(ngram[:-1])
        token = ngram[-1]
        cfd[context].inc(token)

    self._model = ConditionalProbDist(cfd, estimator, len(cfd))

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator)
def trainModelLM(laplace, symbols, train_output, train_transition):
    extra_set = []
    for i in symbols:
        for j in symbols:
            extra_set.append((i, j))

    transition = suppleText(train_transition)
    initial = []
    output = []
    for i in range(len(train_output)):
        initial.append(train_output[i][0][1])
        for j in range(len(train_output[i])):
            output.append(train_output[i][j])

    if laplace:
        transition += extra_set
        initial += symbols
        output += extra_set

    transition_cfd = ConditionalFreqDist(transition)
    transition_cqd = ConditionalProbDist(transition_cfd, MLEProbDist)
    inital_cfd = FreqDist(initial)
    initial_cqd = MLEProbDist(inital_cfd)
    output_cfd = ConditionalFreqDist(output)
    output_cqd = ConditionalProbDist(output_cfd, MLEProbDist)

    model = hmm.HiddenMarkovModelTagger(symbols=symbols, states=symbols,
                                        transitions=transition_cqd,
                                        outputs=output_cqd,
                                        priors=initial_cqd)
    return model
def plot_word_dist_as_cloud(word_dist, file_name=None, plot=False):
    prob_dist = MLEProbDist(word_dist)
    viz_dict = {}
    for word_tuple in word_dist:
        string = ' '.join(word_tuple)
        viz_dict[string] = prob_dist.prob(word_tuple)
    wordcloud = WordCloud(max_words=100).generate_from_frequencies(viz_dict)
    if file_name is not None:
        wordcloud.to_file("img/" + file_name + ".png")
    if plot:
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
def train_supervised2(trainer, labelled_sequences, plain_sequences, estimator=None):
    _TAG = 1
    _TEXT = 0
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(trainer._symbols)
    known_states = set(trainer._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()

    # =================code added to supplement transition matrix====================
    for sequence in plain_sequences:
        lasts = None
        for token in sequence:
            if lasts is None:
                pass
            else:
                transitions[lasts][token] += 1
            lasts = token
            if token not in known_states:
                trainer._states.append(token)
                known_states.add(token)
    # ================================end============================================

    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                trainer._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                trainer._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(trainer._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(trainer._symbols))

    return HiddenMarkovModelTagger(trainer._symbols, trainer._states, A, B, pi)
def _generate_one_predicated(self, context, backoff_limit, predicate):
    context = tuple(context)[1 - self._n:]
    choices = self.backoff_search(context, backoff_limit, predicate)  # Possible tokens
    if choices is not None:
        return MLEProbDist(choices).generate()
    else:
        return None
def _generate_one(self, context, backoff_limit):
    context = tuple(context)[1 - self._n:]
    while context not in self and len(context) >= backoff_limit:
        context = context[1:]
    if context in self:
        return MLEProbDist(self[context]).generate()  # Select from possible tokens
    else:
        return None
def sentence_generator(gramFreq, numofsentences):
    i = 0
    for i in range(numofsentences):
        sentenceGen = True
        sentencelist = ()
        generateSentence = ""
        for size in range(int(ngrams) - 1):
            sentencelist += ('<start>',)
        while sentenceGen == True:
            token_dict = {}
            for index, val in ngrams_frequency.items():
                index2 = index[:-1]
                if index2 == sentencelist:
                    token_dict.update({index[-1]: val})
            # generating frequency using the function
            frequencyDistribution = FreqDist(token_dict)
            # generating probability using the function
            probabilityDistribution = MLEProbDist(frequencyDistribution)
            # predicting the next word
            next_word = probabilityDistribution.generate()
            # words having ".,?,!"
            if next_word == "." or next_word == "?" or next_word == "!":
                sentenceGen = False
                generateSentence += next_word
                continue
            # words having , '
            elif next_word == "," or next_word == "’":
                generateSentence += next_word
            else:
                generateSentence += " %s" % next_word
            if len(sentencelist) != 0:
                my_list = list(sentencelist)
                my_list.pop(0)
                my_list.append(next_word)
                sentencelist = tuple(my_list)
        # Display sentences
        print("\nSentence %s: %s" % (i + 1, generateSentence))
def gen_sentence(ngram):
    global arg
    i = 0
    # n in ngrams
    n = arg[1]
    # number of sentences to generate
    m = arg[2]
    for i in range(m):
        j = True
        table = []
        sentence = ""
        for size in range(n - 1):
            table.append('<START>')
        while j == True:
            tuple_table = tuple(table)
            if tuple_table not in ngram.keys():
                # when start is not available
                sys.exit("No start line!")
            # generating frequency
            frequency = FreqDist(ngram[tuple_table])
            # generating probability
            probability = MLEProbDist(frequency)
            # predicting the next word
            pred_word = probability.generate()
            # words having ".,?,!"
            if pred_word == "." or pred_word == "?" or pred_word == "!":
                j = False
                sentence += pred_word
                continue
            # words having , ' or START tag
            elif (pred_word == "," or pred_word == "’"
                  or tuple_table[-1] == '<START>'):
                sentence += pred_word
            else:
                sentence += " %s" % pred_word
            table.pop(0)
            table.append(pred_word)
        # Display sentences
        print("\nSentence %s:\n%s" % (i + 1, sentence))
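A self-contained sketch of the core sampling step these sentence generators rely on, assuming NLTK 3.x; the toy bigram table below is invented for illustration.

# Minimal sketch: draw the next word from MLE probabilities over the
# counts observed after a given context (toy data, for illustration only).
from nltk.probability import FreqDist, MLEProbDist

bigram_counts = {('<START>',): {'the': 3, 'a': 1}}   # invented toy table
context = ('<START>',)
freq = FreqDist(bigram_counts[context])   # FreqDist accepts a count mapping
next_word = MLEProbDist(freq).generate()  # 'the' with prob 0.75, 'a' with 0.25
print(next_word)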
def __init__(self, corpus, n, estimator=None):
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)
    bi = []
    self._l = []
    for tree in corpus[:n]:
        ts = tree.leaves()
        sent = ['START'] + ts
        bi += nltk.bigrams(sent)
        self._l.append(len(sent))
    cfd = ConditionalFreqDist(bi)
    self._model = ConditionalProbDist(cfd, estimator, len(cfd))
def generate(self, n):
    """ Generate n words using copied grammar, generated lemmas, and words based on lemmas """
    start = random.randint(n, len(self._tags) - n)
    generated_tags = self._tags[start:start + n]  # Copy a random section of POS tags for grammar

    # Generate sequence of lemmas based off of grammar
    generated_lemmas = []
    for tag in generated_tags:
        # Search for and choose a lemma with correct tag
        choice = self._lemmas_ngram.choose_word(
            generated_lemmas, backoff_limit=2,
            predicate=lambda lemma: lemma in self._tag_lemmas[tag])
        if choice is None:
            # Could not find a good lemma for current POS tag, choose from list
            choice = MLEProbDist(self._tag_lemmas[tag]).generate()
        generated_lemmas.append(choice)

    # Generate sequence of words based off of lemmas and grammar
    generated_words = []
    for (tag, lemma) in zip(generated_tags, generated_lemmas):
        # Search for and choose word with correct lemma/tag
        choices = self._words_ngram.backoff_search(
            generated_words, backoff_limit=2,
            predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
        if choices is None:
            # Could not find a good word, choose from list
            choices = self._tag_lemma_words[(tag, lemma)]
        generated_words.append(MLEProbDist(choices).generate())

    return list(self._word_ids.transform_ids(generated_words))
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()

    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()

    for p in productions:
        cfd[p.lhs()].inc(p.rhs())

    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print "KL_Divergence for %s = %f" % (c, div)
def train_transitions(labelled_sequences, additional_transitions, estimator=None):
    # default to the MLE estimate
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = []
    known_states = []

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[0]
            symbol = token[1]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                known_states.append(state)
            if symbol not in known_symbols:
                known_symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(known_states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(
        ConditionalFreqDist.__add__(transitions, additional_transitions),
        estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(known_symbols))

    return hmm.HiddenMarkovModelTagger(known_states, known_symbols, A, B, pi)
def doesnt_work(self, y):
    """
    Code adapted from NLTK implementation of supervised training in HMMs
    """
    estimator = lambda fdist, bins: MLEProbDist(fdist)

    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in y:
        lasts = None
        for state in sequence:
            if lasts is not None:
                transitions[lasts][state] += 1
            lasts = state

    N = self.number_of_states + 2
    model = ConditionalProbDist(transitions, estimator, N)
    return model
def main():
    DEBUG = 1
    depRelFile = open(sys.argv[1], 'r')        # file with dep rel tuples
    ReadDictFromFile(sys.argv[2], lemma_dict)  # lemma file
    modelFile = open(sys.argv[3], 'w')
    if len(sys.argv) == 5:
        DEBUG = int(sys.argv[4])
    print "---Done loading lemma file---"

    print "---Computing CDF....---"
    incompletePairs = ComputeFreqDist(depRelFile)
    print "---Done computing CDF---"
    print "incomplete pairs: ", incompletePairs

    if DEBUG:
        print "Info about F(arg)"
        print "unique samples: ", argFD.B()
        print "total seen samples: ", argFD.N()
        print "top arg:", argFD.max()
        print "count for support: ", argFD['support']

        print "Info about CFD(arg|rel,vb)"
        print "unique conditions seen: ", len(argVbRelCFD.conditions())
        print "total seen samples", argVbRelCFD.N()
        top_CFD1 = sorted(argVbRelCFD[('dobj', 'enjoy')].items(),
                          key=operator.itemgetter(1), reverse=True)[:10]
        print "all dobj,enjoy: ", argVbRelCFD[('dobj', 'enjoy')].N()
        print "top dobj for enjoy:\n", top_CFD1

        print "Info about CFD(arg|vb)"
        print "unique conditions seen: ", len(argVbCFD.conditions())
        print "total seen samples", argVbCFD.N()
        top_CFD2 = sorted(argVbCFD['enjoy'].items(),
                          key=operator.itemgetter(1), reverse=True)[:10]
        print "all enjoy: ", argVbCFD['enjoy'].N()
        print "top arg for enjoy:\n", top_CFD2

    print "---Computing MLE PDFs....---"
    argVbRelPDF = ConditionalProbDist(argVbRelCFD, MLEProbDist)
    argVbPDF = ConditionalProbDist(argVbCFD, MLEProbDist)
    argPDF = MLEProbDist(argFD)

    # I'm not sure here whether Types is equivalent to argVbRelCFD.conditions() or unique condition+arg
    # !!!!! lambda should be for each history P(a|v); T = count of unique (v, a) pairs starting with v:
    # for each condition v -> CFD[v].B() -> how many unique arguments I've seen after this condition
    print "---Computing Witten-Bell smoothed PDFs....---"
    # for unseen pairs we multiply the backoff_weight with the probability of the backoff model
    # e.g. if c(rel,vb,arg)=0 and c(vb,arg)>0 then
    # P(arg|rel,vb) = argRelVbPDFWB_backoff_weights[(rel,vb)] * argVbPDFWB[vb].prob(arg)
    argPDFWB, backoff_uniform = ComputeWBArg(argPDF)
    argVbPDFWB, argVbPDFWB_backoff_weights, countArgVB = ComputeWBVbArg(argVbPDF, argPDFWB)
    argRelVbPDFWB, argRelVbPDFWB_backoff_weights, countRelVbArg = ComputeWBRelVbArg(argVbRelPDF, argVbPDFWB)

    if DEBUG:
        print "P(support|dobs,enjoy)"
        print argVbRelPDF[('dobj', 'enjoy')].prob('support')
        print argRelVbPDFWB[('dobj', 'enjoy')]['support']
        print "No args following (dobj,enjoy)", argVbRelCFD[('dobj', 'enjoy')].B()
        print "P(support|enjoy)"
        print argVbPDF['enjoy'].prob('support')
        print argVbPDFWB['enjoy']['support']
        print "P(support)"
        print argPDF.prob('support')
        print argPDFWB['support']

    WriteToArpaFormat(modelFile, len(argPDFWB), countArgVB, countRelVbArg,
                      argPDFWB, argVbPDFWB, argRelVbPDFWB, backoff_uniform,
                      argVbPDFWB_backoff_weights, argRelVbPDFWB_backoff_weights)

    if DEBUG:
        # print sorted(argVbPDFWB['enjoy'], key=operator.itemgetter(1), reverse=True)[:5]
        # [('enjoy','support')]
        for condition in argVbPDFWB.keys()[:10]:
            sum1 = 0
            sum2 = 0
            for prob in argVbPDFWB[condition].values():
                sum1 += prob
            for arg in argVbCFD[condition].items():
                sum2 += argVbPDF[condition].prob(arg[0])
            print "total prob: ", sum1, sum2
        print "P_WB(support|dobj, enjoy)"
        print argRelVbPDFWB[('dobj', 'enjoy')]['support']
        for condition in argRelVbPDFWB.keys()[:10]:
            sum = 0
            for prob in argRelVbPDFWB[condition].values():
                sum += prob
            print "total prob: ", sum
tokenized_text = len(nltk.sent_tokenize(inputFile))
print(sent_tokenize(inputFile))
print("tokenized text: ", tokenized_text, "\n")

tokenized_text = nltk.word_tokenize(inputFile)
tokenized_text = [word.lower() for word in tokenized_text if word.isalpha()]
print("Lower cased text: ", tokenized_text)
print("Word Count: ", len(tokenized_text), "\n")

freq_dist_uni = nltk.FreqDist(tokenized_text)
print("Most common 10 unigram: ", freq_dist_uni.most_common(10), "\n",
      "least common 3 words: ", freq_dist_uni.most_common()[-3:], "\n")

prob_distArray = []
prob_dist_uni = MLEProbDist(freq_dist_uni)
for s in prob_dist_uni.samples():
    prob_distArray.append(prob_dist_uni.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i])
    i += 1

elep = ELEProbDist(freq_dist_uni)
for s in elep.samples():
    prob_distArray.append(elep.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i], "\n")
    i += 1
def main():
    parser = argparse.ArgumentParser(description='Text decipher options')
    parser.add_argument('cipher_folder', help='cipher data folder')
    parser.add_argument('--laplace', '-laplace', action='store_true',
                        default=False, help='Laplace Smoothing')
    parser.add_argument('--langmod', '-lm', action='store_true',
                        default=False, help='Improved decoder')
    args = parser.parse_args()
    cipher_folder = args.cipher_folder
    laplace = args.laplace
    langmod = args.langmod

    number_of_supp_lines = 100  # the more lines the slower the code!

    train_data, test_data, train_plain = get_data(cipher_folder)
    preprocess_supp_data()
    supp_data = read_preprocessed_supp_data(number_of_supp_lines)
    for line in train_plain:
        # this is so later we have all the transitions in the same place
        supp_data.extend(list(line))

    if laplace:
        smoothing = LaplaceProbDist
    else:
        smoothing = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer()
    decoder = trainer.train_supervised(train_data, smoothing)
    # decoder_supp = trainer_supp.train_unsupervised(supp_data, update_outputs=False, model=decoder)
    # because there's a bug in train_unsupervised (although I found out how to fix it!),
    # I will have to do this manually....
    # code copied from the nltk train_supervised method
    # here, we are updating the transition data to include our supplemental data
    if langmod:
        states = decoder._states
        symbols = decoder._symbols
        outputs = decoder._outputs
        priors = decoder._priors
        starting = FreqDist()  # declaring
        # declaring; this is why we needed all the transitions in the same place
        transitions = ConditionalFreqDist()
        for item in supp_data:
            for sequence in supp_data:
                lasts = None
                for state in sequence:
                    if lasts is None:
                        starting[state] += 1
                    else:
                        transitions[lasts][state] += 1
                    lasts = state
        if laplace:
            estimator = LaplaceProbDist
        else:
            # getting this straight from the source code
            estimator = lambda fdist, bins: MLEProbDist(fdist)
        N = len(states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        # the conditional PD for outputs is actually already defined by our previously
        # trained model; we don't have new ones!
        decoder = HiddenMarkovModelTagger(symbols, states, A, outputs, pi)

    print(decoder.test(test_data))
    for sent in test_data:
        print "".join([y[1] for y in decoder.tag([x[0] for x in sent])])
# from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.probability import MLEProbDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])
prob_dist = MLEProbDist(freq_dist)

# P(x) = freq(x)
prob_dist.prob('the')
freq_dist.freq('the')

#
# Estimating the probability distribution for roll2
#
import random
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.probability import MLEProbDist
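The snippet above uses the old pre-NLTK-2 Token/tokenizer API. A hedged sketch of the same frequency/probability computation with the current NLTK interface, assuming plain str.split() is an acceptable stand-in for the old WhitespaceTokenizer:

# Sketch of the same computation with the modern NLTK API (assumption:
# whitespace splitting replaces WhitespaceTokenizer; file path from above).
from nltk.probability import FreqDist, MLEProbDist

text = open('dados/may2001_pdf.torto').read()
freq_dist = FreqDist(text.split())
prob_dist = MLEProbDist(freq_dist)  # P(x) = freq(x)
print(prob_dist.prob('the'))
print(freq_dist.freq('the'))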
def estimator(fdist, bins):
    return MLEProbDist(fdist)

# count occurrences of starting states, transitions out of each state
# and output symbols observed in each state
known_symbols = set(self._symbols)
def ml_estimator(freqdist):
    return MLEProbDist(freqdist)
def train_supervised(self, labelled_sequences, extra_data=False, estimator=None):
    # This is copied from HiddenMarkovModelTrainer
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    if extra_data:
        print('-' * 20)
        print("Using extra data to calculate transition probability")
        sent = ""
        for word in tqdm(treebank.words()):
            if word == '.':
                sent = sent[:-1] + word
                lasts = None
                for c in sent:
                    if c in list(string.ascii_lowercase) + [' ', ',', '.']:
                        if lasts is not None:
                            transitions[lasts][c] += 1
                        lasts = c
                sent = ""
            elif word == ',':
                sent = sent[:-1] + word + ' '
            else:
                sent += word + ' '

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return hmm.HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
# transition prob
for row in X_train:
    lasts = None
    for ch in list(row):
        if lasts is not None:
            transitional[lasts][ch] += 1
        lasts = ch

# emission prob
for row in sequences:
    for pair in row:
        emissional[pair[1]][pair[0]] += 1

if improved_laplace:
    print("################## Laplace ####################### \n")
    estimator = nltk.probability.LaplaceProbDist
else:
    estimator = lambda fdist, bins: MLEProbDist(fdist)

N = len(symbols)
PI = estimator(Pi, N)
A = ConditionalProbDist(transitional, estimator, N)
B = ConditionalProbDist(emissional, estimator, N)

tagger = HiddenMarkovModelTagger(states, symbols, A, B, PI)

print("\n ################## C{} Decryption Results #######################".format(int(i)))
for row in test_cipher:
    print(tagger.best_path(row))

print("\n ################## C{} Accuracy Results #######################".format(int(i)))
print(tagger.test(tester))
def compute_kl_divergence(mle_dist1, mle_dist2):
    ans = 0
    for p in mle_dist1.freqdist():
        for q in mle_dist2.freqdist():
            if p.rhs() == q.rhs():
                ans += p.prob() * math.log(p.prob() / q.prob())
    return ans


for lhs in lhs_of_prods:
    prods = [
        ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=prod.prob())
        for prod in productions_corpus if str(prod.lhs()) == lhs
    ]
    prods_for_toy_pcfg2 = [
        ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=prod.prob())
        for prod in productions_toy_pcfg2 if str(prod.lhs()) == lhs
    ]
    if len(prods):
        MLE_prob_dist = MLEProbDist(FreqDist(prods))
    if len(prods_for_toy_pcfg2):
        MLE_prob_dist_for_toy_pcfg2 = MLEProbDist(FreqDist(prods_for_toy_pcfg2))
    if not (len(prods) and len(prods_for_toy_pcfg2)):
        print('skipping {} because this nt does not appear in both cases'.format(lhs))
    else:
        print('this is the KL-Divergence {} for this lhs {}'.format(
            compute_kl_divergence(MLE_prob_dist, MLE_prob_dist_for_toy_pcfg2), lhs))
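For comparison, a generic hedged sketch of KL divergence between two MLEProbDist objects computed directly over their samples with .samples() and .prob(); the kl_divergence helper and toy data below are illustrative assumptions, not part of the original code.

# Hedged sketch: KL(P || Q) summed over P's samples, skipping samples where
# either probability is zero (under MLE, q == 0 makes the term undefined).
import math
from nltk.probability import FreqDist, MLEProbDist

def kl_divergence(p_dist, q_dist):
    total = 0.0
    for sample in p_dist.samples():
        p = p_dist.prob(sample)
        q = q_dist.prob(sample)
        if p > 0 and q > 0:
            total += p * math.log(p / q)
    return total

P = MLEProbDist(FreqDist(['a', 'a', 'b']))
Q = MLEProbDist(FreqDist(['a', 'b', 'b']))
print(kl_divergence(P, Q))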