示例#1
0
    def generate_alternative(self, n):
        """
        Generate n words in two passes: (tag, lemma) pairs first, then words.

        Pass one fills n positions. For each position, candidate tags are
        requested from ``self._tags_ngram.backoff_search`` with ``start_n``
        running from ``2 * self._n`` down to 3. A candidate set is accepted
        as soon as more than one of its tags received a lemma from
        ``self._lemmas_ngram.choose_word``; the tag is then sampled from the
        candidates' frequencies restricted to those tags. If no size is
        accepted, the tag is sampled from the last candidate set and the
        lemma from ``self._tag_lemmas`` for that tag.

        Pass two samples one word per (tag, lemma) pair, preferring the
        result of ``self._words_ngram.backoff_search`` and otherwise using
        ``self._tag_lemma_words`` for that pair.

        Returns the chosen words passed through
        ``self._word_ids.transform_ids``, as a list.
        """
        tags_so_far = []
        lemmas_so_far = []
        words_so_far = []

        # Pass 1: build the (tag, lemma) sequence one position at a time
        for _ in range(n):
            picked_tag = None  # Stays None until some n-gram size succeeds

            # Try the largest n-gram size first, then progressively smaller
            for order in range(2 * self._n, 2, -1):
                candidate_tags = self._tags_ngram.backoff_search(
                    tags_so_far, backoff_limit=2,
                    predicate=lambda tag: True, start_n=order)
                if candidate_tags is None:
                    continue  # Nothing at this size, try a smaller n-gram

                # Ask for a lemma, in the current lemma context, per tag
                lemma_for_tag = {}
                for candidate, _ in candidate_tags.items():
                    fitting = self._lemmas_ngram.choose_word(
                        lemmas_so_far, backoff_limit=2,
                        predicate=lambda lemma: lemma in self._tag_lemmas[candidate])
                    if fitting is not None:
                        lemma_for_tag[candidate] = fitting

                if len(lemma_for_tag) <= 1:
                    continue  # Not enough usable tags, try a smaller n-gram

                # Sample a tag among those that received a lemma
                usable_freqs = FreqDist({
                    known_tag: freq
                    for known_tag, freq in candidate_tags.items()
                    if known_tag in lemma_for_tag
                })
                picked_tag = MLEProbDist(usable_freqs).generate()
                picked_lemma = lemma_for_tag[picked_tag]
                break

            if picked_tag is None:
                # No n-gram size was accepted: sample from the last
                # candidate set and from the lemmas known for that tag
                picked_tag = MLEProbDist(candidate_tags).generate()
                picked_lemma = MLEProbDist(
                    self._tag_lemmas[picked_tag]).generate()

            tags_so_far.append(picked_tag)
            lemmas_so_far.append(picked_lemma)

        # Pass 2: choose a word for every (tag, lemma) pair
        for pair in zip(tags_so_far, lemmas_so_far):
            word_choices = self._words_ngram.backoff_search(
                words_so_far, backoff_limit=2,
                predicate=lambda word: word in self._tag_lemma_words[pair])
            if word_choices is None:
                # The search found nothing: sample from the pair's own words
                word_choices = self._tag_lemma_words[pair]
            words_so_far.append(MLEProbDist(word_choices).generate())

        return list(self._word_ids.transform_ids(words_so_far))
示例#2
0
def add_individual(number_individuals, res_address, diagnosis):
    """Build a DataFrame of newly generated individuals.

    Each individual combines one address record sampled from
    ``res_address``, a gender and age obtained from ``get_gender_age`` for
    that record, and a diagnosis drawn at random from the empirical
    distribution of ``diagnosis``. The infection date is read from the
    module-level ``current_date``.

    Args:
        number_individuals: Number of address records to sample; one
            individual is produced per sampled record.
        res_address: Object providing ``sample(n).to_dict('records')``.
            Each record is read for the keys ``ADDR_FULL``, ``CTYNAME``,
            ``ZIP5``, ``GEOID``, ``LON`` and ``LAT``.
        diagnosis: Samples passed to ``FreqDist``; draws are proportional
            to their frequencies.

    Returns:
        ``pd.DataFrame.from_records`` of the generated individuals.
    """
    # The diagnosis distribution is the same for every individual, so it is
    # built once instead of once per loop iteration.
    diagnosis_prob_dist = MLEProbDist(FreqDist(diagnosis))

    sampled_addresses = res_address.sample(number_individuals).to_dict('records')

    total_individuals = []
    for address in sampled_addresses:
        diagnosis_random = diagnosis_prob_dist.generate()
        full_address = '|'.join(
            [address['ADDR_FULL'], address['CTYNAME'], address['ZIP5']])
        gender, age = get_gender_age(address)
        total_individuals.append({
            'Date_Inf': current_date,
            'Gender': gender,
            'Age': age,
            'Census_Tract': address['GEOID'],
            'Address': full_address,
            'LON': address['LON'],
            'LAT': address['LAT'],
            'Diagnosis': diagnosis_random,
        })
    return pd.DataFrame.from_records(total_individuals)
示例#3
0
def get_gender_age(full_address):
    """Draw a random (gender, age range) pair for an address's census tract.

    The tract is looked up by ``full_address['GEOID']`` in the module-level
    ``KC_age_gender`` table, restricted to the columns ``'M0-4'`` through
    ``'F85-120'``. One column label is drawn with probability proportional
    to the tract's value in that column. The label is then split into its
    first character (gender) and the remainder (age range).

    Args:
        full_address: Mapping with a ``'GEOID'`` key.

    Returns:
        ``(gender, age)`` as strings, or ``(np.nan, np.nan)`` when the tract
        cannot be looked up or has no positive values to draw from.

    Raises:
        KeyError: If ``full_address`` has no ``'GEOID'`` entry (this lookup
            is outside the best-effort section).
    """
    GEOID = full_address['GEOID']
    try:
        tract_rows = KC_age_gender.loc[[GEOID]].loc[:, 'M0-4':'F85-120']
        # FreqDist must be given a label -> value mapping. Passing the
        # DataFrame itself makes it iterate the column labels and count each
        # one once, i.e. a uniform draw that ignores the tract's values.
        weights = tract_rows.iloc[0].to_dict()
        # Keep only labels with a positive value (this also drops NaN).
        weights = dict(
            (label, value) for label, value in weights.items() if value > 0)
        if not weights:
            return np.nan, np.nan
        age_gender_random = MLEProbDist(FreqDist(weights)).generate()
        return age_gender_random[0], age_gender_random[1:]
    except Exception:
        # Deliberately best-effort: any lookup/sampling failure yields NaN.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        return np.nan, np.nan
示例#4
0
def gen_sent(ngram):
    """Print randomly generated sentences drawn from an n-gram table.

    The n-gram order and the number of sentences are read from the
    module-level ``lis`` (``lis[1]`` and ``lis[2]``). Every sentence starts
    from a context of ``n - 1`` ``'<start>'`` markers and grows one token at
    a time until ``"."``, ``"?"`` or ``"!"`` is drawn.

    Args:
        ngram: Mapping from a context tuple of ``n - 1`` tokens to the
            samples/counts accepted by ``FreqDist`` for the next token.

    Raises:
        SystemExit: If the current context is not a key of ``ngram``.
    """
    global lis

    # "n" contains the ngram number
    n = lis[1]
    # number of required sentences
    sent_num = lis[2]

    sentence_end = (".", "?", "!")
    # Tokens appended without a leading space (contraction parts, comma and
    # apostrophe).
    no_space_before = ("m", "s", "re", ",", "’", "ve", "t")

    for i in range(sent_num):
        # Sliding context of the n-1 previous tokens; empty when n == 1.
        window = ('<start>',) * (n - 1)
        sent = ""
        while True:
            if window not in ngram:
                sys.exit("We don't have a start line")

            # FreqDist + MLEProbDist turn the frequencies into probabilities
            # (item freq / sum of frequencies); generate() draws one token
            # according to that distribution.
            next_w = MLEProbDist(FreqDist(ngram[window])).generate()

            if next_w in sentence_end:
                sent += next_w
                break

            # With an empty window (n == 1) there is no '<start>' marker to
            # inspect, so the start of the line is detected from the
            # still-empty sentence; indexing window[-1] would raise.
            at_line_start = window[-1] == '<start>' if window else not sent
            if next_w in no_space_before or at_line_start:
                sent += next_w
            else:
                sent += " %s" % next_w

            # Slide the window forward; for an empty window this stays
            # empty instead of failing on pop(0).
            window = (window + (next_w,))[1:]

        print("\nSentence %s:\n%s" % (i + 1, sent))
示例#5
0
def sentence_generator(gramFreq, numofsentences):
    """Print randomly generated sentences from the n-gram frequency table.

    Every sentence starts from a context of ``int(ngrams) - 1``
    ``'<start>'`` markers and grows one token at a time until ``"."``,
    ``"?"`` or ``"!"`` is drawn. The next token is drawn with probability
    proportional to the frequency of the n-gram formed by the current
    context followed by that token.

    Reads the module-level ``ngrams`` (the n-gram order) and
    ``ngrams_frequency`` (mapping from n-gram to frequency).

    NOTE(review): ``gramFreq`` is never read; the table comes from the
    global ``ngrams_frequency``. Presumably callers pass that same table --
    confirm before switching the body over to the parameter.

    Args:
        gramFreq: Unused (see note above).
        numofsentences: Number of sentences to print.
    """
    context_size = int(ngrams) - 1

    # Index the table once: context -> {next token: frequency}. Scanning
    # every n-gram again for each generated token is avoided this way.
    followers = {}
    for gram, freq in ngrams_frequency.items():
        followers.setdefault(gram[:-1], {})[gram[-1]] = freq

    for i in range(numofsentences):
        context = ('<start>',) * context_size
        generateSentence = ""
        while True:
            # Frequencies -> probabilities -> one random next token.
            token_dict = followers.get(context, {})
            next_word = MLEProbDist(FreqDist(token_dict)).generate()

            # Sentence-ending punctuation finishes the sentence.
            if next_word in (".", "?", "!"):
                generateSentence += next_word
                break

            # Comma and apostrophe attach without a leading space.
            if next_word in (",", "’"):
                generateSentence += next_word
            else:
                generateSentence += " %s" % next_word

            # Slide the context forward; an empty context (unigram model)
            # stays empty.
            if context:
                context = context[1:] + (next_word,)
        # Display sentences
        print("\nSentence %s: %s" % (i + 1, generateSentence))
def gen_sentence(ngram):
    """Print randomly generated sentences drawn from an n-gram table.

    The n-gram order and the number of sentences are read from the
    module-level ``arg`` (``arg[1]`` and ``arg[2]``). Every sentence starts
    from a context of ``n - 1`` ``'<START>'`` markers and grows one token at
    a time until ``"."``, ``"?"`` or ``"!"`` is drawn.

    Args:
        ngram: Mapping from a context tuple of ``n - 1`` tokens to the
            samples/counts accepted by ``FreqDist`` for the next token.

    Raises:
        SystemExit: If the current context is not a key of ``ngram``.
    """
    global arg

    # n in ngrams
    n = arg[1]
    # number of sentences to generate
    m = arg[2]

    for i in range(m):
        # Sliding context of the n-1 previous tokens; empty when n == 1.
        table = ('<START>',) * (n - 1)
        sentence = ""
        while True:
            if table not in ngram:
                # when start is not available
                sys.exit("No start line!")

            # Frequencies -> probabilities -> one random next token.
            pred_word = MLEProbDist(FreqDist(ngram[table])).generate()

            # Sentence-ending punctuation finishes the sentence.
            if pred_word in (".", "?", "!"):
                sentence += pred_word
                break

            # With an empty context (n == 1) there is no '<START>' marker to
            # inspect, so the start of the line is detected from the
            # still-empty sentence; indexing table[-1] would raise.
            at_line_start = table[-1] == '<START>' if table else not sentence
            if pred_word in (",", "’") or at_line_start:
                sentence += pred_word
            else:
                sentence += " %s" % pred_word

            # Slide the context forward; an empty context stays empty
            # instead of failing on pop(0).
            table = (table + (pred_word,))[1:]
        # Display sentences
        print("\nSentence %s:\n%s" % (i + 1, sentence))
示例#7
0
    def generate_alternative(self, n):
        """
        Generate n words in two passes: (tag, lemma) pairs first, then words.

        Pass one fills n positions. For each position, candidate tags are
        requested from ``self._tags_ngram.backoff_search`` with ``start_n``
        running from ``2 * self._n`` down to 3. A candidate set is accepted
        as soon as more than one of its tags received a lemma from
        ``self._lemmas_ngram.choose_word``; the tag is then sampled from the
        candidates' frequencies restricted to those tags. If no size is
        accepted, the tag is sampled from the last candidate set and the
        lemma from ``self._tag_lemmas`` for that tag.

        Pass two samples one word per (tag, lemma) pair, preferring the
        result of ``self._words_ngram.backoff_search`` and otherwise using
        ``self._tag_lemma_words`` for that pair.

        Returns the chosen words passed through
        ``self._word_ids.transform_ids``, as a list.
        """
        tags_so_far = []
        lemmas_so_far = []
        words_so_far = []

        # Pass 1: build the (tag, lemma) sequence one position at a time
        for _ in range(n):
            picked_tag = None  # Stays None until some n-gram size succeeds

            # Try the largest n-gram size first, then progressively smaller
            for order in range(2 * self._n, 2, -1):
                candidate_tags = self._tags_ngram.backoff_search(
                    tags_so_far,
                    backoff_limit=2,
                    predicate=lambda tag: True,
                    start_n=order)
                if candidate_tags is None:
                    continue  # Nothing at this size, try a smaller n-gram

                # Ask for a lemma, in the current lemma context, per tag
                lemma_for_tag = {}
                for candidate, _ in candidate_tags.items():
                    fitting = self._lemmas_ngram.choose_word(
                        lemmas_so_far,
                        backoff_limit=2,
                        predicate=lambda lemma: lemma in self._tag_lemmas[
                            candidate])
                    if fitting is not None:
                        lemma_for_tag[candidate] = fitting

                if len(lemma_for_tag) <= 1:
                    continue  # Not enough usable tags, try a smaller n-gram

                # Sample a tag among those that received a lemma
                usable_freqs = FreqDist({
                    known_tag: freq
                    for known_tag, freq in candidate_tags.items()
                    if known_tag in lemma_for_tag
                })
                picked_tag = MLEProbDist(usable_freqs).generate()
                picked_lemma = lemma_for_tag[picked_tag]
                break

            if picked_tag is None:
                # No n-gram size was accepted: sample from the last
                # candidate set and from the lemmas known for that tag
                picked_tag = MLEProbDist(candidate_tags).generate()
                picked_lemma = MLEProbDist(
                    self._tag_lemmas[picked_tag]).generate()

            tags_so_far.append(picked_tag)
            lemmas_so_far.append(picked_lemma)

        # Pass 2: choose a word for every (tag, lemma) pair
        for pair in zip(tags_so_far, lemmas_so_far):
            word_choices = self._words_ngram.backoff_search(
                words_so_far,
                backoff_limit=2,
                predicate=lambda word: word in self._tag_lemma_words[pair])
            if word_choices is None:
                # The search found nothing: sample from the pair's own words
                word_choices = self._tag_lemma_words[pair]
            words_so_far.append(MLEProbDist(word_choices).generate())

        return list(self._word_ids.transform_ids(words_so_far))
示例#8
0
import pickle
import pandas as pd

from nltk.probability import FreqDist, MLEProbDist

# Load the age/gender table. NOTE: unpickling can execute arbitrary code, so
# only load pickle files from a trusted source.
KC_age_gender = pd.read_pickle('input/KC_CT_age_gender.pickle')

# Select the row labelled '53033032800', restricted to the columns 'M0-5'
# through 'F85-120'. Computed once and reused for both the printout and the
# sampling below.
age_gender_dist = KC_age_gender.loc[['53033032800']].loc[:, 'M0-5':'F85-120']
print(age_gender_dist)

# FreqDist needs a label -> value mapping. Passing the DataFrame itself makes
# it iterate the column labels and count each one once, i.e. a uniform draw
# that ignores the row's values.
freq_dist = FreqDist(age_gender_dist.iloc[0].to_dict())
prob_dist = MLEProbDist(freq_dist)

# Draw one column label with probability proportional to its value.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(prob_dist.generate())