Example #1
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences

        # Preparing Data
        data = []
        for tagged_sentence in train_data:
            sentence_data = [(t, w.lower()) for (w, t) in tagged_sentence]
            data.extend(sentence_data)

        # Lidstone Probability Distribution function with extra bin
        def estimator(fd):
            return LidstoneProbDist(fd, 0.01, fd.B() + 1)

        # Computing Emission Model
        emission_FD = ConditionalFreqDist(data)
        self.emission_PD = ConditionalProbDist(emission_FD, estimator)
        self.states = list(emission_FD.keys())

        return self.emission_PD, self.states
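As a point of reference (not part of the original submission), here is a minimal standalone sketch of the same pattern: build a ConditionalFreqDist over (tag, word) pairs, wrap it in a ConditionalProbDist with a Lidstone estimator, and query it. The toy sentence and tags are made up for illustration.

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

# toy training data: one tagged sentence, already as (word, tag) pairs
tagged = [("The", "DET"), ("dog", "NOUN"), ("barks", "VERB")]

# condition on the tag, observe the lowercased word
cfd = ConditionalFreqDist((tag, word.lower()) for (word, tag) in tagged)
cpd = ConditionalProbDist(cfd, lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1))

print(cpd["NOUN"].prob("dog"))   # close to 1: "dog" was the only word seen with NOUN
print(cpd["NOUN"].prob("cat"))   # small but non-zero thanks to Lidstone smoothing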
Example #2
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        #raise NotImplementedError('HMM.emission_model')

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences

        new_data = []
        for x in range(len(train_data)):
            new_data += train_data[x]

        data = [(tag, word.lower()) for (word, tag) in new_data]
        # print(data[:20])
        # COMPLETED compute the emission model
        emission_FD = ConditionalFreqDist(data)
        est = lambda emission_FD: LidstoneProbDist(emission_FD, 0.01,
                                                   emission_FD.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, est)
        self.states = list(emission_FD.keys())
        #print(self.states[0])

        return self.emission_PD, self.states
Example #3
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
    #   print(train_data)
        # TODO prepare data

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        
        # I want to make train_data into one list of tagged_words with type:(tuple(str,str))
        data = []
        for x in train_data:
        #    data += [ (tag, word.lower() if word.isalpha() else (tag, word)) for (word, tag) in x]  # lower case and check word
            data += [(tag, word.lower()) for (word, tag) in x]  # lower case

        # TODO compute the emission model
        emission_FD = ConditionalFreqDist(data)
        # need Lidstone bin parameter
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
        
        self.states = list(emission_FD.keys())

        return self.emission_PD, self.states
Example #4
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.
        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        #raise NotImplementedError('HMM.emission_model')

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences

        data = []
        for sent in train_data:  #for each sentence
            for tuples in sent:  #for each pair of (word,tag) in every sentence
                data.append(
                    (tuples[1], tuples[0].lower()))  #list of tuples(tag,word)

        emission_FD = ConditionalFreqDist(data)
        # this is the estimator used for the probability distribution
        est = lambda emission_FD: LidstoneProbDist(emission_FD, 0.01,
                                                   emission_FD.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, est)
        self.states = list(emission_FD.keys())
        #print(self.states)

        return self.emission_PD, self.states
Example #5
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """

        # prepare data
        # transform a list of lists of tuples to list of tuples
        train_data = [item for sublist in train_data for item in sublist]

        # the data will be a list in form of (tag, word). Then we will use it to count frequency of word given
        # tag which will be used for emission probability estimations.
        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # the data object should be a list of tuples of conditions and observations
        # in our case the tuples should be of the form (tag,word) where words are lowercased
        data = [(tag, word.lower()) for (word, tag) in train_data]

        # compute the emission model
        # compute a Conditional Frequency Distribution for words given their tags using our data
        emission_FD = ConditionalFreqDist(data)

        # Compute the Conditional Probability Distribution using the above Conditional Frequency Distribution.
        # Use LidstoneProbDist estimator.
        #self.emission_PD = ConditionalProbDist(emission_FD, LidstoneProbDist, 0.01, bin)
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
        self.states = list(emission_FD.keys())

        return self.emission_PD, self.states
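The fd.B() + 1 bin count passed to LidstoneProbDist in the variants above reserves one extra bin for unseen observations. A hand-worked sketch with toy counts (not from the original code) of what the estimator then returns:

from nltk.probability import FreqDist, LidstoneProbDist

fd = FreqDist({"the": 3, "cat": 1})          # toy counts: N = 4, B() = 2 observed bins
pd = LidstoneProbDist(fd, 0.01, fd.B() + 1)  # gamma = 0.01, bins = 3

# Lidstone: P(w) = (count(w) + gamma) / (N + gamma * bins)
print(pd.prob("the"))   # (3 + 0.01) / (4 + 0.03) ≈ 0.747
print(pd.prob("dog"))   # (0 + 0.01) / (4 + 0.03) ≈ 0.0025, unseen but not zero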
Example #6
def words_by_followers(category):
    """Given a category from the brown corpus, lowercases everything,
    and returns a frequency distribution where the keys are words
    and the counts are the number of different contexts that each word can appear in."""
    bigrams = brown_bigrams(category)
    cfdist = ConditionalFreqDist((bigram[1], bigram[0]) for bigram in bigrams)
    fdist = FreqDist()
    for context in cfdist.keys():
        fdist[context] = len(cfdist[context])
    return fdist
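The example above relies on a brown_bigrams helper defined elsewhere in its project. A plausible minimal version (an assumption, not the original helper), plus a usage line:

from nltk import bigrams
from nltk.corpus import brown

def brown_bigrams(category):
    # hypothetical helper: lowercase every word in the category and pair consecutive words
    words = [w.lower() for w in brown.words(categories=category)]
    return list(bigrams(words))

# fdist['money'] would then be the number of distinct words seen immediately before "money"
fdist = words_by_followers('news')
print(fdist.most_common(10))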
Example #8
def count_freqs(infilename, outfile, colK, titleK=4):
    all_vars = {}
    categoryByID = {}
    i = -1

    with open(infilename, 'rU') as f:
        catreader = csv.reader(f, delimiter=',', quotechar='|')
        header = []
        freq_count = ConditionalFreqDist()
        prod_col = 2
        prev_row = []
        for row in catreader:
            if i < 0:
                i += 1
                prod_col = row.index("product_id")
                colK = prod_col  # row.index("category_id")
                titleK = row.index("name")
                user_col = row.index("user_id")
                continue
            if isNum(row[user_col]):
                freq_count[row[user_col]].inc(row[colK])
            try:
                int(row[colK])
            except:
                print i  #prev_row, row
                continue
            if int(row[colK]) not in all_vars.keys():
                all_vars[int(row[colK])] = i
                i += 1
                if isNum(row[titleK]):
                    categoryByID[int(row[colK])] = [row[titleK]]
                else:
                    categoryByID[int(
                        row[colK])] = (row[titleK].lower().split()[-1],
                                       row[titleK].lower())
            prev_row = row
    #for catID in catIDs:
    #    if isNum(catID):# and catID in freq_count.conditions():
    #        all_vars[catID] = i
    #        i+=1
    with open(outfile, 'wb') as outF:
        row = [0] * len(all_vars)
        rating_writer = csv.writer(outF)
        for (var, i) in all_vars.items():
            row[i] = categoryByID[var][0]
        rating_writer.writerow(["user", "total_count"] + row)

        for user in freq_count.keys():
            row = [0] * len(all_vars)
            for cat in freq_count[user].keys():
                row[all_vars[int(cat)]] = 100 * freq_count[user].freq(cat)
            rating_writer.writerow([user, freq_count[user].N()] + row)
            #	rating_writer.writerow([user, cat, 100*freq_count[user].freq(cat)])
        print len(freq_count)
Example #9
def count_freqs(infilename, outfile):
    with open(infilename, 'rU') as f:
        catreader = csv.reader(f, delimiter=',', quotechar='|')
        i = 0
        header = []
        freq_count = ConditionalFreqDist()
        for row in catreader:
            freq_count[row[2]].inc(row[3])
    with open(outfile, 'wb') as outF:
        rating_writer = csv.writer(outF)
        for user in freq_count.keys():
            for cat in freq_count[user].keys():
                rating_writer.writerow(
                    [user, cat, 100 * freq_count[user].freq(cat)])
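Both count_freqs variants above use the NLTK 2 FreqDist.inc() API; in NLTK 3 that method no longer exists and the same counting is written as item assignment (the values below are purely illustrative):

from nltk.probability import ConditionalFreqDist

freq_count = ConditionalFreqDist()
freq_count['user_42']['books'] += 1   # NLTK 3 equivalent of freq_count['user_42'].inc('books')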
Example #10
                    not word.startswith('#') and
                    not word.startswith('http')):
                    stem = s.stem(word.lower())
                else:
                    stem = word.lower()
                if len(stem) == 1 and not stem.isalnum():
                    continue
                if stem in s.stopwords:
                    continue
                if stem != '':
                    fd[stem] += 1
    return fd


# In[123]:

for tag in [u'#nbafinals2015', u'#nbafinals2015_#warriors', u'#warriors']:
    words = {}
    for root, path, files in os.walk(u'tweets/' + tag):
        for fd in Parallel(n_jobs=8)(delayed(processFile)(os.path.join(root, filename)) for filename in files):
            cfd[tag].update(fd)
    cfd['all'].update(cfd[tag])
        


# In[170]:

for tag in sorted(cfd.keys()):
    cfd[tag].plot(25, title=tag)

Example #11
### define a function that returns True if the input tag is some form of noun
def is_noun(tag):
	return tag.lower() in ['nn', 'nns', 'nn$', 'nn-tl', 'nn+bez',
                       'nn+hvz', 'nns$', 'np', 'np$', 'np+bez', 'nps',
                       'nps$', 'nr', 'np-tl', 'nrs', 'nr$']


### count nouns that occur within a window of size 10 ahead of other nouns
for sentence in brown.tagged_sents():
	for (index, tagtuple) in enumerate(sentence):
		(token, tag) = tagtuple
		token = token.lower()
		if token not in stopwords_list and is_noun(tag):
			window = sentence[index + 1: index + 10]
			for (window_token, window_tag) in window:
				window_token = window_token.lower()
				if window_token not in stopwords_list and is_noun(window_tag):
					cfd[token].inc(window_token)
					print 'Irasiau'

print cfd.keys()
print '-' * 100
print cfd['bread']
print cfd['man']
print cfd['man'].max()
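The snippet above assumes cfd and stopwords_list were created earlier; a plausible setup (an assumption, not shown in the original) would be:

from nltk.probability import ConditionalFreqDist
from nltk.corpus import brown, stopwords

stopwords_list = set(stopwords.words('english'))
cfd = ConditionalFreqDist()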




Example #12
def main(db, pwset_id, dryrun, verbose, basepath, tag_type):
#    tags_file = open('grammar/debug.txt', 'w+')
    
    patterns_dist = FreqDist()  # distribution of patterns
    segments_dist = ConditionalFreqDist()  # distribution of segments, grouped by semantic tag
    
    counter = 0
    
    while db.hasNext():
        segments = db.nextPwd()
        password = ''.join([s.word for s in segments])
        tags = []

        segments = expand_gaps(segments)
        
        for s in segments:  # semantic tags
            if tag_type == 'pos':
                tag = classify_by_pos(s)
            elif tag_type == 'backoff':
                tag = classify_semantic_backoff_pos(s)
            elif tag_type == 'word':
                tag = classify_word(s)
            else:
                tag = classify_pos_semantic(s)

            tags.append(tag)
            segments_dist[tag][s.word] += 1
            
        pattern = stringify_pattern(tags)
        
        patterns_dist[pattern] += 1
        
        # outputs the classification results for debugging purposes
        if verbose:
            print_result(password, segments, tags, pattern)

        counter += 1
        if counter % 100000 == 0:
            print "{} passwords processed so far ({:.2%})... ".format(counter, float(counter)/db.sets_size)
         
#     tags_file.close()

    pwset_id = str(pwset_id)
    
    if dryrun:
        return

    # remove previous grammar
    try:
        shutil.rmtree(basepath)
    except OSError: # in case the above folder does not exist 
        pass
    
    # recreate the folders empty
    os.makedirs(os.path.join(basepath, 'nonterminals'))

    with open(os.path.join(basepath, 'rules.txt'), 'w+') as f:
        total = patterns_dist.N()
        for pattern, freq in patterns_dist.most_common():
            f.write('{}\t{}\n'.format(pattern, float(freq)/total))
    
    for tag in segments_dist.keys():
        total = segments_dist[tag].N()
        with open(os.path.join(basepath, 'nonterminals', str(tag) + '.txt'), 'w+') as f:
            for k, v in segments_dist[tag].most_common():
                f.write("{}\t{}\n".format(k, float(v)/total))
Example #13
from nltk.probability import ConditionalFreqDist

max_n_words = 5

# NLTK provides a nice utility class for this
cfdist = ConditionalFreqDist()
print('begin reading raw text file with item descriptions', i_desc_file)
for words in my_sentence_stream(i_desc_file):
    cfdist[1].update(words)
    # n runs from 2 up to max_n_words + 1: the extra word beyond max_n_words
    # is kept for the entropy calculation
    for i in range(2, max_n_words + 2):
        cfdist[i].update(ngrams_plus(words, n=i))

for n in cfdist.keys():
    print('words count n=', n, ', total', cfdist[n].N())

min_occur = 10
mini_bins = 10
theta = 0.3

import math


def sigmoid(x):
    return 1 / (1 + math.exp(-x))


def sigmoid_array(x):
    return 1 / (1 + np.exp(-x))
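The extra word collected beyond max_n_words above is there "for the entropy calculation". A heavily hedged sketch of one common way to use those counts, assuming ngrams_plus returns token tuples (the original helper is not shown): compute the branching entropy of a candidate n-gram from the frequencies of its (n+1)-gram extensions, then map the result through the sigmoid defined above.

import math

def branching_entropy(prefix, cfdist, n):
    # frequencies of every (n+1)-gram that extends the candidate n-gram `prefix`
    counts = [c for gram, c in cfdist[n + 1].items() if gram[:n] == tuple(prefix)]
    total = sum(counts)
    if total == 0:
        return 0.0
    return -sum((c / total) * math.log(c / total) for c in counts)

# e.g. score = sigmoid(branching_entropy(('ice', 'cream'), cfdist, 2))
# how that score is then compared against theta is up to the original pipeline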