def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Don't forget to lowercase the observation, otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences

    # Preparing data: flatten the sentences into (tag, word) pairs
    data = []
    for tagged_sentence in train_data:
        sentence_data = [(t, w.lower()) for (w, t) in tagged_sentence]
        data.extend(sentence_data)

    # Lidstone probability distribution with one extra bin for unseen events
    def estimator(fd):
        return LidstoneProbDist(fd, 0.01, fd.B() + 1)

    # Computing the emission model
    emission_FD = ConditionalFreqDist(data)
    self.emission_PD = ConditionalProbDist(emission_FD, estimator)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Don't forget to lowercase the observation, otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    new_data = []
    for x in range(len(train_data)):
        new_data += train_data[x]
    data = [(tag, word.lower()) for (word, tag) in new_data]

    # Compute the emission model
    emission_FD = ConditionalFreqDist(data)
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, est)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data
    # Don't forget to lowercase the observation, otherwise it mismatches the test data
    # Flatten train_data into one list of tagged words of type tuple(str,str)
    data = []
    for x in train_data:
        data += [(tag, word.lower()) for (word, tag) in x]  # lowercase

    # Compute the emission model
    emission_FD = ConditionalFreqDist(data)
    # The Lidstone estimator needs a bins parameter; add one bin for unseen words
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Don't forget to lowercase the observation, otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    data = []
    for sent in train_data:  # for each sentence
        for word, tag in sent:  # for each (word, tag) pair in the sentence
            data.append((tag, word.lower()))  # build a list of (tag, word) tuples

    emission_FD = ConditionalFreqDist(data)
    # this is the estimator used for the probability distribution
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, est)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data:
    # transform the list of lists of tuples into a flat list of tuples
    train_data = [item for sublist in train_data for item in sublist]

    # The data will be a list of (tag, word) tuples, which we use to count the
    # frequency of each word given its tag; those counts drive the emission
    # probability estimates.
    # Don't forget to lowercase the observation, otherwise it mismatches the test data
    data = [(tag, word.lower()) for (word, tag) in train_data]

    # Compute the emission model:
    # a Conditional Frequency Distribution of words given their tags
    emission_FD = ConditionalFreqDist(data)
    # Compute the Conditional Probability Distribution from the frequency
    # distribution above, using a LidstoneProbDist estimator with one extra bin
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
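# All five variants above follow the same pattern. Below is a minimal,
# self-contained sketch of it on a hypothetical two-sentence toy corpus,
# with the imports the methods above rely on:
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              LidstoneProbDist)

train_data = [[("The", "DET"), ("dog", "NOUN"), ("barks", "VERB")],
              [("a", "DET"), ("dog", "NOUN"), ("sleeps", "VERB")]]

data = [(tag, word.lower()) for sent in train_data for (word, tag) in sent]
emission_FD = ConditionalFreqDist(data)
emission_PD = ConditionalProbDist(
    emission_FD, lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1))

print(emission_PD["NOUN"].prob("dog"))  # ~0.995: both "dog" tokens are tagged NOUN
print(emission_PD["NOUN"].prob("cat"))  # ~0.005: unseen, but nonzero under Lidstone smoothing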
def words_by_followers(category):
    """Given a category from the Brown corpus, lowercases everything,
    and returns a frequency distribution where the keys are words and
    the counts are the number of different contexts that each word can
    appear in."""
    bigrams = brown_bigrams(category)
    cfdist = ConditionalFreqDist((bigram[1], bigram[0]) for bigram in bigrams)
    fdist = FreqDist()
    for context in cfdist.keys():
        fdist[context] = len(cfdist[context])
    return fdist
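# words_by_followers above assumes a brown_bigrams helper that is not shown.
# A plausible, hypothetical definition consistent with the docstring (it does
# the lowercasing), plus a usage sketch; ConditionalFreqDist and FreqDist come
# from nltk.probability:
from nltk import bigrams
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist, FreqDist

def brown_bigrams(category):
    # lowercased word bigrams for one Brown category
    return list(bigrams(w.lower() for w in brown.words(categories=category)))

fdist = words_by_followers('news')
print(fdist.most_common(5))  # words with the most distinct preceding words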
def count_freqs(infilename, outfile, colK, titleK=4):
    all_vars = {}
    categoryByID = {}
    i = -1
    with open(infilename, 'r', newline='') as f:
        catreader = csv.reader(f, delimiter=',', quotechar='|')
        freq_count = ConditionalFreqDist()
        for row in catreader:
            if i < 0:
                # header row: look up the column indices by name
                i += 1
                prod_col = row.index("product_id")
                colK = prod_col  # row.index("category_id")
                titleK = row.index("name")
                user_col = row.index("user_id")
                continue
            if isNum(row[user_col]):
                freq_count[row[user_col]][row[colK]] += 1
            try:
                int(row[colK])
            except ValueError:
                print(i)
                continue
            if int(row[colK]) not in all_vars:
                all_vars[int(row[colK])] = i
                i += 1
                if isNum(row[titleK]):
                    categoryByID[int(row[colK])] = [row[titleK]]
                else:
                    categoryByID[int(row[colK])] = (row[titleK].lower().split()[-1],
                                                    row[titleK].lower())

    with open(outfile, 'w', newline='') as outF:
        rating_writer = csv.writer(outF)
        # header: one column per category, labelled with its title
        row = [0] * len(all_vars)
        for (var, i) in all_vars.items():
            row[i] = categoryByID[var][0]
        rating_writer.writerow(["user", "total_count"] + row)
        # one row per user with the percentage of events in each category
        for user in freq_count.keys():
            row = [0] * len(all_vars)
            for cat in freq_count[user].keys():
                row[all_vars[int(cat)]] = 100 * freq_count[user].freq(cat)
            rating_writer.writerow([user, freq_count[user].N()] + row)
    print(len(freq_count))
def count_freqs(infilename, outfile):
    with open(infilename, 'r', newline='') as f:
        catreader = csv.reader(f, delimiter=',', quotechar='|')
        freq_count = ConditionalFreqDist()
        for row in catreader:
            # condition = column 2 (user), sample = column 3 (category)
            freq_count[row[2]][row[3]] += 1
    with open(outfile, 'w', newline='') as outF:
        rating_writer = csv.writer(outF)
        for user in freq_count.keys():
            for cat in freq_count[user].keys():
                rating_writer.writerow(
                    [user, cat, 100 * freq_count[user].freq(cat)])
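# Both count_freqs variants rely on FreqDist.freq(), the sample count divided
# by the condition's total; a tiny hypothetical sketch of the percentages the
# scripts write out:
from nltk.probability import ConditionalFreqDist

cfd = ConditionalFreqDist()
for user, cat in [('u1', 'books'), ('u1', 'books'), ('u1', 'music')]:
    cfd[user][cat] += 1

print(cfd['u1'].freq('books'))        # 0.666...: relative frequency within user u1
print(100 * cfd['u1'].freq('music'))  # 33.33...: the percentage written to the CSV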
# (fragment: the opening lines of processFile are missing; the code relies on
#  os, joblib's Parallel/delayed, and a ConditionalFreqDist cfd from earlier cells)
        not word.startswith('#') and not word.startswith('http')):
            stem = s.stem(word.lower())
        else:
            stem = word.lower()
        if len(stem) == 1 and not stem.isalnum():
            continue
        if stem in s.stopwords:
            continue
        if stem != '':
            fd[stem] += 1
    return fd


# In[123]:

for tag in [u'#nbafinals2015', u'#nbafinals2015_#warriors', u'#warriors']:
    words = {}
    for root, path, files in os.walk(u'tweets/' + tag):
        for fd in Parallel(n_jobs=8)(delayed(processFile)(os.path.join(root, filename))
                                     for filename in files):
            cfd[tag].update(fd)
    cfd['all'].update(cfd[tag])


# In[170]:

for tag in sorted(cfd.keys()):
    cfd[tag].plot(25, title=tag)
### define a function that returns True if the input tag is some form of noun
def is_noun(tag):
    return tag.lower() in ['nn', 'nns', 'nn$', 'nn-tl', 'nn+bez',
                           'nn+hvz', 'nns$', 'np', 'np$', 'np+bez',
                           'nps', 'nps$', 'nr', 'np-tl', 'nrs', 'nr$']

### count nouns that occur within a window of size 10 ahead of other nouns
for sentence in brown.tagged_sents():
    for (index, tagtuple) in enumerate(sentence):
        (token, tag) = tagtuple
        token = token.lower()
        if token not in stopwords_list and is_noun(tag):
            window = sentence[index + 1: index + 10]
            for (window_token, window_tag) in window:
                window_token = window_token.lower()
                if window_token not in stopwords_list and is_noun(window_tag):
                    cfd[token][window_token] += 1

print('Recorded')
print(cfd.keys())
print('-' * 100)
print(cfd['bread'])
print(cfd['man'])
print(cfd['man'].max())
def main(db, pwset_id, dryrun, verbose, basepath, tag_type):
    patterns_dist = FreqDist()  # distribution of patterns
    segments_dist = ConditionalFreqDist()  # distribution of segments, grouped by semantic tag

    counter = 0
    while db.hasNext():
        segments = db.nextPwd()
        password = ''.join([s.word for s in segments])
        tags = []
        segments = expand_gaps(segments)

        for s in segments:  # semantic tags
            if tag_type == 'pos':
                tag = classify_by_pos(s)
            elif tag_type == 'backoff':
                tag = classify_semantic_backoff_pos(s)
            elif tag_type == 'word':
                tag = classify_word(s)
            else:
                tag = classify_pos_semantic(s)
            tags.append(tag)
            segments_dist[tag][s.word] += 1

        pattern = stringify_pattern(tags)
        patterns_dist[pattern] += 1

        # outputs the classification results for debugging purposes
        if verbose:
            print_result(password, segments, tags, pattern)

        counter += 1
        if counter % 100000 == 0:
            print("{} passwords processed so far ({:.2%})... "
                  .format(counter, float(counter) / db.sets_size))

    pwset_id = str(pwset_id)

    if dryrun:
        return

    # remove the previous grammar
    try:
        shutil.rmtree(basepath)
    except OSError:  # in case the above folder does not exist
        pass

    # recreate the folders empty
    os.makedirs(os.path.join(basepath, 'nonterminals'))

    with open(os.path.join(basepath, 'rules.txt'), 'w+') as f:
        total = patterns_dist.N()
        for pattern, freq in patterns_dist.most_common():
            f.write('{}\t{}\n'.format(pattern, float(freq) / total))

    for tag in segments_dist.keys():
        total = segments_dist[tag].N()
        with open(os.path.join(basepath, 'nonterminals', str(tag) + '.txt'), 'w+') as f:
            for k, v in segments_dist[tag].most_common():
                f.write("{}\t{}\n".format(k, float(v) / total))
from nltk.probability import ConditionalFreqDist

max_n_words = 5

# NLTK provides a handy utility class for this
cfdist = ConditionalFreqDist()

print('begin read raw text file with item description', i_desc_file)
for words in my_sentence_stream(i_desc_file):
    cfdist[1].update(words)
    for i in range(2, max_n_words + 2):
        # max_n_words + 1 because of 0-based indexing; + 2 because we need
        # one more word for the entropy calculation
        cfdist[i].update(ngrams_plus(words, n=i))

for n in cfdist.keys():
    print('words count n=', n, ', total', cfdist[n].N())

min_occur = 10
mini_bins = 10
theta = 0.3

import math
import numpy as np


def sigmoid(x):
    return 1 / (1 + math.exp(-x))


def sigmoid_array(x):
    return 1 / (1 + np.exp(-x))
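# my_sentence_stream and ngrams_plus above are project-specific helpers that
# are not shown; the stand-ins below are hypothetical sketches, just to make
# the counting loop reproducible.
from nltk import ngrams

def my_sentence_stream(path):
    # assumed behaviour: yield one whitespace-tokenized line at a time
    with open(path, encoding='utf-8') as f:
        for line in f:
            yield line.split()

def ngrams_plus(words, n):
    # assumed behaviour: plain n-gram tuples over the token list
    return list(ngrams(words, n))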