def create_word_scores(posWords, negWords):
    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # per-label word frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()  # total count of positive words
    neg_word_count = cond_word_fd['neg'].N()  # total count of negative words
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic for each label; other association measures
        # (e.g. mutual information) could be computed here instead
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores  # maps each word to its information score
def get_high_information_words(labelled_words,
                               score_fn=BigramAssocMeasures.chi_sq,
                               min_score=5):
    '''Gets the high information words using the chi-square measure.'''
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items()
                     if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
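# A minimal usage sketch for get_high_information_words above, assuming the
# names it relies on are in scope (collections, nltk.probability.FreqDist and
# ConditionalFreqDist, nltk.metrics.BigramAssocMeasures); the toy corpus and
# min_score value are illustrative only.
labelled = [('pos', ['great', 'fun', 'great', 'movie']),
            ('neg', ['dull', 'boring', 'dull', 'movie'])]
print(get_high_information_words(labelled, min_score=1))
# words shared equally by both labels (here 'movie') score 0 and are filtered out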
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Prepare the data
    data = []
    tags = []
    # The data object should be an array of tuples of conditions and observations,
    # in our case the tuples will be of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
    for s in train_data:
        start = ["<s>"]
        start.extend([tag for (word, tag) in s])
        start.extend(["</s>"])
        tags.extend(start)
    for i in range(len(tags) - 1):
        data.append((tags[i], tags[i + 1]))

    # Compute the transition model
    transition_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, lidstone_estimator)
    return self.transition_PD
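# A standalone sketch of the same Lidstone-smoothed transition estimate,
# assuming only nltk.probability imports; the toy tag pairs are illustrative.
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              LidstoneProbDist)

pairs = [('<s>', 'DET'), ('DET', 'NOUN'), ('NOUN', '</s>'),
         ('<s>', 'NOUN'), ('NOUN', 'VERB'), ('VERB', '</s>')]
fd = ConditionalFreqDist(pairs)
pd = ConditionalProbDist(fd, lambda f: LidstoneProbDist(f, 0.01, f.B() + 1))
print(pd['<s>'].prob('DET'))  # smoothed P(DET | <s>), just under 0.5
print(pd['<s>'].prob('ADJ'))  # an unseen transition still gets mass > 0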
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Don't forget to lowercase the observation, otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    data = []
    for sent in train_data:  # for each sentence
        for (word, tag) in sent:  # for each (word, tag) pair in the sentence
            data.append((tag, word.lower()))  # list of (tag, word) tuples
    emission_FD = ConditionalFreqDist(data)
    # the estimator used for the probability distribution
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, est)
    self.states = list(emission_FD.keys())
    return self.emission_PD, self.states
def create_word_scores():
    posWords = json.load(open('p.json', 'r'))
    negWords = json.load(open('n.json', 'r'))
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into one list
    negWords = list(itertools.chain(*negWords))  # likewise
    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # per-label word frequencies for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()  # number of positive words
    neg_word_count = cond_word_fd['neg'].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic for the positive label; other measures such as
        # mutual information could be used here instead
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)  # likewise for the negative label
        # a word's information score is its positive chi-square statistic
        # plus its negative chi-square statistic
        word_scores[word] = pos_score + neg_score
    return word_scores  # maps each word to its information score
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Prepare the data.
    # The data object should be an array of tuples of conditions and observations,
    # in our case the tuples will be of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
    # (the original used bare 's' and '/s', which did not match the spec above)
    data = []
    for s in train_data:
        assert len(s) > 0
        data.append(('<s>', s[0][1]))
        for i in range(len(s) - 1):
            data.append((s[i][1], s[i + 1][1]))
        data.append((s[len(s) - 1][1], '</s>'))

    # Compute the transition model
    cfdist = ConditionalFreqDist(data)
    self.transition_PD = ConditionalProbDist(cfdist, MyProbDist, 13)
    return self.transition_PD
def generate_conditional_prob_dist(training_passage, n):
    """Given a passage, generates n-grams and then successively decrements n,
    where n >= 2.
    """
    # remove special characters and symbols and convert to lower case
    training_passage = re.sub(r"[^\w\'\?]", ' ', training_passage).lower()
    # tokenize the sanitized passage
    words = nltk.word_tokenize(training_passage)
    cfdist_list = []
    cpdist_list = []
    # generate cpdists for n-grams down to bigrams
    for i in range(n, 1, -1):
        # generate the i-grams and convert to a list
        n_grams_generated = list(ngrams(words, i))
        # convert into ((context words), next word) pairs for prediction
        n_grams_for_predict = [(n_gram[:-1], n_gram[-1])
                               for n_gram in n_grams_generated]
        # conditional frequency distribution over all contexts
        cfdist = ConditionalFreqDist(n_grams_for_predict)
        # conditional probability of the next word given each context
        cpdist = ConditionalProbDist(cfdist, MLEProbDist)
        cfdist_list.append(cfdist)
        cpdist_list.append(cpdist)
    return cpdist_list
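# Hedged usage sketch for generate_conditional_prob_dist above: back off from
# the longest n-gram context to shorter ones until a seen context is found.
# The passage, n, and context are illustrative; nltk, re, ngrams,
# ConditionalFreqDist, ConditionalProbDist and MLEProbDist are assumed to be
# imported as the function requires.
cpdists = generate_conditional_prob_dist("the cat sat on the mat and the cat ran", 3)
context = ('the',)  # conditions are tuples of the i-1 preceding words
for cpdist in cpdists:  # trigram-based model first, then bigram-based
    if context in cpdist.conditions():
        print(cpdist[context].max())  # most likely next word under MLE: 'cat'
        break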
def store_freqdists(self):
    """
    Build NLTK frequency distributions based on feature counts and store
    them to Redis.
    """
    # TODO: this step and the above may possibly be combined
    word_fd = FreqDist()
    label_word_freqdist = ConditionalFreqDist()

    pos_words = self.r.zrange('positive_wordcounts', 0, -1, withscores=True,
                              desc=True)
    neg_words = self.r.zrange('negative_wordcounts', 0, -1, withscores=True,
                              desc=True)
    assert pos_words and neg_words, 'Requires wordcounts to be stored in redis.'

    # build a conditional freqdist with the feature counts per label
    # (FreqDist.inc() was removed in NLTK 3; indexed increment is the current API)
    for word, count in pos_words:
        word_fd[word] += count
        label_word_freqdist['positive'][word] += count
    for word, count in neg_words:
        word_fd[word] += count
        label_word_freqdist['negative'][word] += count

    self.pickle_store('word_fd', word_fd)
    self.pickle_store('label_fd', label_word_freqdist)
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'rb'))
                print('Loaded from cache')
                print('bestwords count = %d' % len(bestwords))
                return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    # negatives are everything not labelled 1 (the original `labels != 0`
    # re-selected the positives)
    neg_contents = contents[labels != 1]

    pos_words = set()
    neg_words = set()

    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))

    for word in pos_words:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in neg_words:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda ws: ws[1],
                  reverse=True)[:limit]
    bestwords = set(w for w, s in best)

    print('all words count = %d' % len(word_scores))
    print('bestwords count = %d' % len(bestwords))

    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            with open(cache_path, 'wb') as f:
                pickle.dump(bestwords, f)
            print('Dumped to cache')

    return bestwords
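# Hedged usage sketch for get_bestwords, assuming numpy-style boolean indexing
# (contents and labels as numpy arrays) and that nltk's punkt tokenizer data
# is available for word_tokenize; the toy data is illustrative and cache is
# disabled to avoid touching the filesystem.
import numpy as np

contents = np.array(["great fun movie", "dull boring movie"])
labels = np.array([1, 0])
best = get_bestwords(contents, labels, limit=3, cache=False)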
def findName(self, mostCommon=5):
    if self.name != 0:
        self.cfdName = ConditionalFreqDist(
            (word.lower(), tag) for (word, tag) in self.wsj)
        return [self.name, self.cfdName[self.name].most_common(mostCommon)]
    else:
        print("invalid method")
def create_word_scores(pos_words, neg_words, pos_tag, neg_tag):
    pos_words = list(itertools.chain(*pos_words))
    neg_words = list(itertools.chain(*neg_words))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos_words:
        word_fd[word] += 1
        cond_word_fd[pos_tag][word] += 1
    for word in neg_words:
        word_fd[word] += 1
        cond_word_fd[neg_tag][word] += 1

    pos_word_count = cond_word_fd[pos_tag].N()
    neg_word_count = cond_word_fd[neg_tag].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[pos_tag][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[neg_tag][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def findAllTags(self, mostCommon=5):
    self.cfdTagAll = ConditionalFreqDist(
        (tag, word) for (word, tag) in self.wsj)
    for tag in sorted(self.cfdTagAll):
        # pass mostCommon through (the original called most_common() with no
        # argument, leaving the parameter unused)
        print(tag, self.cfdTagAll[tag].most_common(mostCommon))
    return dict(self.cfdTagAll)
def create_word_scores():
    posNegDir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    posdata = tp.seg_fil_senti_excel(
        posNegDir + '/pos_review.xlsx', 1, 1,
        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    negdata = tp.seg_fil_senti_excel(
        posNegDir + '/neg_review.xlsx', 1, 1,
        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)

    # build a frequency distribution of all words, and frequency
    # distributions of words within the positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    # find the number of positive and negative words, as well as the total
    # number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # build a dictionary of word scores based on the chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def high_information_words(labelled_words,
                           score_fn=BigramAssocMeasures.chi_sq,
                           min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            # n_ii: occurrences of the word within this label,
            # n_ix: occurrences of the word in total,
            # n_xi: total words in this label, n_xx: total words overall
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items()
                     if score >= min_score]
        high_info_words |= set(bestwords)  # in-place set union
    return high_info_words
def most_informative_words(corpus, categories=['dem', 'rep'], count=2500):
    fd = FreqDist()
    cond_fd = ConditionalFreqDist()
    word_counts = {}
    for cat in categories:
        for word in corpus.words(categories=[cat]):
            word = word.lower().strip(".!?:,/ ")
            if not word.isalpha() or word in stopset:
                continue
            fd[word] += 1
            cond_fd[cat][word] += 1
        word_counts[cat] = cond_fd[cat].N()

    total_word_count = sum(word_counts.values())

    word_scores = collections.defaultdict(int)
    for word, freq in fd.items():
        for cat in categories:
            cat_word_score = BigramAssocMeasures.chi_sq(
                cond_fd[cat][word], (freq, word_counts[cat]),
                total_word_count)
            word_scores[word] += cat_word_score

    informative_words = sorted(word_scores.items(), key=lambda ws: ws[1],
                               reverse=True)[:count]
    return set(w for w, s in informative_words)
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist(
        (x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
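# Hedged numeric sketch of the quantity Ae_kappa computes: the chance-agreement
# term of Cohen's kappa, Ae = sum over labels k of P(k | coder A) * P(k | coder B);
# kappa itself is then (Ao - Ae) / (1 - Ae). The counts below are illustrative.
nitems = 10.0
coder_label_counts = {'pos': {'A': 6, 'B': 5}, 'neg': {'A': 4, 'B': 5}}
Ae = sum((c['A'] / nitems) * (c['B'] / nitems)
         for c in coder_label_counts.values())
print(Ae)  # 0.6*0.5 + 0.4*0.5 = 0.5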
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    data = []
    for sent in train_data:
        for (word, tag) in sent:
            data.append((tag, word.lower()))
            self.states.append(tag)

    emission_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(set(self.states))
    return self.emission_PD, self.states
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data.
    # Don't forget to lowercase the observation, otherwise it mismatches the test data.
    # Do NOT add <s> or </s> to the input sentences.
    data = []
    for sent in train_data:
        sent_parsed = list(map(lambda x: (x[1], x[0].lower()), sent))
        data.extend(sent_parsed)

    # Compute the emission model
    cfdist = ConditionalFreqDist(data)
    self.emission_PD = ConditionalProbDist(cfdist, myProbDist1, 0.01)
    self.states = list(cfdist.conditions())
    return self.emission_PD, self.states
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Bracket each sentence with the boundary symbols. The original used
    # insert(-1, ('<\s>', '<\s>')), which places a malformed end symbol
    # *before* the last pair; append puts '</s>' at the end, as intended.
    for idx, s in enumerate(train_data):
        train_data[idx].insert(0, ('<s>', '<s>'))
        train_data[idx].append(('</s>', '</s>'))

    tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                     for s in train_data)
    data = itertools.chain.from_iterable(tagGenerators)

    transition_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, lidstone_estimator)
    return self.transition_PD
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # The data object should be an array of tuples of conditions and observations,
    # in our case the tuples will be of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
    data = []
    for sent in train_data:
        data.append(("<s>", sent[0][1]))  # start symbol
        for i in range(len(sent) - 1):
            data.append((sent[i][1], sent[i + 1][1]))
        data.append((sent[len(sent) - 1][1], "</s>"))  # end symbol

    transition_FD = ConditionalFreqDist(data)
    # same estimator as used for the emission model
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, est)
    return self.transition_PD
def __init__(self, labeled_sequence, states, transform, alpha1, alpha2,
             gammaPrior, gammaEmission):
    self.init = FreqDist()
    self.transition_bigram = ConditionalFreqDist()
    self.transition_unigram = FreqDist()
    self.emission = ConditionalFreqDist()
    # hyper-parameters for smoothing
    self.alpha1 = alpha1
    self.alpha2 = alpha2
    self.gammaPrior = gammaPrior
    self.gammaEmission = gammaEmission
    self.states = states
    self.symbols = []
    self.labeled_sequence = transform(labeled_sequence)
def create_word_bigram_scores():
    posdata = get_word('static/pos.txt')
    negdata = get_word('static/neg.txt')
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    posbigram_finder = BigramCollocationFinder.from_words(posWords)
    negbigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = posbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)
    negBigrams = negbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def __setTermsCHISQUARE__(self, size):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in self.reader.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in self.reader.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    wordScores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        wordScores[word] = pos_score + neg_score

    termScore = sorted(wordScores.items(), key=lambda ws: ws[1],
                       reverse=True)[:size]
    self.terms = [w for (w, s) in termScore]
def main():
    train_file_path = sys.argv[1]  # get the training file path from arguments
    tokens = get_training_data(train_file_path)  # get all the (word, tag) pairs
    tag_freq_dist = FreqDist(
        tag for (word, tag) in tokens)  # frequency of every tag
    word_tag_freq_dist = ConditionalFreqDist(
        (word, tag) for word, tag in tokens)
    proximity_pairs = nltk.bigrams(tokens)  # bigrams of (word, tag) pairs
    tag_tag_confidence = nltk.ConditionalFreqDist(
        (a[1], b[1]) for (a, b) in proximity_pairs
    )  # frequency with which each tag follows the previous tag

    test_file_path = sys.argv[2]  # get the test file path from arguments
    test_tokens = get_test_data(test_file_path)  # get all the test words

    result_tagged = []  # store each word and its assigned tag
    for i in range(len(test_tokens)):
        if i == 0:  # no previous tag for the first word
            result_tagged.append(
                (test_tokens[i],
                 get_tagged(test_tokens[i], prev_tag=None,
                            word_tag_freq_dist=word_tag_freq_dist,
                            tag_tag_confidence=tag_tag_confidence,
                            tag_freq_dist=tag_freq_dist)))
        else:
            result_tagged.append(
                (test_tokens[i],
                 get_tagged(test_tokens[i], result_tagged[i - 1][1],
                            word_tag_freq_dist=word_tag_freq_dist,
                            tag_tag_confidence=tag_tag_confidence,
                            tag_freq_dist=tag_freq_dist)))

        # Rule 1: tag every word that contains a number as "CD"
        if re.match(r"(\d+(\.\d+)?)", test_tokens[i]) is not None:
            result_tagged[-1] = (result_tagged[-1][0], "CD")
        # Rule 2: if the current tag is DT and the previous word is "all",
        # change the tag of "all" to "PDT" (pre-determiner)
        if result_tagged[-1][1] == "DT" and test_tokens[i - 1] == "all":
            result_tagged[-2] = ("all", "PDT")
        # Rule 3: if the current word is tagged NN (singular noun) and is
        # capitalized, change the tag to "NNP" (proper noun)
        if result_tagged[-1][1] == "NN" and test_tokens[i][0].isupper():
            result_tagged[-1] = (result_tagged[-1][0], "NNP")
        # Rule 4: if the current word is VBN (past participle) and the
        # previous word was capitalized, change the tag to VBD (past tense)
        if result_tagged[-1][1] == "VBN" and test_tokens[i - 1][0].isupper():
            result_tagged[-1] = (result_tagged[-1][0], "VBD")
        if len(result_tagged) >= 2:
            # Rule 5: if the current tag is VB and the previous word was
            # tagged DT (determiner), change the current tag to NN
            if result_tagged[-1][1] == "VB" and result_tagged[-2][1] == "DT":
                result_tagged[-1] = (result_tagged[-1][0], "NN")
            # Rule 6: if the current tag is NN and the previous word was
            # tagged TO, change the current tag to VB
            if result_tagged[-1][1] == "NN" and result_tagged[-2][1] == "TO":
                result_tagged[-1] = (result_tagged[-1][0], "VB")
            # Rule 7: if the current tag is NN and the previous word was
            # tagged MD (modal), change the current tag to VB
            if result_tagged[-1][1] == "NN" and result_tagged[-2][1] == "MD":
                result_tagged[-1] = (result_tagged[-1][0], "VB")

    for word, tag in result_tagged:  # print all the results to STDOUT
        print("%s %s" % (word, tag))
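# Hedged invocation sketch for main() above: it reads a tagged training file
# and an untagged test file from the command line and writes tagged output to
# STDOUT. The script and file names are illustrative.
#   python tagger.py train.tagged test.txt > test.tagged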
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Prepare the data.
    # The data object should be an array of tuples of conditions and observations,
    # in our case the tuples will be of the form (tag_(i), tag_(i+1)).
    # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
    train_data = [[('<s>', '<s>')] + sentence + [('</s>', '</s>')]
                  for sentence in train_data]
    tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                     for s in train_data)
    data = itertools.chain.from_iterable(tagGenerators)

    # Compute the transition model
    transition_FD = ConditionalFreqDist(data)
    self.transition_PD = ConditionalProbDist(transition_FD,
                                             self.lidstone_estimator)
    return self.transition_PD
def create_word_bigram_scores(pos_corpus, neg_corpus):
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for corpus in pos_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
    for corpus in neg_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def GetHighInformationWordsChi(num_bestwords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda ws: ws[1],
                  reverse=True)[:num_bestwords]
    bestwords = set(w for w, s in best)
    return bestwords
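# Hedged sketch of how the selected words might feed an NLTK classifier,
# assuming `from nltk.corpus import movie_reviews` as the function above does;
# the feature-function name and the 1000-word cut-off are illustrative.
bestwords = GetHighInformationWordsChi(1000)

def best_word_feats(words):
    # keep only the high-information words as boolean features
    return {word: True for word in words if word in bestwords}

feats = best_word_feats(movie_reviews.words('pos/cv000_29590.txt'))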
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data.
    # Don't forget to lowercase the observation, otherwise it mismatches the test data.
    # Do NOT add <s> or </s> to the input sentences.
    data = [(tag, word.lower()) for pairs in train_data
            for (word, tag) in pairs]

    # Compute the emission model
    emission_FD = ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
    for tag, word in data:
        if tag not in self.states:
            self.states.append(tag)
    return self.emission_PD, self.states
def __init__(self, n, alpha=0.1, brown_categories=None):
    '''
    Initializes NgramBase with a list of conditional frequency distributions
    representing N-grams, (N-1)-grams, ..., bigrams, unigrams from the
    Brown corpus.
    '''
    self.n = n

    if brown_categories is None:
        brown_categories = brown.categories()

    # one sample list per n-gram order; note that [[]] * n would alias a
    # single list n times, so every order would collect the same samples
    samples = [[] for _ in range(n)]
    sents = self._get_sentences(brown_categories)
    for sent in sents:
        sent = [word.lower() for word in sent]
        if sent[-1].isalpha():
            sent += ['.']
        for index, m in enumerate(range(n, 0, -1)):
            igrams = ngrams(sent, m)
            igrams = [(igram[0:m - 1], igram[-1]) for igram in list(igrams)]
            samples[index] += igrams

    # list of N-gram distributions with descending values of N
    self.grams = []
    for sample in samples:
        self.grams += [ConditionalFreqDist(sample)]

    # multiplier for each level of backoff
    self.alpha = alpha
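# Hedged sketch of the backoff lookup these distributions support, assuming an
# NgramBase-like instance `model` and a context of at least n-1 words; the
# function backoff_prob is hypothetical, not part of the original class.
def backoff_prob(model, context, word):
    # try the longest context first; at each unseen level, shrink the context
    # by one word and discount by alpha, down to the unigram distribution
    # (whose condition is the empty tuple)
    weight = 1.0
    for level, cfd in enumerate(model.grams):
        ctx = tuple(context[len(context) - (model.n - 1 - level):])
        if ctx in cfd and cfd[ctx].N() > 0:
            return weight * cfd[ctx].freq(word)
        weight *= model.alpha
    return 0.0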