def test_tabulate(self):
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    with pytest.raises(ValueError):
        empty.tabulate(conditions="BUG")  # nonexistent keys shouldn't be added
    self.assertEqual(empty.conditions(), [])
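The test methods in this collection (this one and the test_plot/test_increment entries below) assume a unittest.TestCase subclass with pytest importable; a minimal harness sketch, where the class name is an assumption:

import unittest
import pytest  # used by tests like test_tabulate above
from nltk import ConditionalFreqDist

class ConditionalFreqDistTests(unittest.TestCase):
    def test_conditions_start_empty(self):
        # a fresh distribution has no conditions
        self.assertEqual(ConditionalFreqDist().conditions(), [])

if __name__ == "__main__":
    unittest.main()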
def visualize_monthly_news_stats2(csvfolder=metacorpus.statspath,
                                  csvname=metacorpus.prunedmetafilename,
                                  imgoutpath=metacorpus.imgfolder,
                                  rescatmap=metacorpus.resourcecategorymap2):
    colldf = IOtools.readcsv(csvfolder + os.sep + csvname)
    numoftexts, _ = colldf.values.shape

    # daily news counts for resources
    cfddailyresourcecount = ConditionalFreqDist((colldf.loc[i, "date"], colldf.loc[i, "resource"].strip())
                                                for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfddailyresourcecount, csvfolder + os.sep + "cfddailyresourcecount2.csv",
                       colnames=['date', 'resource', 'count'])
    #cfdresourcesdaycount = ConditionalFreqDist((resource, day) for day in cfddailyresourcecount.conditions() for resource in list(cfddailyresourcecount[day]))

    # daily news counts for categories
    cfddailycategorycount = ConditionalFreqDist(
        (colldf.loc[i, "date"],
         "_".join(map(lambda x: str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]])))
        for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfddailycategorycount, csvfolder + os.sep + "cfddailycategorycount2.csv",
                       ["date", "category", 'count'])
    #cfdcatsdaycount = ConditionalFreqDist((category, date) for date in cfddailycategorycount.conditions() for category in list(cfddailycategorycount[date]))

    # visualize monthly --- assuming the dates are of the form yyyy-mm-dd (we wrote them that way when recording)
    cfdmonthlyresourcecount = ConditionalFreqDist((colldf.loc[i, "date"][:-3], colldf.loc[i, "resource"].strip())
                                                  for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfdmonthlyresourcecount, csvfolder + os.sep + "cfdmonthlyresourcecount.csv",
                       colnames=['month', 'resource', 'count'])
    #cfdresourcesmonthcount = ConditionalFreqDist((resource, month) for month in cfdmonthlyresourcecount.conditions() for resource in list(cfdmonthlyresourcecount[month]))
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "resourcebasednewscount"))
    visualize_monthly_cfd(cfd=cfdmonthlyresourcecount,
                          figuretitle="Monthly news count for each resource",
                          ylabel="news published",
                          imgoutpath=imgpath)

    cfdmonthlycategorycount = ConditionalFreqDist(
        (colldf.loc[i, "date"][:-3],
         "-".join(map(lambda x: str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]])))
        for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfdmonthlycategorycount, csvfolder + os.sep + "cfdmonthlycategorycount.csv",
                       ["month", "category", 'count'])
    #cfdcatsmonthcount = ConditionalFreqDist((category, month) for month in cfdmonthlycategorycount.conditions() for category in list(cfdmonthlycategorycount[month]))
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "categorybasednewscount"))

    for canoniccatname, rescatnamedct in rescatmap.iteritems():
        monthresourcepairs = []
        for resourcename, origcats in rescatnamedct.iteritems():
            for origcatname in origcats:
                #resourcename = rescat.split("-")[0]
                rescat = "-".join([resourcename, origcatname])
                for month in cfdmonthlycategorycount.conditions():
                    numofoccurrences = cfdmonthlycategorycount[month][rescat]
                    #print resourcename, " had ", numofoccurrences, " times texts in :", rescat, " during ", month
                    for i in range(numofoccurrences):
                        monthresourcepairs.append((month, resourcename))
        cfdmonthlyresourcecount_percat = ConditionalFreqDist(monthresourcepairs)
        print canoniccatname, resourcename, " * ", rescat, " : ", len(cfdmonthlyresourcecount_percat.conditions()), " ", cfdmonthlyresourcecount_percat.N()
        figuretitle = "Monthly news count of each resource over category " + canoniccatname.upper()
        visualize_monthly_cfd(cfdmonthlyresourcecount_percat, figuretitle, ylabel="news published", imgoutpath=imgpath)
def test_plot(self):
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    try:
        empty.plot(conditions="BUG")  # nonexistent keys shouldn't be added
    except:
        pass
    self.assertEqual(empty.conditions(), [])
def calculate_vector_spaces(self, k=16):
    cfd = ConditionalFreqDist((word, doc['document'])
                              for doc in self.mongo[CORPUS_CLN].find()
                              for word in self.interestingWords(doc['document']))
    cfd.tabulate()

    # matrix dimensions
    terms = [c for c in cfd.conditions()]  # conditions = words
    docs = sorted(set(v for c in cfd.conditions() for v in cfd[c]))
    self.log("terms: %s" % str(terms))
    self.log("docs: %s" % str(docs))

    term_by_doc_mat = np.zeros(shape=(len(terms), len(docs)))
    self.log("Term-by-ref-document matrix shape is: %d X %d" % (len(terms), len(docs)))
    for i, term in enumerate(terms):
        li = np.array([cfd[term][doc] for doc in docs])
        term_by_doc_mat[i] = li
    self.log("Matrix\n%s" % str(term_by_doc_mat))

    # perform singular value decomposition
    u, sigma, vh = self._do_svd(term_by_doc_mat, k)
    del term_by_doc_mat  # don't need the matrix anymore

    # map terms to svd space
    terms_space = np.zeros(shape=(len(terms), k))
    for i in xrange(len(terms)):
        vals = [u[i][j] * sigma[j] for j in range(k)]  # x-coord = row i, column 1
        terms_space[i] = np.array(vals)

    # map docs to svd space
    docs_space = np.zeros(shape=(len(docs), k))
    for i in xrange(len(docs)):
        vals = [vh[i][j] * sigma[j] for j in range(k)]
        docs_space[i] = np.array(vals)

    # store matrix data
    row = self.mongo['data'].find_one()
    if not row:
        row = {'terms': terms, 'documents': docs,
               'terms_subspace': terms_space.tolist(), 'docs_subspace': docs_space.tolist(),
               'u': u.tolist(), 'sigma': sigma.tolist(), 'vh': vh.tolist(),
               'date': datetime.utcnow()}
    else:
        row['terms'] = terms
        row['documents'] = docs
        row['terms_subspace'] = terms_space.tolist()
        row['docs_subspace'] = docs_space.tolist()
        row['u'] = u.tolist()
        row['sigma'] = sigma.tolist()
        row['vh'] = vh.tolist()
        row['date'] = datetime.utcnow()
    self.mongo['data'].save(row)
    self.log("Saved matrix data")
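calculate_vector_spaces calls a private helper self._do_svd that isn't shown; a minimal sketch, assuming it wraps numpy's SVD and truncates to rank k (the return convention, with the document factor transposed so the caller's vh[i][j] loop lines up with documents, is an assumption):

import numpy as np

def _do_svd(self, mat, k):
    # full SVD, then keep the top-k singular triplets
    u, sigma, vh = np.linalg.svd(mat, full_matrices=False)
    # u[:, :k] is (terms x k), sigma[:k] is (k,); returning the last
    # factor transposed gives a (docs x k) array, matching the caller
    return u[:, :k], sigma[:k], vh[:k, :].T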
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}
    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word],
                                   (freq, word_counts[condition]),
                                   total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
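A toy driver for findBestWords; the categories and word lists are made up for illustration:

from nltk import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

data = [
    ('pos', ['good', 'great', 'fun', 'good']),
    ('neg', ['bad', 'boring', 'fun']),
]
print(findBestWords(data, max_words=3))  # e.g. a set like set(['good', 'bad', 'boring'])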
class NgramModel(object):
    """A simple N-gram model."""

    def __init__(self, n, training_data):
        """Create an n order model using training_data."""
        # Set n and train
        self._n = n
        train_ngrams = _make_ngram_tuples(training_data, self._n)
        self._cfd = ConditionalFreqDist((context, event) for (context, event) in train_ngrams)
        self._estimators = dict((context, self._cfd[context])
                                for context in self._cfd.conditions())

    def prob(self, event, context):
        """Return the probability for an event in the provided context"""
        context = tuple(context)
        try:
            return self._estimators[context].freq(event)
        except KeyError:
            return 0.0

    def seqprob(self, seq):
        """Return the probability of a sequence."""
        prob = 1.0
        for context, event in _make_ngram_tuples(seq, self._n):
            prob *= self.prob(event, context)
        return prob

    def allngrams(self):
        """Return all N-grams observed by the model and their probabilities."""
        ngram_probs = ((event, context, self.prob(event, context))
                       for context, dist in self._estimators.items()
                       for event in dist)
        return sorted(ngram_probs, key=itemgetter(1))
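NgramModel relies on a _make_ngram_tuples helper that the snippet doesn't include; a minimal sketch, assuming (n-1)-token contexts left-padded with a start marker (the padding convention is an assumption):

from operator import itemgetter
from nltk import ConditionalFreqDist

def _make_ngram_tuples(tokens, n):
    # yield (context, event) pairs where context is the tuple of the
    # n-1 preceding tokens, left-padded with '<s>' markers
    padded = ['<s>'] * (n - 1) + list(tokens)
    return [(tuple(padded[i:i + n - 1]), padded[i + n - 1])
            for i in range(len(tokens))]

m = NgramModel(2, 'the cat sat on the mat'.split())
print(m.prob('cat', ('the',)))  # 0.5: 'the' is followed once by 'cat', once by 'mat'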
def constructTransitionMatrix(self, sourceFilesList: list):
    # construction of the transition matrix
    fileFinal = ""  # accumulate across all source files
    for fileName in sourceFilesList:
        file = open(fileName, 'r', encoding="windows-1256")
        for line in file:
            line = line.upper()
            if len(line) > 1:
                if not line.startswith("<S>"):
                    fileFinal += '<S> ' + line[:-1] + ' <E>\n'
                else:
                    fileFinal += line[:-1] + '\n'
        file.close()

    tokens = [el for el in re.split(r"[\s\n]+", fileFinal) if el != '']
    self.initialProbabilities = FreqDist([tokens[i] for i in range(1, len(tokens))
                                          if tokens[i - 1] == '<S>'])
    self.tags = list(set(tokens))
    self.bigramDist = FreqDist(list(bigrams(tokens)))

    Trigrams = list(trigrams(tokens))
    # condition on the *next* token; samples are the preceding bigrams
    cfd = ConditionalFreqDist((el[2], (el[0], el[1])) for el in Trigrams)
    # overwrite raw counts with relative frequencies, rounded to 6 places;
    # take N() before mutating, since assignments change the total
    for word in cfd.conditions():
        n = float(cfd[word].N())
        for bigram in list(cfd[word]):
            cfd[word][bigram] = round(cfd[word][bigram] / n, 6)
    self.TRANSITION_MATRIX = cfd
    return cfd
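A toy check of the reversed (next_token, (prev1, prev2)) keying that constructTransitionMatrix uses; the tag sequence is made up:

from nltk import ConditionalFreqDist, trigrams

tokens = ['<S>', 'DT', 'NN', 'VB', '<E>']
cfd = ConditionalFreqDist((w3, (w1, w2)) for (w1, w2, w3) in trigrams(tokens))
print(cfd['VB'][('DT', 'NN')])  # 1: 'VB' followed the bigram ('DT', 'NN') once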
def test_increment(self):
    # make sure that we can still mutate cfd normally
    text = "cow cat mouse cat tiger"
    cfd = ConditionalFreqDist()

    # create cfd with word length as condition
    for word in tokenize.word_tokenize(text):
        condition = len(word)
        cfd[condition][word] += 1

    self.assertEqual(cfd.conditions(), [3, 5])

    # incrementing previously unseen key is still possible
    cfd[2]['hi'] += 1
    self.assertEqual(set(cfd.conditions()), set([3, 5, 2]))  # new condition added
    self.assertEqual(cfd[2]['hi'], 1)  # key's frequency incremented from 0 (unseen) to 1
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this C{ContextTagger}'s L{_context_to_tag} table
    based on the given training data.  In particular, for each
    context C{I{c}} in the training data, set
    C{_context_to_tag[I{c}]} to the most frequent tag for that
    context.  However, exclude any contexts that are already
    tagged perfectly by the backoff tagger(s).

    The old value of C{self._context_to_tag} (if any) is discarded.

    @param tagged_corpus: A tagged corpus.  Each item should be
        a C{list} of C{(word, tag)} tuples.
    @param cutoff: If the most likely tag for a context occurs
        fewer than C{cutoff} times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context].inc(tag)  # NLTK 2 API; in NLTK 3 this is fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
def nltk_test_3():
    # For each token, count current word given previous word.
    # Create distribution object.
    # cfd = ConditionalFreqDist()
    # for word in word_tokenize(sent):
    #     condition = len(word)
    #     cfd[condition][word] += 1
    # Condition on the previous word (bigrams), so cfd[word] below is
    # the distribution of followers (requires nltk's bigrams).
    cfd = ConditionalFreqDist(bigrams(gutenberg.words('austen-persuasion.txt')))
    # Start predicting at the given word, say 'therefore'
    word = 'therefore'
    i = 1
    print cfd.N()
    print cfd.conditions()
    # Find all words that can possibly follow the current word and choose one at random
    while i <= 20:
        print word,
        lwords = list(cfd[word])
        follower = choice(lwords)
        word = follower
        i += 1
def learn(self, A):
    total_y = float(len(A))
    self.cls_fd = cls_fd = FreqDist()
    self.feature_fd = feature_fd = FreqDist()
    pairs = []
    for x, y in A:
        cls_fd.inc(y)
        for feature in set(get_words(x)):
            pairs.append((y, feature))
            feature_fd.inc(feature)
    cfd = ConditionalFreqDist(pairs)

    if DEBUG:
        print cfd
        print cfd.conditions()
        #cfd.tabulate(samples=['gbs', 'build', 'spec', 'repo', 'config'])
        cfd.tabulate()
        for author in cfd.conditions():
            print 'AUTHOR:', author
            for word, count in cfd[author].items():
                print '%5d %20s' % (count, word)

    self.voc = voc = feature_fd.keys()

    self.cls_feature_prob = cls_feature_prob = {}
    self.cls_and_feature_prob = cls_and_feature_prob = {}
    for cls, total in cls_fd.items():
        fd = cfd[cls]
        cls_feature_prob[cls] = wc = {}
        for word in voc:
            if word in fd:
                cls_feature_prob[(cls, word)] = float(fd[word]) / total
                cls_and_feature_prob[(cls, word)] = float(fd[word]) / total_y
            else:
                cls_feature_prob[(cls, word)] = 1. / total
                cls_and_feature_prob[(cls, word)] = 1. / total_y

    self.feature_prob = feature_prob = {}
    for word, count in feature_fd.items():
        feature_prob[word] = count / total_y
def subword_char_ngram(text_fileid_map, n):
    corpus_ngramitems = []
    for tid, text in text_fileid_map.iteritems():
        words = text.split()
        ngramitems = []
        for w in words:
            ngramitems.extend(ngrams(w, n))
        for ngramitem in ngramitems:
            corpus_ngramitems.append((tid, ngramitem))
    cfd = ConditionalFreqDist(corpus_ngramitems)
    print cfd.N(), " ", len(cfd.conditions())
    return cfd
def postags(self, pos=None, sort=False, top=0, universal_tagset=False, ret_cond=False):
    '''Build frequency distributions, or frequency-sorted lists, of parts of speech'''
    def merge(tags):
        result = FreqDist()
        for tag in tags:
            result += cfd[tag]
        return result

    maps = {
        'NOUN': {'NN', 'NNS', 'NNP', 'NNPS'},
        'VERB': {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
        'ADJ': {'JJ', 'JJR', 'JJS'},
        'ADV': {'RB', 'RBR', 'RBS'},
    }
    cfd = ConditionalFreqDist()
    for sent in self._sents:
        #tokens = sent.untagging()
        tokens = sent.tags
        for tok, tag, lemma in tokens:
            cfd[tag][lemma.lower()] += 1
    cond = cfd.conditions()
    result = cfd
    if pos:
        if not universal_tagset and pos in maps:
            result = merge(maps[pos])
        else:
            result = cfd[pos]
        if top:
            result = _top(result, top)
    else:
        result = _sort(result, sort)
    if ret_cond:
        result = result, cond
    return result
def init_prob_unit():
    # initialize uniform prob distribution to t(e|f)
    print("Initializing Uniform Prob distribution")
    N = len(de_inp)
    if N != len(en_inp):
        print("number of lines in src and target don't match!")
    ten_de = CondFDist()
    for num in range(N):
        for de_word in de_inp[num].split():
            for en_word in en_inp[num].split():
                ten_de[de_word].inc(en_word)
    # make probs uniform
    for de_word in ten_de.conditions():
        for key in ten_de[de_word].keys():
            ten_de[de_word][key] = 1.0 / len(ten_de[de_word])
            # print(ten_de[de_word][key])
    return ten_de
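init_prob_unit reads module-level parallel lists de_inp/en_inp and an NLTK-2-era CondFDist alias (FreqDist.inc was removed in NLTK 3, so this only runs on old NLTK); a toy setup with made-up data for checking the uniform initialization:

de_inp = ['das haus', 'das buch']
en_inp = ['the house', 'the book']
ten_de = init_prob_unit()
print(ten_de['das']['the'])  # 1/3: uniform over the co-occurring words {'the', 'house', 'book'}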
def visualize_monthly_cfd(cfd, figuretitle, ylabel, imgoutpath):
    cfd_reverse = ConditionalFreqDist((entity, month)
                                      for month in cfd.conditions()
                                      for entity in list(cfd[month]))
    months = cfd.conditions()
    months.sort()
    barlabels = cfd_reverse.conditions()
    #print months
    print barlabels

    yitemsmatrix = []
    for entity in barlabels:
        row = []
        for month in months:
            row.append(cfd[month][entity])
        yitemsmatrix.append(row)

    if len(barlabels) == 0 or len(yitemsmatrix) == 0:
        return

    yitemsmatrix = np.array(yitemsmatrix)
    #yitemsmatrix = yitemsmatrix.T
    print yitemsmatrix.shape
    colors = plotter.get_n_colors(len(barlabels))
    months = map(lambda x: str(x), months)

    # partition the figure in case the x axis gets too large by the number of months
    numofxitems = 5
    numoffigures = (len(months) / numofxitems) + 1
    for i in range(numoffigures):
        matrix = yitemsmatrix[:, (i * numofxitems):((i + 1) * numofxitems)]
        print matrix
        xlabels = months[(i * numofxitems):((i + 1) * numofxitems)]
        # save fig. pass img path with i
        figurename = figuretitle + " " + str(i)
        cfdplotter.multiplebargraphs(barlabels, matrix.tolist(), colors, figurename,
                                     xlabels, ylabel, imgpath=imgoutpath)
from load_data import get_df, select_columns
from itertools import combinations
from nltk import ConditionalFreqDist

# get the data as a dataframe
df = get_df(shortname='clean_apx')
mask_data_source = df['DataSource'] == 'APX'
df_select = df[mask_data_source]

# choose subset of columns and cast all values as string
df_select = df_select[select_columns].astype(str)

# choose a smaller subset of columns to analyse
report_columns = select_columns[1:5]

# a list of all pairwise combinations
combo_count = 2
groupby_columns = list(combinations(report_columns, combo_count))

# create a list of tuples
groupby_column = list(groupby_columns[0])
arr = df_select[list(groupby_column)].values
pairs = list(tuple(map(tuple, arr)))

# and now for the good stuff
cfd = ConditionalFreqDist(pairs)
conditions = cfd.conditions()

import pdb
pdb.set_trace()
def inspect(self, missed):
    """
    Inspect a testing session, and print data about tag accuracy

    :param missed: list of tuples of missed tags like:
        (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
    """
    # create a CFD so we can examine a matrix of incorrect vs correct tags
    # ms[1][1] = tag of a gold_tagged_word
    # ms[0][1] = tag of an hmm_tagged_word
    cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)

    # initialize a hash to store mistakes by frequency
    mistakes = {}

    # print a table showing mistake frequency
    cfd.tabulate()
    msg("\n")

    # loop through mistake frequencies by gold standard tag, i.e., if we are
    # examining gold-standard 'IN', count what we incorrectly tagged it as
    conds = cfd.conditions()
    for g_tag in conds:
        for hmm_tag in cfd[g_tag].keys():
            # how many times did we incorrectly say g_tag was hmm_tag?
            count = cfd[g_tag][hmm_tag]
            # add these mistakes to the count
            if count not in mistakes.keys():
                mistakes[count] = []
            mistakes[count].append((hmm_tag, g_tag))

    # get a list of all mistake types that occurred over a threshold, worst first
    mistake_counts = set([count for (count, mistake_set) in
                          mistakes.iteritems() if count > Tagger.mistake_threshold])
    mistake_counts = reversed(sorted(mistake_counts))

    # now create a list of mistake types to show the user, i.e., loop
    # through all types and if they are of a high-frequency type, add to list
    mistakes_to_halt = []
    for count in mistake_counts:
        mistake_set = mistakes[count]
        for mistake_tuple in mistake_set:
            mistakes_to_halt.append(mistake_tuple)
            msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0],
                                              mistake_tuple[1]))
    msg("\n")

    # create separators used when outputting missed word contexts
    sep_big = "---------------------------------------------------\n"
    sep_small = "\n-----------------------------------------\n"

    # loop through individual mistakes and, if they match the kind of error
    # we want to halt for, show the user the mistake as well as the sentence
    # context for both the gold-standard sentence and the hmm-tagged sentence
    response = None
    for missed_set in missed:
        if response not in ['q', 'Q']:
            (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent,
             gold_tagged_sent) = missed_set
            should_halt = False
            # determine whether the current mistake matches a mistake type
            # we want to halt for
            for pair in mistakes_to_halt:
                if hmm_tagged_word[1] == pair[0] and \
                        gold_tagged_word[1] == pair[1]:
                    should_halt = True
            if should_halt:
                msg("%sTagged '%s' with %s when it should have been %s.%s" %
                    (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],
                     gold_tagged_word[1], sep_small))
                msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in
                                          gold_tagged_sent])))
                msg(sep_small)
                msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in
                                          hmm_tagged_sent])))
                # get user input to decide whether to keep going
                response = raw_input("\n\nEnter to continue, Q to quit: ")
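inspect writes through a msg helper and a Tagger.mistake_threshold class attribute that aren't shown; a plausible stand-in for the former (an assumption, not the author's code):

import sys

def msg(text):
    # write without an implicit newline, as the calls above expect
    sys.stdout.write(text)
    sys.stdout.flush()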
if token['lemma']:
    lemma_pos = token['lemma'] + '.' + get_wordnet_pos(token['pos'])
    lemma_pairs.append((token['lemma'], short_tag))
    lemma_long_pairs.append((token['lemma'], long_tag))
    tagged_pairs.append((token['textlc'], short_tag))

# Print vocabularies for each tag type
for tag_type in tag_types:
    vocabulary_cfd = ConditionalFreqDist([(lemma, long_tag) for (lemma, long_tag)
                                          in lemma_long_pairs if long_tag == tag_type])
    print vocabulary_cfd.tabulate()

#events_cfd = ConditionalFreqDist(tagged_pairs)
# Conditional frequency distribution for (lemma, tag) pairs
events_cfd = ConditionalFreqDist(lemma_pairs)
unambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) < 2]
ambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) > 1]

print "Unambiguous Words"
print events_cfd.tabulate(conditions=unambiguous_words)
print "Ambiguous Words"
print events_cfd.tabulate(conditions=ambiguous_words)

sum_ambiguous_words = sum(events_cfd[word].N() for word in ambiguous_words)
sum_unambiguous_words = sum(events_cfd[word].N() for word in unambiguous_words)
total = sum_ambiguous_words + sum_unambiguous_words
percentage = float(sum_ambiguous_words) / float(total)
def find_word_matrices(self, newsidlist, processcontent=True, prepend="content"):
    dateroots = []
    datePOStag = []
    titleexclamation = [("newsid", "title_exclamation")]
    textPOStag = []
    textroots = []
    textrootsWpostag = []
    textliterals = []

    print prepend, " processing:"
    for newsid in newsidlist:
        print "newsid ", newsid
        filepath = extractnewsmetadata.newsid_to_filepath(newsid)
        content, title, date = extractnewsmetadata.get_news_article2(filepath)
        text = ""
        if processcontent:
            text = content
        else:
            text = title
            if "!" in title:
                titleexclamation.append((newsid, 1))
            else:
                titleexclamation.append((newsid, 0))
        words = texter.getwords(text)
        lemmata = SAKsParser.lemmatize_lexicon(words)
        for (literal, literalPOS, root, rootPOS) in lemmata:
            root = texter.cleanword(root)
            if (len(root) > 0) or (not root.isspace()):
                #print root,
                textPOStag.append((newsid, literalPOS))
                textroots.append((newsid, root))
                textrootsWpostag.append((newsid, root + " Wpostag " + rootPOS))
                textliterals.append((newsid, literal + " Wpostag " + literalPOS))
                dateroots.append((date, root))
                datePOStag.append((date, literalPOS))

    cfd_dateroots = ConditionalFreqDist(dateroots)
    cfd_datepostag = ConditionalFreqDist(datePOStag)
    cfd_textpostag = ConditionalFreqDist(textPOStag)
    cfd_textroots = ConditionalFreqDist(textroots)
    cfd_textrootWpostag = ConditionalFreqDist(textrootsWpostag)
    cfd_textliterals = ConditionalFreqDist(textliterals)

    print "some id's", cfd_textroots.conditions()

    cfd_roottext = ConditionalFreqDist((word, docid)
                                       for docid in cfd_textroots.conditions()
                                       for word in list(cfd_textroots[docid]))

    # write each cfd to csv, with condition items as columns (to fix)
    csvpath = os.path.join(self.matrixpath, prepend + "-dateroot.csv")
    CFDhelpers.cfd_to_matrix(cfd_dateroots, csvpath)

    csvpath = os.path.join(self.matrixpath, prepend + "-datepostag.csv")
    CFDhelpers.cfd_to_matrix(cfd_datepostag, csvpath)

    csvpath = os.path.join(self.matrixpath, prepend + "-postagCOUNT.csv")
    CFDhelpers.cfd_to_matrix(cfd_textpostag, csvpath)

    termcountcsvpath = os.path.join(self.matrixpath, prepend + "termCOUNT.csv")
    CFDhelpers.cfd_to_matrix(cfd_textroots, termcountcsvpath)

    tfidfcsvpath = os.path.join(self.matrixpath, prepend + "termTFIDF.csv")
    texter.compute_tfidf_ondisc(termcountcsvpath, tfidfcsvpath)

    csvpath = os.path.join(self.matrixpath, prepend + "-rootcountindex.csv")
    CFDhelpers.cfd_to_matrix(cfd_roottext, csvpath)

    csvpath = os.path.join(self.matrixpath, prepend + "rootWpostagCOUNT.csv")
    CFDhelpers.cfd_to_matrix(cfd_textrootWpostag, csvpath)

    csvpath = os.path.join(self.matrixpath, prepend + "literalWpostagCOUNT.csv")
    CFDhelpers.cfd_to_matrix(cfd_textliterals, csvpath)

    # continue with the remaining csv files later
    if not processcontent:
        print "keep exclamation !"
        IOtools.tocsv_lst(titleexclamation, os.path.join(self.matrixpath, prepend + "-exclamation.csv"))
class HMMTagger(object):
    global START_TAG
    START_TAG = "<s>"
    global END_TAG
    END_TAG = "</s>"
    global UNK
    UNK = "UNK"

    def __init__(self, training_sents, n=2, smoothing=None):
        self.n = n
        self.smoothing = smoothing
        self.tagged_sents = self.addStartAndEndMarkers(training_sents)  # this takes a lot of time
        self.train()  # this takes almost 4 seconds

    def train(self):
        """ Construct the conditional frequencies and probabilities """
        # extract tags from sentences
        tags = [tag for (_, tag) in self.tagged_sents]
        self.replaceUnique()
        self.emission_frequencies = ConditionalFreqDist([tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())

        # emission - probability that a certain tag is a certain word
        # e.g. probability that a VB is 'race'
        self.emission_probabilities = ConditionalProbDist(self.emission_frequencies, MLEProbDist)
        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(self.transition_frequencies, MLEProbDist)
        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)

    def replaceUnique(self):
        """ Replaces unique words with the UNK label """
        word_frequencies = FreqDist([word for (word, _) in self.tagged_sents])
        self.lexicon_size = len(word_frequencies)
        hap = set(word_frequencies.hapaxes())
        res = [(UNK, tag) if word in hap else (word, tag) for (word, tag) in self.tagged_sents]
        self.tagged_sents = res

    def addStartAndEndMarkers(self, training_sents):
        """ returns a flat list of tokens """
        res = []
        for sent in training_sents:
            res += [(START_TAG, START_TAG)]
            res += sent
            res += [(END_TAG, END_TAG)]
        return res

    def get_transition_probability(self, prev_tag, tag):
        """ Returns probability of prev_tag being followed by tag.
        Performs smoothing if specified in the command line."""
        if self.smoothing == "LAP":
            prev_tag_count = self.transition_frequencies[prev_tag].N()
            bigram_count = self.transition_frequencies[prev_tag].freq(tag) * prev_tag_count
            return (bigram_count + 1) / (1.0 * prev_tag_count + self.lexicon_size)
        else:
            return self.transition_probabilities[prev_tag].prob(tag)

    def viterbi_col(self, word, prev=None):
        """ General algorithm for a viterbi table column.
        This is only called once for every word. """
        vit = {}
        back = {}
        for tag in self.word_tag_frequencies[word].keys():
            if tag != START_TAG:
                if prev:
                    best_prev_tag = self.get_prev_tag(tag, prev, word)
                    transition_prob = self.get_transition_probability(best_prev_tag, tag)
                    vit[tag] = prev[best_prev_tag] * transition_prob * \
                        self.emission_probabilities[tag].prob(word)
                    back[tag] = best_prev_tag
                else:
                    transition_prob = self.get_transition_probability(START_TAG, tag)
                    vit[tag] = transition_prob * self.emission_probabilities[tag].prob(word)
                    back[tag] = START_TAG
        return (vit, back)

    def viterbi(self, words_to_tag):
        """ Viterbi algorithm """
        res = []  # a list of dicts denoting probability of best path to get to state q after scanning input up to pos i
        backpointers = []  # a list of dicts
        for wordindex in range(len(words_to_tag)):
            current_word = words_to_tag[wordindex]
            if self.is_unknown(current_word):
                current_word = UNK
            if wordindex == 0:
                vit, back = self.viterbi_col(current_word)
            else:
                vit, back = self.viterbi_col(current_word, res[-1])
            res.append(vit)
            backpointers.append(back)
        prev = res[-1]
        backpointers.reverse()
        return self.construct_solution(backpointers, prev)

    def is_unknown(self, word):
        """ Checks if the word is unknown """
        for tag in set(self.emission_probabilities.conditions()):
            pr = self.emission_probabilities[tag]
            if pr.prob(word) > 0:
                return False
        return True

    def construct_solution(self, back, prev):
        """ Constructs solution by following the back pointers on a ready viterbi table """
        current_best_tag = self.get_prev_tag(END_TAG, prev)
        best_seq = [END_TAG, current_best_tag]
        for p in back:
            to_append = p[current_best_tag]
            best_seq.append(to_append)
            current_best_tag = p[current_best_tag]
        best_seq.reverse()
        return best_seq

    def get_prev_tag(self, tag, prev, curr_word=None):
        """ Finds a previous tag A for the current tag B s.t. the probability
        of AB was the highest for the current word.
        Called for every word and every tag """
        best_prev = prev.keys()[0]  # assign at least something to avoid None exception
        best_prob = 0.0
        for prevtag in prev.keys():
            # find the maximum probability
            prob = prev[prevtag] * self.transition_probabilities[prevtag].prob(tag)
            if curr_word:
                prob *= self.emission_probabilities[tag].prob(curr_word)
            if prob > best_prob:
                best_prob = prob
                best_prev = prevtag
        return best_prev

    def tag_sents(self, test_sents):
        """Tag the given text sentence by sentence"""
        res = []
        for sent in test_sents:
            res.append(self.viterbi(sent)[1:-1])  # remove start and end tags
        return res
from nltk import ConditionalFreqDist
from nltk.corpus import brown

# cfd = ConditionalFreqDist(
#     (genre, word)
#     for genre in brown.categories()
#     for word in brown.words(categories=genre)
# )
# print(len(cfd))  # 15 (categories)

cfd = ConditionalFreqDist((genre, word)
                          for genre in ['news', 'romance']
                          for word in brown.words(categories=genre))
print(cfd)  # 2 (categories)
print(cfd.conditions())
print(cfd['romance'])  # FreqDist with 8452 samples and 70022 outcomes
#!/usr/bin/python
# coding: utf-8
# 2013/03/20
from nltk import ConditionalFreqDist

cfdist = ConditionalFreqDist(pairs)     # build a frequency distribution from the (condition, event) pairs in `pairs`
cfdist.conditions()                     # list of conditions, sorted alphabetically
cfdist['condition']                     # the frequency distribution for the given condition
cfdist['condition'][sample]             # frequency of `sample` under the given condition
cfdist.tabulate()                       # tabulate the conditional frequency distribution
cfdist.tabulate(samples, conditions)    # tabulation limited to the given samples and conditions
cfdist.plot()                           # plot the conditional frequency distribution
cfdist.plot(samples, conditions)        # plot limited to the given samples and conditions
cfdist1 < cfdist2                       # test if samples in cfdist1 occur less frequently than in cfdist2
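The cheat sheet above assumes a `pairs` variable; a small self-contained run, with made-up pairs:

from nltk import ConditionalFreqDist

pairs = [('fruit', 'apple'), ('fruit', 'pear'), ('fruit', 'apple'), ('veg', 'leek')]
cfdist = ConditionalFreqDist(pairs)
print(cfdist.conditions())       # ['fruit', 'veg']
print(cfdist['fruit']['apple'])  # 2
cfdist.tabulate()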
def pos_percentages(words, tag='NN'):
    cfd = ConditionalFreqDist((tag, 1) for word, tag in tagger.tag(words))
    relevant_tags = filter(lambda c: re.match(tag, c), cfd.conditions())
    sum_tags = sum([cfd[c].N() for c in relevant_tags])
    return float(sum_tags) / float(len(words))
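pos_percentages expects a `tagger` object in scope; a sketch of a stand-in built on nltk.pos_tag (the wrapper class is an assumption, not the author's tagger):

import re
import nltk
from nltk import ConditionalFreqDist

class _PosTagger(object):
    def tag(self, words):
        return nltk.pos_tag(words)  # requires the averaged_perceptron_tagger data

tagger = _PosTagger()
print(pos_percentages("the cat sat on the mat".split(), tag='NN'))  # fraction of NN* tokens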
def test_plot(self):
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    empty.plot(conditions=["BUG"])  # nonexistent keys shouldn't be added
    self.assertEqual(empty.conditions(), [])
print(set([len(word) for word in themes]))  # {1, 2, 3, 4, 5, 6}; theme lengths run up to 6 characters
print([word for word in themes if len(word) == 6])  # ['iphone', 'iphone', 'iphone', 'iphone']; they're English, surprisingly...

# sentiment keywords
sen_words = [word for words in df[df.columns[3]] for word in words.split(';') if len(word)]
print(len(sen_words))  # 45648
print(set([len(word) for word in sen_words]))  # {1, 2, 3, 4, 5, 6}; keyword lengths run up to 6 characters
print([word for word in sen_words if len(word) == 6])  # ['没有物美价廉', '不会心平气和', '不是别出心裁', '不是结实耐用', ...]; you'll lose friends annotating like this...

# sentiment polarity values, positive/negative
anls = [word for words in df[df.columns[4]] for word in words.split(';') if len(word)]
print(len(anls))  # 45648

## combine sen_words and anls: pair sentiment words with their polarity values,
## to find words annotated with more than one polarity
print(sen_words[:10])  # ['实惠', '快', '也好', '太长', '太贵', '不方便', '差', '无语', '满意', '好']
print(anls[:10])  # ['1', '1', '1', '-1', '-1', '-1', '-1', '-1', '1', '1']
con = ConditionalFreqDist(zip(sen_words, anls))
print(con)  # <ConditionalFreqDist with 3032 conditions>; identical keys are merged
print([condition for condition in con.conditions() if len(con[condition].keys()) > 1])
# ['不容易', '不高']; only two words carry more than one polarity value (-1, 0, 1)

## save theme, sentiment_word and anls to disk
with open('./tmp_dataset/BDCI2017-taiyi/theme.txt', 'w') as f:
    f.write('\n'.join(themes))
with open('./tmp_dataset/BDCI2017-taiyi/word.txt', 'w') as f:
    f.write('\n'.join(sen_words))
with open('./tmp_dataset/BDCI2017-taiyi/word_score.txt', 'w') as f:
    f.write('\n'.join(word + ' ' + anls for word, anls in zip(sen_words, anls)))

##################################################################
## Part 2: preprocessing; split the DataFrame into four lists and save each
# df = xlsx.parse("Sheet1")  # NaN was replaced with NULL above, so a re-import was considered; it turned out unnecessary, just drop NULL when using the data
contents = [str(word) for word in list(df[df.columns[1]].values)]; print(contents[:10])
themes = [str(word) for word in list(df[df.columns[2]].values)]; print(themes[:10])
words = [str(word) for word in list(df[df.columns[3]].values)]; print(words[:10])
anls = [str(word) for word in list(df[df.columns[4]].values)]; print(anls[:10])
print('len of contents:', len(contents))  # len of contents: 20000
print('len of words:', len(words))  # len of words: 20000

## add themes and words to the jieba segmentation dictionary
dict_themes = [word for line in themes for word in line.strip().split(';') if len(word) and word != 'NULL']
#!/usr/bin/python
# coding=utf-8
from nltk import ConditionalFreqDist
from nltk.corpus import brown

words = brown.tagged_words(tagset='universal')

# Which word has the largest number of distinct POS tags?
maximumTagNumber = 0
result = ''
cfd = ConditionalFreqDist((word.lower(), tag) for (word, tag) in words)
for word in cfd.conditions():
    if len(cfd[word]) > maximumTagNumber:
        maximumTagNumber = len(cfd[word])
        result = word + ' (' + ', '.join(tag for (tag, _) in cfd[word].most_common()) + ')'
    elif len(cfd[word]) == maximumTagNumber:
        result += '\n' + word + ' (' + ', '.join(tag for (tag, _) in cfd[word].most_common()) + ')'
print result
from nltk import FreqDist, ConditionalFreqDist
from nltk.corpus import brown

fd = FreqDist()
cfd = ConditionalFreqDist()

# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# The most frequent tag is ...
print(fd.max())

# Initialize a list to hold (numtags, word) tuples
wordbins = []

# Append each (n(unique tags for token), token) tuple to the list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))

# Sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)

# The token with the max. no. of tags is ...
print(wordbins[0])

# masculine pronouns
male = ['he', 'his', 'him', 'himself']
# feminine pronouns
female = ['she', 'hers', 'her', 'herself']

# initialize counters
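The snippet breaks off at the counter initialization; a minimal sketch of the gendered-pronoun tallies it appears to set up (the counter names are assumptions):

# total occurrences of each pronoun = sum of its per-tag counts in cfd
n_male = sum(cfd[p].N() for p in male)
n_female = sum(cfd[p].N() for p in female)
print(n_male, n_female)  # masculine vs feminine pronoun counts in Brown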
from scipy.sparse import lil_matrix, csr_matrix

fids = [reuters.fileids()[0]]
docs = [[nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(reuters.raw(fileids=[fid]))]
        for fid in fids]
word_list = sorted(set(word for doc in docs for sent in doc for word in sent))
word_dict = dict((word, i) for i, word in enumerate(word_list))
idx_docs = [[[word_dict[word] for word in sent] for sent in doc] for doc in docs]
trigram_docs = [[discount.ngrams2(sent, 3) for sent in doc] for doc in idx_docs]
tri_fd = CFreqDist(gram for doc in trigram_docs for sent in doc for gram in sent)
l = len(word_list)

# maximum likelihood estimate: float(tri_fd[cond][word]) / tri_fd[cond].N()
A = lil_matrix(((l + 1) ** 2, l + 1))
for cond in tri_fd.conditions():
    n = float(tri_fd[cond].N())
    for word, val in tri_fd[cond].items():
        A[cond[0] * (l + 1) + cond[1], word] = val / n

A = lil_matrix(((l + 1) ** 2, l + 1), dtype=int)
for cond in tri_fd.conditions():
    for word, val in tri_fd[cond].items():
        A[cond[0] * (l + 1) + cond[1], word] = val

# maximum likelihood estimation
B = A.tocsr()
B.toarray().astype(float) / B.sum(1)

# additive smoothing
a = 0.5
# coding: utf-8
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.corpus import toolbox
from nltk.corpus import udhr

##################################################################
## A simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical conditions are merged
print(tmp_Con.tabulate())
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; shows how many polarity values each word carries
print(len(tmp_Con['也好'].keys()))  # 1; duplicates are already set()-deduplicated
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1']

##################################################################
## Grouping words by genre in the Brown corpus
print(brown.categories())
# ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))  # categories=genre cannot be omitted here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen modal verbs
print(cfd.tabulate(conditions=genres, samples=modals))
# Observe that the most frequent modal in the news genre is will, while the most frequent modal in the romance genre is could