def train(self, corpus, alpha=0.1):
    self.set_attributes = list()
    for key in corpus[0][0].keys():
        self.set_attributes.append(key)
    self.labels = list(set([l for w, l in corpus]))
    self.count_wl = dict()
    for i in self.set_attributes:
        self.count_wl[i] = dict()
        for l in self.labels:
            self.count_wl[i][l] = defaultdict(lambda: 0)
    self.probability_wl = dict()
    for i in self.set_attributes:
        self.probability_wl[i] = dict()
        for l in self.labels:
            self.probability_wl[i][l] = defaultdict(lambda: 0)
    self.count_l = defaultdict(lambda: 0)
    for fs, l in corpus:
        for key, val in fs.items():
            self.count_wl[key][l][val] += 1
        self.count_l[l] += 1
    for key in self.labels:
        self.probability_l[key] = float(self.count_l[key]) / np.sum(list(self.count_l.values()))
def makePercentDict():
    decimalDict = nltk.defaultdict(lambda: 0)
    exponentDict = nltk.defaultdict(lambda: 0)
    lnDict = nltk.defaultdict(lambda: 0)
    getcontext().prec = 20
    for gram in allDict:
        if allDict.has_key(gram):
            value = allDict.get(gram)
            new_value = Decimal((float(value) / float(total)))
            # full decimal, e.g. 0.000001750280482447312181777129785048053950645590955950691
            exponents = expm1(new_value)  # note: expm1 returns exp(x) - 1, e.g. 1.75028201419e-06
            # logged_value = log(float(new_value))  # Math Domain Error because log(0)
            if new_value == 0:
                logged_value = "Undefined"
            else:
                logged_value = Decimal(new_value).ln()  # natural log
            decimalDict[gram] = new_value
            exponentDict[gram] = exponents
            lnDict[gram] = logged_value
    return decimalDict, exponentDict, lnDict
def __init__(self, corpus=None, alpha=0.1):
    # if corpus != None:
    self.c_uvs = defaultdict(lambda: 0)
    self.c_uv = defaultdict(lambda: 0)
    self.c_xs = defaultdict(lambda: 0)
    self.c_s = defaultdict(lambda: 0)
    self.alpha = alpha
def constructEmissionMatrix(self, sourceFilesList: list):
    # construction of the emission matrix
    emission = defaultdict(dict)
    for tag in NE_TAG_lABELS:
        emission[tag] = defaultdict(float)
    for fileName in sourceFilesList:
        file = open(fileName, 'r', encoding='windows-1256')
        for line in file:
            words = re.split("\s+", line)
            entite = ''
            for word in words:
                word = self.stemmer.stem(word)
                if (re.findall('[A-Z]+', word) == []):
                    entite = word
                    continue
                if not word in emission:
                    emission[word] = defaultdict(float)
                emission[word][entite] += 1
        file.close()
    for tag in emission.keys():
        somme = 0.0
        for value in emission[tag].values():
            somme += value
        for word in emission[tag].keys():
            emission[tag][word] = round(float("{0:.6f}".format(emission[tag][word] / somme)), 6)
    self.EMISSION_MATRIX = emission
    return emission
def write_lex_file_srilm(o, corpus, lex, lang, iter, order, smoothing, lm3):
    get_lm_model_srilm(corpus, lang, order, 'addsmooth', smoothing)
    outfile = open(o, "w")
    lengths_needed = nltk.defaultdict(int)
    freqs = nltk.defaultdict(list)
    for c in lex:
        outfile.write(",".join(str(x) for x in [-1, c[0], c[1], c[2], c[3], c[15]]) + "\n")
        lengths_needed[len(c[0])] += 1
        freqs[len(c[0])] += [c[1]]
    for i in range(iter):
        temp_freqs = nltk.defaultdict(list)
        for k in freqs.keys():
            temp_freqs[k] = list(freqs[k])
            random.shuffle(temp_freqs[k])
        gen_lex, freqs_lex = generate_correct_number_srilm(corpus, lang, lengths_needed, temp_freqs, order)
        mps_lex = get_mps(gen_lex)
        for w in gen_lex:
            outfile.write(",".join(str(x) for x in [i, w, freqs_lex[w], lm3.evaluate(w)[2], mps_lex[w], len(w)]) + "\n")
        print "generated lexicon: ", str(i)
    outfile.close()
    return o
def most_informative_features(self, n=100):
    """
    Return a list of the 'most informative' features used by this
    classifier.  For the purpose of this function, the
    informativeness of a feature C{(fname,fval)} is equal to the
    highest value of P(fname=fval|label), for any label, divided by
    the lowest value of P(fname=fval|label), for any label:

      max[ P(fname=fval|label1) / P(fname=fval|label2) ]
    """
    # The set of (fname, fval) pairs used by this classifier.
    features = set()
    # The max & min probability associated w/ each (fname, fval)
    # pair.  Maps (fname,fval) -> float.
    maxprob = defaultdict(lambda: 0.0)
    minprob = defaultdict(lambda: 1.0)

    for (label, fname), probdist in self._feature_probdist.items():
        for fval in probdist.samples():
            feature = (fname, fval)
            features.add(feature)
            p = probdist.prob(fval)
            maxprob[feature] = max(p, maxprob[feature])
            minprob[feature] = min(p, minprob[feature])
            if minprob[feature] == 0:
                features.discard(feature)

    # Convert features to a list, & sort it by how informative
    # features are.
    features = sorted(features,
                      key=lambda feature: minprob[feature] / maxprob[feature])
    return features[:n]
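# Usage sketch (toy data, not from the original source): NLTK's own
# NaiveBayesClassifier exposes this method, so the max/min probability-ratio
# ranking above can be inspected on any trained classifier.
import nltk

train_set = [
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'k'}, 'male'),
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'o'}, 'male'),
]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.most_informative_features(3))  # top (fname, fval) pairs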
def generate_correct_number_pool(n, corpus, homo, pool, eps=1e-6):
    """Generate number of words to match length and p(word) from ngram model
    using a pool of pre-generated words"""
    x = NgramModel(n, corpus, 1, homo)
    poolxLengths = nltk.defaultdict(list)
    poolxP = nltk.defaultdict(float)
    for item in pool:
        item = item.strip().split(",")
        poolxLengths[int(item[0])].append(item[1])
        poolxP[item[1]] = float(item[2])
    same_length = nltk.defaultdict(int)
    for i in range(20):
        same_length[i] = dict([(k, poolxP[k]) for k in poolxLengths[i] if k in poolxP])
    newwords = []
    exist = 0
    hom_count = 0
    tot_p = 0
    for i, w in enumerate(corpus):
        p_match = x.evaluate(w)
        sample = get_range(same_length[len(w)], p_match - eps, p_match + eps)
        while len(sample) == 0:
            eps = eps * 2
            sample = get_range(same_length[len(w)], p_match - eps, p_match + eps)
        eps = 1e-6
        nw = random.choice(sample.keys())
        while nw in newwords and homo != 1:
            nw = random.choice(sample.keys())
        if nw in corpus:
            exist += 1
        if nw in newwords:
            hom_count += 1
        del same_length[len(w)][nw]
        tot_p += poolxP[nw] + log(1 / float(len(sample)))
        newwords += [nw]
    return newwords  # assumed: the generated word list is returned (missing in the source)
def extractNE(tok, pnouns, dic):
    names = LinkedList.LinkedList()
    nameprofs = defaultdict(lambda: defaultdict(lambda: 0))  # bag of words model
    tent = []
    prevword = ""
    # usually names are not preceded by an article - filters out some other
    # named entities and some other cases
    for i in range(len(tok)):
        if STok.isuc(tok[i][0]) and (tok[i].lower() in pnouns) and (prevword not in articles):
            tent.append(tok[i])
        else:
            if len(tent) > 0 and type(tent):  # will add if the named entity
                match = matchNE(names, tent)  # matches to most recent matching occurrence
                for j in range(0, len(weights)):
                    try:
                        word = tok[i + j].lower()
                        if (word not in pnouns) and len(word) > 3 and ("," not in word) and (word in dic):
                            nameprofs[match][word] += weights[j]
                    except:
                        break
                for j in range(0, len(weights)):
                    try:
                        word = tok[i - len(tent) - j - 1].lower()
                        if (word not in pnouns) and len(word) > 3 and ("," not in word) and (word in dic):
                            nameprofs[match][word] += weights[j]
                    except:
                        break
            tent = []
        prevword = tok[i]
    return [names, nameprofs]
def make_mentions(self, words):
    ret = []
    self.get_names()  # get names any time mentions are made
    starts = defaultdict(list)
    ends = defaultdict(list)
    for w in words:
        start_ids = [int(id.replace(')', '').replace('(', ''))
                     for id in w.mention.split('|') if id.startswith('(')]
        for id in start_ids:
            starts[id].append(w)
        end_ids = [int(id.replace(')', '').replace('(', ''))
                   for id in w.mention.split('|') if id.endswith(')')]
        for id in end_ids:
            ends[id].append(w)
    for id in starts:
        while starts[id]:
            s = starts[id].pop()
            e = ends[id].pop()
            ret.append(Mention(words[s.index:e.index + 1], s.index, e.index,
                               self.index, self.tree, id))
    ret.reverse()  # reverse so in sentence order
    return ret  # Should these mentions be in breadth first tree search order?
def format_grammar_old(self, f):
    # print ">>> Loading cfg counts ..."
    f_out = open("grammars/grammar_formated.txt", 'w')
    gram = nltk.defaultdict(lambda: nltk.defaultdict(int))
    tot = nltk.defaultdict(int)
    for line in f:
        freq, rule = line.split("\t")
        A, B = rule.split(" --> ")
        B = B.rstrip('\n')
        self.nonterminal_counts[A] += float(freq)
        # print A, B, len(B), freq
        if " " in B:  # BINARY RULE
            B1, B2 = B.split()
            self.binary_rule_counts[(A, B1, B2)] = float(freq)
            gram[A][B1 + "+" + B2] = float(freq)
            tot[A] += float(freq)
        else:
            gram[A][B] = float(freq)
            tot[A] += float(freq)
            if len(B) == 1:  # UNARY RULE
                self.unary_rule_counts[(A, B)] = float(freq)
    for i in gram.keys():
        f_out.write(i + "\t")
        for j in sorted(gram[i].keys()):
            if j == sorted(gram[i].keys())[-1]:
                f_out.write(j + ":" + str(float(gram[i][j]) / tot[i]))
            else:
                f_out.write(j + ":" + str(float(gram[i][j]) / tot[i]) + " | ")
        f_out.write("\n")
def format_grammar(self, f, smoothing):
    tot = nltk.defaultdict(int)
    gram = nltk.defaultdict(lambda: nltk.defaultdict(int))
    lines = f.readlines()[1:-1]
    for line in lines:
        p, rule = line.split("\t")
        A, B = rule.split(" --> ")
        B = B.rstrip('\n')
        gram[A][B] = float(p)
        tot[A] += float(p)
    for i in gram.keys():
        for j in gram[i].keys():
            p_new = (gram[i][j] + smoothing) / float(tot[i] + smoothing * len(gram[i].keys()))
            self.prod[i].append((j, p_new))
            self.nonterminal_counts[i] += float(p_new)
            if " " in j:  # BINARY RULE
                B1, B2 = j.split()
                self.binary_rule_counts[(i, B1, B2)] = float(p_new)
            else:
                if len(j) == 1:  # UNARY RULE
                    self.unary_rule_counts[(i, j)] = float(p_new)
def perm(lexicon, lang):
    iter = 1000
    lex = [i.strip().split("\t") for i in open(lexicon, "r").readlines()[1:]]
    # select only words of length 3 to 10
    selected_lex = []
    for c in lex:
        if int(c[15]) >= 3 and int(c[15]) <= 10:
            selected_lex += [c]
    print len(selected_lex)
    o = "celex_perm/sim_perm_" + lang + ".txt"
    outfile = open(o, "w")
    outfile.write(",".join(["lex", "word", "count", "prob", "mps", "length", "lang"]) + "\n")
    freqs = nltk.defaultdict(list)
    for c in selected_lex:
        outfile.write(",".join(str(x) for x in [-1, c[0], c[1], c[2], c[3], c[15], lang]) + "\n")
        freqs[len(c[0])] += [c[1]]
    for i in range(iter):
        temp_freqs = nltk.defaultdict(list)
        for k in freqs.keys():
            temp_freqs[k] = list(freqs[k])
            random.shuffle(temp_freqs[k])
        for c in selected_lex:
            freq_w = temp_freqs[len(c[0])][-1]
            del temp_freqs[len(c[0])][-1]
            outfile.write(",".join(str(x) for x in [i, c[0], freq_w, c[2], c[3], c[15], lang]) + "\n")
        print "generated lexicon: ", str(i)
    outfile.close()
def generate_correct_number(corpus, homo, lm):
    """Generate number of words to match length, handle homophones being generated"""
    lengths = nltk.defaultdict(lambda: nltk.defaultdict(int))
    lengths_needed = nltk.defaultdict(int)
    for item in corpus:
        lengths[get_cv(item)][len(item.split("-"))] += 1
        lengths_needed[get_cv(item)] += 1
    newwords = []
    newwords2 = []
    exist = nltk.defaultdict(int)
    while True:
        print(lengths_needed)
        words = lm.generate()
        for w in words:
            if lengths_needed[get_cv(w)] > 0:
                if homo == 1 or (w not in newwords and re.sub("-", "", w) not in newwords2):
                    lengths_needed[get_cv(w)] += -1
                    newwords += [w]
                    newwords2 += [re.sub("-", "", w)]
                    if w in corpus:
                        exist[len(w)] += 1
            elif sum([lengths_needed[j] for j in lengths_needed.keys()]) == 0:
                print "nb of real words", sum(exist.values())
                return newwords
def analysis_using_word_and_prev_pos():
    from nltk.corpus import brown
    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    print pos[("DET", "right")]
def __init__(self):
    self.set_attributes = []
    self.labels = []
    self.count_wl = dict()
    self.count_l = defaultdict(lambda: 0)
    self.probability_wl = dict()
    self.probability_l = defaultdict(lambda: 0)
def multiple_label_intersection_score(y_pred, y_gold):
    """
    This function computes an intersection-based score.

    Example 1:
    y_pred=[{1,3},{2,8},{3,7,9}]
    y_gold=[{4,5,6},{2,4,8},{1,3,7}]
    At position 0 the intersection is empty, so the intersection size is 0; the size
    of pred is 2 and the size of gold is 3, so the score is (0/2+0/3)/2=0.
    Similarly, position 1 scores (2/2+2/3)/2=5/6 and position 2 scores (2/3+2/3)/2=2/3.

    Example 2: it can also compute the score the same way but for each label
    y_pred=[{1,6},{2,4},{3,9},{2,8},{3,4}]
    y_gold=[{4,5},{2,4},{1,3,7},{2,6},{4,5,10}]
    Label "1" appears once in y_pred and never in y_gold; since a wrong appearance
    is penalised, its score is -1. Similarly, label "2" appears twice in y_pred and
    is in y_gold both times, so its score is (1+1)/2=1. Likewise "3" scores
    (1-1)/2=0, "4" scores (1+1)/2=1, "6" scores (-1-1)/2=-1, "8" scores -1/2=-0.5
    and "9" scores -1/2=-0.5.

    :param y_pred: a list of sets of predicted labels, must be same length as y_gold
    :param y_gold: a list of sets of labels, must be same length as y_pred
    :return: total_score: float of the total score calculated as in example 1
             label_wise_accuracy: a dictionary whose keys are labels and whose values
             are float scores of the label calculated as in example 2
    """
    assert len(y_pred) == len(y_gold), 'y_pred and y_gold need to have same length'
    label_wise_score = nltk.defaultdict(lambda: nltk.defaultdict(int))
    All_score = []
    for index, pred in enumerate(y_pred):
        pred = set(pred)
        gold = set(y_gold[index])
        Intersection = pred.intersection(gold)
        forward_score = len(Intersection) / len(pred)
        backward_score = len(Intersection) / len(gold)
        score = (backward_score + forward_score) / 2
        All_score.append(score)
        all = pred.union(gold)
        for label in all:
            if label in Intersection:
                label_wise_score[label]['total'] += 1
                label_wise_score[label]['correct'] += 1
            else:
                label_wise_score[label]['total'] += 1
                label_wise_score[label]['correct'] -= 1
    label_wise_accuracy = dict()
    for label in label_wise_score.keys():
        try:
            rate = label_wise_score[label]['correct'] / label_wise_score[label]['total']
        except:
            rate = 0
        label_wise_accuracy[label] = rate
    total_score = sum(All_score) / len(All_score)
    return total_score, label_wise_accuracy
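# Usage sketch (hypothetical values mirroring Example 1 in the docstring above):
# the per-position scores are 0, 5/6 and 2/3, so the total is their mean, 0.5.
total, per_label = multiple_label_intersection_score(
    y_pred=[{1, 3}, {2, 8}, {3, 7, 9}],
    y_gold=[{4, 5, 6}, {2, 4, 8}, {1, 3, 7}])
print(total)      # 0.5
print(per_label)  # e.g. per_label[2] == 1.0 and per_label[1] == -1.0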
def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifier.train(...) method to allow
    inspection of what the method is actually doing and how long it's taking.

    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print 'There are ' + str(len(labeled_featuresets)) + ' labeled featuresets'

    # Count up how many times each feature value occurred, given
    # the label and feature name.
    print 'Counting feature value occurrence'
    i = 0
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)
        print 'At featureset...' + str(i)
        i += 1

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution
    print 'Making the P(label) distribution...'
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    print 'Making the P(fval|label, fname) distribution from ' \
        + str(len(feature_freqdist.items())) + ' feature freqs...'
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
def word_valency():
    table = nltk.defaultdict(lambda: nltk.defaultdict(set))
    entries = nltk.corpus.ppattach.attachments("training")
    for entry in entries:
        # print entry
        key = entry.noun1 + "-" + entry.prep + "-" + entry.noun2
        table[key][entry.attachment].add(entry.verb)
    for key in sorted(table):
        if len(table[key]) > 1:
            print key, "N:", sorted(table[key]["N"]), "V:", sorted(table[key]["V"])
def __init__(self, grammar, ngram=None):
    self.__dict__.update(locals())
    self.n = 0
    self.ngram = ngram
    self.grammar_file = grammar
    self.nonterminal_counts = nltk.defaultdict(int)
    self.binary_rule_counts = nltk.defaultdict(int)
    self.unary_rule_counts = nltk.defaultdict(int)
    self.prod = nltk.defaultdict(list)
    self.reject = 0
    LM.__init__(self)
def build_real_lex(path, lemma, language, mono, homo, minlength, maxlength, freq, celex_list):
    celex_path = get_celex_path(path, lemma, language)
    print celex_path
    corpus = build_celex_corpus(celex_path, language, lemma, model, mono)
    print "number of monomorphemes:", len(corpus)
    corpus = [c for c in corpus if float(c[1]) > 0]
    print "number of words in lex after selecting words frequency > 0:", len(corpus)
    corpus = [(clean_word(c[0]), c[1], c[2]) for c in corpus]  # reduce celex to just pronunciation
    corpus = [(celex_diphthong_sub(c[0]), c[1], c[2]) for c in corpus
              if "c" not in c[0] and "q" not in c[0] and "0" not in c[0] and "~" not in c[0]
              and "^" not in c[0] and "*" not in c[0] and "<" not in c[0]
              and ((language == 'english') | ("_" not in c[0]))]
    corpus = [c for c in corpus if (len(re.sub("-", "", c[0])) >= minlength
                                    and len(re.sub("-", "", c[0])) <= maxlength)]
    print "number of words in lex after cleaning pronunciation:", len(set(corpus))
    ortho2ph = nltk.defaultdict(str)
    ortho = nltk.defaultdict(list)
    count = nltk.defaultdict(int)
    for c in corpus:
        ortho2ph[c[2]] = c[0]
        ortho[c[0]].append(c[2])
        count[c[0]] += 1
    dict_corpus = nltk.defaultdict(int)
    # print "nb of distinct orthographic forms", len(ortho2ph.keys())
    # print "nb of distinct phonemic forms", len(ortho.keys())
    # print "nb of words that share their phonological form with at least one other word", sum([i for i in count.values() if i > 1])
    n = 0
    for c in ortho.keys():
        if len(set(ortho[c])) > 1:
            n += len(set(ortho[c]))
    for c in corpus:
        if not c[0] in dict_corpus:
            dict_corpus[c[0]] = float(c[1])
        else:
            dict_corpus[c[0]] += float(c[1])
    if homo == 0:
        corpus = [(x, y) for x, y in dict_corpus.iteritems()]
    else:
        corpus = [(y, x) for x, y in ortho2ph.iteritems()]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    f = open("celexes/" + "_".join([str(i) for i in celex_list]) + ".txt", "w")
    if freq == 0:
        corpus = [c[0] for c in corpus]
        if args.pcfg == 0:
            for line in corpus:
                f.write(line + "\n")
        else:
            for line in corpus:
                l = list(line)
                for k in l:
                    f.write(k + " ")
                f.write("\n")
    else:
        corpus = [(''.join(ortho[c[0]][0]), c[1]) for c in corpus]
        corpus = sorted(corpus, key=lambda x: abs(float(x[1])))
        for c in corpus:
            f.write(c[0] + "\n")
    f.close()
    return corpus
def probability(c_wl, c_l, alpha=0.1):
    prob_wl = defaultdict(lambda: 0)
    prob_l = defaultdict(lambda: 0)
    for wl in c_wl.keys():
        l = wl.split("_")[1]
        prob_wl[wl] = float(c_wl[wl]) / (c_l[l] + alpha * len(c_wl))
    for l in c_l.keys():
        prob_l[l] = float(c_l[l]) / np.sum(list(c_l.values()))
    return prob_wl, prob_l
def find_key_words_by_w2v_with_cc_titles(projectccdf,
                                         pretrained_map_dict,
                                         remain=.9,
                                         projectID_column='ProjectID',
                                         StatmentOfWork_column='StatementOfWork',
                                         cc_title_column='Title'):
    """
    This function tries to find keywords in the SOW that have a similar meaning
    to their cc titles, using word2vec.
    :param projectccdf: target dataframe; needs a projectID column, a StatementOfWork
        column and a cc_title_column
    :param pretrained_map_dict: a dictionary whose keys are strings (words) and whose
        values are arrays of word vectors
    :param remain: a float, the percentage of words to keep for each SOW
    :param projectID_column: string, the name of the projectID column
    :param StatmentOfWork_column: string, the name of the StatementOfWork column
    :param cc_title_column: string, the name of the cc_title column
    :return: a filtered dataframe, and a dict of filtered words for each projectID
    """
    df = projectccdf.copy()
    OBJ = CreateDIYdictFromDataFrame(projectccdf)
    PID2TITLEDICT = OBJ.DIY_dict([projectID_column, cc_title_column], convert_to=set)
    PID2SOWDICT = OBJ.DIY_dict([projectID_column, StatmentOfWork_column], convert_to=set)
    removed_dict = nltk.defaultdict(lambda: nltk.defaultdict(dict))
    pretrained = pretrained_map_dict
    dim = len(pretrained[list(pretrained.keys())[0]])
    PID_filter_sow_dict = dict()
    for PID, title_set in tqdm(PID2TITLEDICT.items()):
        sow1 = list(PID2SOWDICT[PID])[0].split(' ')
        sow = [w for w in sow1 if w in pretrained]
        removed_not_in_pretrained = [w for w in sow1 if w not in pretrained]
        title = ' '.join(title_set).split(' ')
        title = [w for w in title if w in pretrained]
        word_dist_dict = nltk.defaultdict(lambda: 0)
        for sow_word in sow:
            sow_word_vec = pretrained[sow_word].reshape(1, dim)
            for title_word in title:
                title_word_vec = pretrained[title_word].reshape(1, dim)
                dist = cosine_distances(sow_word_vec, title_word_vec)[0][0]
                word_dist_dict[sow_word] += dist
        ranked_sow_word = [k for k, v in sorted(word_dist_dict.items(), key=lambda item: item[1])]
        ranked_sow_word_set = set(ranked_sow_word[:int(len(ranked_sow_word) * remain)])
        removed_words_by_rank = set(ranked_sow_word).difference(ranked_sow_word_set)
        sow_string = ' '.join([w for w in sow if w in ranked_sow_word_set])
        PID_filter_sow_dict[PID] = sow_string
        removed_dict[PID]['removed_not_in_pretrained'] = set(removed_not_in_pretrained)
        removed_dict[PID]['removed_words_by_dist'] = removed_words_by_rank
    for pid, text in tqdm(PID_filter_sow_dict.items()):
        df.loc[df[projectID_column] == pid, StatmentOfWork_column] = text
    return df, removed_dict
def main():
    # accumulate words and word frequency distributions
    lines = []
    unigramFD = nltk.FreqDist()
    st = LancasterStemmer()
    fin = open("tripadvisor_palazzo_reviews.txt", 'rb')
    for line in fin:
        line = nltk.clean_html(line)
        words = nltk.word_tokenize(line.strip().lower())
        words = [w for w in words if not w in stopwords.words('english')]
        words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        words = [st.stem(w) for w in words]
        words = filter(lambda x: isValid(x), words)
        [unigramFD.inc(x) for x in words]
        lines.append(words)
    fin.close()
    # identify likely phrases using a multi-pass algorithm based
    # on the LLR approach described in the Building Search Applications
    # Lucene, LingPipe and GATE book, except that we treat n-gram
    # collocations beyond 2 as n-1 gram plus a unigram.
    phrases = nltk.defaultdict(float)
    prevGramFD = None
    for i in range(2, 5):
        ngramFD = nltk.FreqDist()
        for words in lines:
            nextGrams = nltk.ngrams(words, i)
            nextGrams = filter(lambda x: isLikelyNGram(x, phrases), nextGrams)
            [ngramFD.inc(x) for x in nextGrams]
        for k, v in ngramFD.iteritems():
            if v > 1:
                c1 = unigramFD[k[0]] if prevGramFD == None else prevGramFD[k[:-1]]
                c2 = unigramFD[k[1]] if prevGramFD == None else unigramFD[k[len(k) - 1]]
                c12 = ngramFD[k]
                n = unigramFD.N() if prevGramFD == None else prevGramFD.N()
                phrases[k] = llr(c1, c2, c12, n)
        # only consider bigrams where LLR > 0, ie P(H1) > P(H0)
        likelyPhrases = nltk.defaultdict(float)
        likelyPhrases.update([(k, v) for (k, v) in phrases.iteritems()
                              if len(k) == i and v > 0])
        print "==== #-grams = %d ====" % (i)
        sortedPhrases = sorted(likelyPhrases.items(), key=operator.itemgetter(1), reverse=True)
        for k, v in sortedPhrases:
            print k, v
        prevGramFD = ngramFD
def single_label_included_score(y_pred, y_gold):
    """
    This function computes an inclusion-based score.

    Example 1:
    y_pred=[1,2,3]
    y_gold=[{4,5},{2,4},{1,3,7}]
    At position 0, label "1" is not in {4,5}, but at positions 1 and 2 labels "2" and
    "3" are in {2,4} and {1,3,7} respectively, so the overall score is 2/3.

    Example 2: it can also compute the score the same way but for each label
    y_pred=[1,2,3,2,3]
    y_gold=[{4,5},{2,4},{1,3,7},{2,6},{4,5}]
    Label "1" appears once in y_pred and never in y_gold, so its accuracy is 0.
    Similarly, label "2" appears twice in y_pred and is in y_gold both times, so its
    accuracy is 1. Likewise, "3" scores 1/2.

    :param y_pred: a list of predicted labels, must be same length as y_gold
    :param y_gold: a list of sets of labels, must be same length as y_pred
    :return: total_score: float of the total score calculated as in example 1
             label_wise_accuracy: a dictionary whose keys are labels and whose values
             are float scores of the label calculated as in example 2
    """
    assert len(y_pred) == len(y_gold), 'y_pred and y_gold need to have same length'
    count = 0
    label_wise_score = nltk.defaultdict(lambda: nltk.defaultdict(int))
    for index, pred in enumerate(y_pred):
        gold = set(y_gold[index])
        if pred in gold:
            count += 1
            label_wise_score[pred]['total'] += 1
            label_wise_score[pred]['correct'] += 1
        else:
            label_wise_score[pred]['total'] += 1
    label_wise_accuracy = dict()
    for label in label_wise_score.keys():
        try:
            rate = label_wise_score[label]['correct'] / label_wise_score[label]['total']
        except:
            rate = 0
        label_wise_accuracy[label] = rate
    total_score = count / len(y_gold)
    return total_score, label_wise_accuracy
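# Usage sketch (hypothetical values mirroring Example 1 in the docstring above):
# two of the three predictions fall inside their gold sets.
total, per_label = single_label_included_score(
    y_pred=[1, 2, 3],
    y_gold=[{4, 5}, {2, 4}, {1, 3, 7}])
print(total)      # 0.666...
print(per_label)  # {1: 0.0, 2: 1.0, 3: 1.0}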
def _getInputDocs(self, Stops):
    '''
    Takes an input of docs provided by the user and performs the necessary
    calcs in order to make use of them in Kernel operations.
    '''
    stops = self._getStopwords(Stops, lang=lang)
    main_dict = nltk.defaultdict(list)
    weight_dict = nltk.defaultdict(int)
    # need to include various options to allow for feeding in file names
    # or actual documents; if they are providing sets of documents per-
    # taining to particular categories, and they want these averaged to-
    # gether for comparison purposes; etc.
    self._vecs = main_dict
def __init__(self, words_seq, unigram_freqs, bigram_freqs, d):
    self._unigrams = unigram_freqs
    self._bigrams = bigram_freqs
    self._cache = {}
    self._after = nltk.defaultdict(float)
    self._before = nltk.defaultdict(float)
    self._col_sums = nltk.defaultdict(float)
    self._length = 0  # len(words_seq)
    self._d = d
    for word_prev, word in self._bigrams:
        self._after[word_prev] += 1  # equiv. to adjacency matrix row sums
        self._length += 1            # equiv. to adjacency matrix NNZ
        self._before[word] += 1      # equiv. to adjacency matrix col sums
def _getBrownDocs(self, Stops):
    '''
    Create reference distributions from the Brown corpus, separated by
    categories. Non-tagged data (i.e., tags are removed). We assume that
    each category of docs is actually a single document of that category.

    This instance is mainly for test cases, and likely of little use to a
    user, except for basic comparisons. Also, this really only needs to be
    done once (unless stopwords are changed), since we can just pickle it
    and re-use it later.
    '''
    stops = self._getStopwords(Stops, lang=lang)
    main_dict = nltk.defaultdict(list)
    weight_dict = nltk.defaultdict(int)
    # first, load the files from brown topics (minus the two small ones)
    for category in set(brown.categories()).\
            difference(set(['humor', 'science_fiction'])):
        cat_files = brown.fileids(categories=category)
        key_list = []  # misleading; list of words encountered
        temp_weight_dict = nltk.defaultdict(int)
        for f in cat_files:
            temp = brown.open(f).read().split()
            # brown files are tagged, so get rid of that info, for now
            temp = [entry.split('/')[0] for entry in temp]
            temp = [entry for entry in temp if entry not in stops]
            main_dict[category].append(self._FDtoDIC(nltk.FreqDist(temp)))
            # update the weight dict for this category
            temp_weight_dict['__NUM__'] += 1
            for entry in main_dict[category][-1].keys():
                temp_weight_dict[entry] += 1
            key_list.extend(main_dict[category][-1].keys())
        key_list = set(key_list)
        cat_avg_dict = {}
        for word in key_list:
            score = 0.0
            for fdd in main_dict[category]:
                score += float(fdd[word]) / fdd['N']
            # fixed: the original divided by len(main_dict[category].keys()),
            # but main_dict[category] is a list
            cat_avg_dict[word] = float(score) / len(main_dict[category])
        main_dict[category].append(cat_avg_dict)
        # get weights for current category
        self._R_[category] = self._calcWeights(temp_weight_dict)
        # update the main weight dict for all docs
        for key in temp_weight_dict.keys():
            weight_dict[key] += temp_weight_dict[key]
    self._R_['__ALL__'] = self._calcWeights(weight_dict)
    ## need to add this... main_dict['__ALL__'] = ...
    self._vecs = main_dict
def ch05_34_num_words_with_1to10_distinct_tags():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    # number of distinct tags and number of words in corpus for this
    dd = nltk.defaultdict(set)
    for w, t in tagged_words:
        dd[w].add(t)
    for i in range(1, 10):
        print i, len(filter(lambda x: len(dd[x]) == i, dd.keys()))
    # for the word with greatest number of tags, print out concordance
    # one for each tag
    maxtags = 6
    word = None
    tags = None
    for w in dd.keys():
        if len(dd[w]) >= maxtags:
            word = w
            tags = dd[w]
            break
    poss = []
    pos = 0
    for w, t in tagged_words:
        if w == word and t in tags:
            poss.append((t, pos))
            tags.remove(t)
        pos += 1
    for t, pos in poss:
        print t, " ".join(w for w, t in tagged_words[pos - 10:pos + 10])
def getSimWordNet(word, text=None, num_word=40, num=5):
    """
    This function uses the nltk.similar(word) function to create a dictionary
    with keys being the top "num_word" words in nltk.similar("word"). Entries
    are a list of tuples consisting of the top "num" words for
    nltk.similar(word) for each key and the corresponding inverse ranking
    (1/1, 1/2, ..., 1/num) of the word in the nltk.similar(key) list.
    Instances of "word" in any of these lists are ignored.
    """
    text = text if text else nltk.Text(brown.words())
    text = textprocesses.TextProcess(text)
    num += 1  # accounts for pos of 'word' in sim
    simWords = text.getsimilar(word, num_word)
    simWords = removePunc(simWords)  # need to make this mod
    wordNetDict = nltk.defaultdict(list)
    for w in simWords:
        wSim = text.getsimilar(w, num)
        # remove word from sim. list if present
        try:
            wSim.remove(word)
        except:
            pass
        # create entry for w using the first num words in wSim
        for s in wSim[:num - 1]:
            wordNetDict[w].append((s, 1.0 / num))
    return wordNetDict
def nMostFreq(N, words):
    wCounts = nltk.defaultdict(int)
    nCounts = nltk.defaultdict(int)
    for word in words:
        wCounts[word.lower()] += 1
    for key in wCounts.keys():
        nCounts[wCounts[key]] += 1
    tot = 0
    numStop = []
    while tot < N:
        numStop.append(max(nCounts.keys()))
        tot += nCounts.pop(max(nCounts.keys()))
    revWCounts = getReverseDict(wCounts)
    wordsN = []
    for num in numStop:
        wordsN.extend(revWCounts[num])  # fixed: the original referenced an undefined 'revWCount'
    return wordsN  # assumed: the collected words are returned (no return in the source)
def analyze_simple(trie, sent, connect_func=lambda x, y: True):
    """
    Build nodes from the trie structure according to whether morphemes can be
    connected, and return every possible combination of connectable morphemes
    from the created nodes.
    """
    bos_node = {'next': [], 'entry': _BOS_ENTRY}  # ... (1)
    end_node_list = nltk.defaultdict(list)        # ... (2)
    end_node_list[0].append(bos_node)
    for i in range(0, len(sent) + 1):             # ... (6)
        if i < len(sent):
            cps_results = common_prefix_search(trie, sent[i:].encode('utf-8'))
        else:  # EOS
            cps_results = [_EOS_ENTRY]
        for centry in cps_results:
            cnode = {'next': [], 'entry': centry}
            for bnode in end_node_list[i]:
                if connect_func(bnode, cnode):    # ... (3)
                    bnode['next'].append(cnode)   # ... (5)
                    end_nodes = end_node_list[i + centry['length']]
                    if not cnode in end_nodes:
                        end_nodes.append(cnode)   # ... (4)
    print('-' * 72)
    pprint(bos_node)
    print('-' * 72)
    return enum_solutions(bos_node)               # ... (7)
def load(path_to_dict):
    # Although pystardict.Dictionary is a child class of dict, it doesn't
    # implement quite a few important basic methods such as keys(), iterkeys()
    # and so on, so we cannot just simply iterate through it.
    raw_dict = Dictionary(path_to_dict)
    new_dict = defaultdict(tuple)
    size = float(len(raw_dict))
    count = 0
    # This is a workaround to iterate through the keys.
    # NB Since the idx stores the offset-size pairs, its keys must be sorted in
    # order to read the dictionary data linearly and gain the best performance.
    for tuple_key in sorted(raw_dict.idx._idx):
        key = ''.join(tuple_key)
        value = raw_dict[key]
        # Convert value to a set of French words
        value = re.sub(r'\d\. ?', '', value)
        value = re.split(r', | \n ', value)
        new_dict[key] = value
        # Show a nice progress report
        count += 1
        print 'Loading dictionary... %5.2f%%\r' % ((count / size) * 100),
        sys.stdout.flush()  # this must be flushed to see the latest result
    print
    return new_dict
def invert_dict(d):
    from nltk import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        for term in d[key]:
            inverted_dict[term].append(key)
    return inverted_dict
def parseLexicon(lex_str):
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(line).groups()[0].strip()
        if line == "":
            continue
        if line.startswith(':-'):
            # A line of primitive categories.
            # The first line is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [prim.strip() for prim in line[2:].strip().split(',')]
        else:
            # Either a family definition, or a word definition
            (ident, sep, catstr) = reLex.match(line).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)
            if sep == '::':
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(cat)
    return CCGLexicon(primitives[0], primitives, families, entries)
def _w_b(word, overview):
    pos_forms = defaultdict(list)
    words = word.split(',')
    words = [w.strip() for w in words]
    for pos_str in ['noun', 'verb', 'adj', 'adv']:
        for w in words:
            '''
            if overview:
                pos_forms[pos_str].append(w)
            else:
                for form in _morphy(w, pos=pos_str):
                    if form not in pos_forms[pos_str]:
                        pos_forms[pos_str].append(form)
            '''
            for form in _morphy(w, pos=pos_str):
                if form not in pos_forms[pos_str]:
                    pos_forms[pos_str].append(form)
    body = ''
    for pos, pos_str, name in \
            ((N, 'noun', 'Noun'), (V, 'verb', 'Verb'),
             (ADJ, 'adj', 'Adjective'), (ADV, 'adv', 'Adverb')):
        if pos_str in pos_forms:
            if not overview:
                body += _hlev(3, name) + '\n'
            for w in pos_forms[pos_str]:
                # Not all words of exc files are in the database, so:
                try:
                    body += _collect_all(w, pos)
                except KeyError:
                    pass
    if not body:
        word = None
    return word, body
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary
    which stores information about the subject and object NEs plus the filler
    between them. Additionally, a left and right context of length =< window
    are captured (within a given input sentence).

    @param pairs: a pair of list(str) and L{Tree}, as generated by
    @param window: a threshold for the number of items to include in the left
        and right context
    @type window: C{int}
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass',
        'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and
        'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    result = []
    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = _join(pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].node
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
        reldict['objclass'] = pairs[1][1].node
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            print "(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass'])
        result.append(reldict)
        pairs = pairs[1:]
    return result
def extract_real_lex(path, lemma, language, mono, hom, minlength, maxlength,
                     minsyll, maxsyll, match, celex_list):
    celex_path = get_celex_path(path, lemma, language)
    lengths = nltk.defaultdict(int)
    print celex_path
    corpus = build_celex_corpus(celex_path, language, lemma, mono)
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [c for c in corpus if c[1] > 0]  # freq greater than 0
    corpus = [clean_word(c[0]) for c in corpus]  # reduce celex to just pronunciation
    corpus = [celex_diphthong_sub(c) for c in corpus
              if "c" not in c and "q" not in c and "0" not in c and "~" not in c]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [i for i in corpus if (len(i.split("-")) > minsyll and len(i.split("-")) < maxsyll)]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [i for i in corpus
              if (len(re.sub("-", "", i)) > minlength and len(re.sub("-", "", i)) < maxlength)]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    if match == "length":
        corpus = [clean_syll(c) for c in corpus]  # reduce celex to just pronunciation
        print ">>>TOTAL NB OF WORDS", len(corpus)
    if hom == 0:
        corpus = list(set(corpus))
        print ">>>TOTAL NB OF WORDS", len(corpus)
    f = open("kyle_celexes/" + "_".join([str(i) for i in celex_list]) + ".txt", "w")
    for line in corpus:
        lengths[len(re.sub("-", "", line))] += 1
        f.write(line + "\n")
    f.close()
    print ">>> Word-Length frequencies of lexicon to match"
    for k in lengths.keys():
        print k, lengths[k]
    return corpus
def word_count():
    from nltk.corpus import brown
    counts = nltk.defaultdict(int)
    for (word, tag) in brown.tagged_words(categories="news"):
        counts[tag] += 1
    from operator import itemgetter
    print sorted(counts.items(), key=itemgetter(1), reverse=True)
def avg_tf_idf(question, supporting_facts, rest):
    tf_idf_scores = tf_idf(question, supporting_facts, rest)
    sf_scores = defaultdict(int)
    rest_scores = defaultdict(int)
    for w, scores in tf_idf_scores.items():
        for i, score in enumerate(scores[0]):
            sf_scores[i] += score
        for i, score in enumerate(scores[1]):
            rest_scores[i] += score
    num_query_terms = len(tf_idf_scores.keys())
    for i, s in sf_scores.items():
        sf_scores[i] = s / num_query_terms
    for i, s in rest_scores.items():
        rest_scores[i] = s / num_query_terms
    return sf_scores, rest_scores
def find_all_names(stoplist):
    ROOT = ['nltk']
    logger._verbosity = 0
    docindex = epydoc.docbuilder.build_doc_index(ROOT, add_submodules=True)
    valdocs = sorted(docindex.reachable_valdocs(
        imports=False,
        # packages=False,
        bases=False,
        submodules=False,
        # subclasses=False,
        private=False))
    logger._verbosity = 5
    names = nltk.defaultdict(list)
    n = 0
    for valdoc in valdocs:
        name = valdoc.canonical_name
        if (name is not epydoc.apidoc.UNKNOWN and
                name is not None and name[0] == 'nltk'):
            n += 1
            for i in range(len(name)):
                key = str(name[i:])
                if len(key) == 1:
                    continue
                if key in stoplist:
                    continue
                names[key].append(valdoc)
    log.info('Found %s names from %s objects' % (len(names), n))
    return names
def normalize(text):
    remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
    # word tokenization
    word_token = nltk.word_tokenize(text.lower().translate(remove_punct_dict))
    # remove non-ascii characters
    new_words = []
    for word in word_token:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    # Remove tags
    rmv = []
    for w in new_words:
        text = re.sub("</?.*?>", "<>", w)
        rmv.append(text)
    # pos tagging and lemmatization
    tag_map = nltk.defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    lemmatizer = nltk.WordNetLemmatizer()
    lemma_list = []
    rmv = [i for i in rmv if i]
    for token, tag in nltk.pos_tag(rmv):
        lemma = lemmatizer.lemmatize(token, tag_map[tag[0]])
        lemma_list.append(lemma)
    return lemma_list
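# Usage sketch (assumes the punkt, averaged_perceptron_tagger and wordnet NLTK
# data packages are installed); the exact output depends on the POS tagger.
print(normalize("The cats are running quickly!"))
# roughly ['the', 'cat', 'be', 'run', 'quickly']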
def get_similar_groups(word_list, minimum):
    tri_list = get_all_pairs_similarity(word_list)
    tri_filtered = filter_pairs_similarity(tri_list, minimum)
    neighbor = nltk.defaultdict(set)
    for tri in tri_filtered:
        neighbor[tri[0]].add(tri[1])
        neighbor[tri[1]].add(tri[0])

    def bors_kerbosch_v1(R, P, X, G, C):  # CODE FROM ONLINE RESOURCE
        if len(P) == 0 and len(X) == 0:
            if len(R) > 2:
                C.append(sorted(R))
            return
        for v in P.union(set([])):
            bors_kerbosch_v1(R.union(set([v])), P.intersection(G[v]), X.intersection(G[v]), G, C)
            P.remove(v)
            X.add(v)

    def bors_kerbosch_v2(R, P, X, G, C):  # CODE FROM ONLINE RESOURCE
        if len(P) == 0 and len(X) == 0:
            if len(R) > 2:
                C.append(sorted(R))
            return
        (d, pivot) = max([(len(G[v]), v) for v in P.union(X)])
        for v in P.difference(G[pivot]):
            bors_kerbosch_v2(R.union(set([v])), P.intersection(G[v]), X.intersection(G[v]), G, C)
            P.remove(v)
            X.add(v)

    C = []
    bors_kerbosch_v2(set([]), set(neighbor.keys()), set([]), neighbor, C)
    return C
def ch05_33_list_pos_of_word_given_word_and_pos():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    tagged_word_bigrams = nltk.bigrams(tagged_words)
    dd = nltk.defaultdict(dict)
    for (w1, t1), (w2, t2) in tagged_word_bigrams:
        dd[w1][t1] = t2
    print dd
def get_vector(stem_list):
    vector = nltk.defaultdict(int)
    for stem in stem_list:
        vector[stem] += 1
    return vector
def add_unk(text, textlist, freq_dist, threshold_num=0):
    mapping = nltk.defaultdict(lambda: 'UNK')
    for v in textlist:
        if freq_dist[v] > threshold_num:
            mapping[v] = v
    text_with_unk = [mapping[v] for v in textlist]
    text_unk_set = set(text_with_unk)
    return text_with_unk
def buildPronounciationDictionary():
    pron_entries = nltk.corpus.cmudict.entries()
    pron = nltk.defaultdict(list)
    for entry in pron_entries:
        pron[entry[0]] = entry[1]
    pron['syrians'] = []  # the word "syrians" has a weird pronunciation
    return pron
def compDiv1(lid, ust, leg_dict, fam):
    import math
    div = 0.0
    cnt_genus = nltk.defaultdict(float)
    ust = ust.replace(";", "")
    ust = ust.replace("/", "_")
    ust = ust.replace(" ", "_")
    ust = ust.replace("-", "_")
    # ust = ust.replace(")", "_")
    ust = ust.replace("\n", "")
    # print ust
    k = get_bipartition(ust)
    parts = k[0]
    lang_set = k[1]
    denom = math.log(len(lang_set))
    for bipart1 in parts:
        bipart2 = lang_set - bipart1
        l1 = len(bipart1) * 1.0
        l2 = len(bipart2) * 1.0
        cnt_genus = nltk.defaultdict()
        for lang_no in bipart1:
            lang = leg_dict[fam][lang_no]
            genus = lid[lang]
            if genus in cnt_genus:
                cnt_genus[lid[lang]] += 1.0
            else:
                cnt_genus[lid[lang]] = 1.0
        for key in cnt_genus.iterkeys():
            p = cnt_genus[key] / l1
            div += p * math.log(p)
        cnt_genus = nltk.defaultdict()
        # fixed: the original iterated "for lang in bipart2" while still indexing with the stale lang_no
        for lang_no in bipart2:
            lang = leg_dict[fam][lang_no]
            genus = lid[lang]
            if genus in cnt_genus:
                cnt_genus[lid[lang]] += 1.0
            else:
                # print lang
                cnt_genus[lid[lang]] = 1.0
        for key in cnt_genus.iterkeys():
            p = cnt_genus[key] / l2
            # print key, p
            div += p * math.log(p)
    ediv = -div / denom
    return str(ediv)
def tag_most_frequent_words():
    alice = nltk.corpus.gutenberg.words("carroll-alice.txt")
    vocab = nltk.FreqDist(alice)
    v1000 = list(vocab)[:1000]
    mapping = nltk.defaultdict(lambda: "UNK")
    for v in v1000:
        mapping[v] = v
    alice2 = [mapping[v] for v in alice]
    print alice2[:100]
def getTagsPerWord(tagged=[], wordTags=None, opts=0):
    wordTags = wordTags if wordTags else getWordTagTypes(tagged)
    tagCounts = nltk.defaultdict(int)
    for key in wordTags.keys():
        tagCounts[key] = len(wordTags[key])
    if not opts:
        return tagCounts
    else:
        return wordTags, tagCounts  # assumed: the original returned the undefined name 'wordCounts'
def incrementally_update():
    counts = nltk.defaultdict(int)
    for (word, tag) in brown.tagged_words(categories='news', simplify_tags=True):
        counts[tag] += 1
    print counts['N']
    print list(counts)
    aa = sorted(counts.items(), key=itemgetter(1), reverse=True)
    print aa
    print [t for t, c in aa]
def train(labeled_featuresets, estimator=ELEProbDist):
    """
    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count up how many times each feature value occurred, given
    # the label and feature name.
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return NaiveBayesClassifier(label_probdist, feature_probdist)