Example #1
    def train(self, corpus, alpha=0.1):
        self.set_attributes = list()
        for key in corpus[0][0].keys():
            self.set_attributes.append(key)
        self.labels = list(set([l for w, l in corpus]))

        self.count_wl = dict()
        for i in self.set_attributes:
            self.count_wl[i] = dict()
            for l in self.labels:
                self.count_wl[i][l] = defaultdict(lambda: 0)

        self.probability_wl = dict()
        for i in self.set_attributes:
            self.probability_wl[i] = dict()
            for l in self.labels:
                self.probability_wl[i][l] = defaultdict(lambda: 0)

        self.count_l = defaultdict(lambda: 0)
        for fs, l in corpus:
            for key, val in fs.items():
                self.count_wl[key][l][val] += 1
            self.count_l[l] += 1

        for key in self.labels:
            self.probability_l[key] = float(self.count_l[key]) / np.sum(
                list(self.count_l.values()))
def makePercentDict():
    decimalDict = nltk.defaultdict(lambda: 0)  # 1 6 0 6
    exponentDict = nltk.defaultdict(lambda: 0)  # 1 6 0 6
    lnDict = nltk.defaultdict(lambda: 0)  # 1 6 0 6

    getcontext().prec = 20

    for gram in allDict:
        if allDict.has_key(gram):
            value = allDict.get(gram)
            new_value = Decimal((float(value) / float(total)))
            #gram 6a6, count 1
            #print new_value #prints full decimal 0.000001750280482447312181777129785048053950645590955950691

            exponents = expm1(
                new_value)  #prints with exponents 1.75028201419e-06

            #logged_value = log(float(new_value)) #Math Domain Error #because log(0)
            if new_value == 0:
                logged_value = "Undefined"
            else:
                logged_value = Decimal(new_value).ln()  #log

            decimalDict[gram] = new_value
            exponentDict[gram] = exponents
            lnDict[gram] = logged_value

    return decimalDict, exponentDict, lnDict
 def __init__(self,corpus=None,alpha=0.1):
     # if corpus!=None:
     self.c_uvs=defaultdict(lambda :0)
     self.c_uv=defaultdict(lambda :0)
     self.c_xs=defaultdict(lambda :0)
     self.c_s=defaultdict(lambda :0)
     self.alpha=alpha
Example #4
    def constructEmissionMatrix(self, sourceFilesList: list):
        # construction of the emission matrix
        emission = defaultdict(dict)
        for tag in NE_TAG_lABELS:
            emission[tag] = defaultdict(float)
        for fileName in sourceFilesList:
            file = open(fileName, 'r', encoding='windows-1256')
            for line in file:
                words = re.split("\s+", line)
                entite = ''
                for word in words:
                    word = self.stemmer.stem(word)
                    if (re.findall('[A-Z]+', word) == []):
                        entite = word

                        continue
                    if not word in emission:
                        emission[word] = defaultdict(float)

                    emission[word][entite] += 1

            file.close()

        for tag in emission.keys():
            somme = 0.0
            for value in emission[tag].values():
                somme += value
            for word in emission[tag].keys():
                emission[tag][word] = round(
                    float("{0:.6f}".format(emission[tag][word] / somme)), 6)

        self.EMISSION_MATRIX = emission
        return emission
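The loop at the end of constructEmissionMatrix row-normalizes the nested counts: each count is divided by the sum of its row and rounded to six decimals. A minimal sketch of the same normalization on a toy nested dict (the tag and word names here are made up for illustration):

from collections import defaultdict

counts = {"PERS": {"Ahmed": 3.0, "Salim": 1.0}, "LOC": {"Rabat": 2.0}}
emission = defaultdict(dict)
for tag, row in counts.items():
    somme = sum(row.values())              # row total for this tag
    for word, c in row.items():
        emission[tag][word] = round(c / somme, 6)

print(emission["PERS"]["Ahmed"])           # 0.75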
def write_lex_file_srilm(o, corpus, lex, lang, iter, order, smoothing, lm3):
    get_lm_model_srilm(corpus, lang, order, 'addsmooth', smoothing)

    outfile = open(o, "w")
    lengths_needed = nltk.defaultdict(int)
    freqs = nltk.defaultdict(list)

    for c in lex:
        outfile.write(",".join(
            str(x) for x in [-1, c[0], c[1], c[2], c[3], c[15]]) + "\n")
        lengths_needed[len(c[0])] += 1
        freqs[len(c[0])] += [c[1]]

    for i in range(iter):
        temp_freqs = nltk.defaultdict(list)
        for k in freqs.keys():
            temp_freqs[k] = list(freqs[k])
            random.shuffle(temp_freqs[k])
        gen_lex, freqs_lex = generate_correct_number_srilm(
            corpus, lang, lengths_needed, temp_freqs, order)
        mps_lex = get_mps(gen_lex)
        for w in gen_lex:
            outfile.write(",".join(
                str(x) for x in
                [i, w, freqs_lex[w],
                 lm3.evaluate(w)[2], mps_lex[w],
                 len(w)]) + "\n")
        print "generated lexicon: ", str(i)
    outfile.close()
    return o
Example #6
    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label.

          max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # The set of (fname, fval) pairs used by this classifier.
        features = set()
        # The max & min probability associated w/ each (fname, fval)
        # pair.  Maps (fname,fval) -> float.
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)

        for (label, fname), probdist in self._feature_probdist.items():
            for fval in probdist.samples():
                feature = (fname, fval)
                features.add( feature )
                p = probdist.prob(fval)
                maxprob[feature] = max(p, maxprob[feature])
                minprob[feature] = min(p, minprob[feature])
                if minprob[feature] == 0:
                    features.discard(feature)

        # Convert features to a list, & sort it by how informative
        # features are.
        features = sorted(features, 
            key=lambda feature: minprob[feature]/maxprob[feature])
        return features[:n]
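The docstring's informativeness ratio is easy to check by hand: for each (fname, fval) pair, take the highest and lowest P(fname=fval|label) over all labels and sort by min/max (smaller means more informative). A minimal sketch with made-up probabilities:

from collections import defaultdict

# toy P(fname=fval|label) table: one feature value, two labels
probs = {("contains(awful)", True): {"pos": 0.02, "neg": 0.30}}

maxprob = defaultdict(lambda: 0.0)
minprob = defaultdict(lambda: 1.0)
for feature, by_label in probs.items():
    for label, p in by_label.items():
        maxprob[feature] = max(p, maxprob[feature])
        minprob[feature] = min(p, minprob[feature])

feature = ("contains(awful)", True)
print(minprob[feature] / maxprob[feature])   # ~0.067, i.e. highly informative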
Example #7
def generate_correct_number_pool(n, corpus, homo, pool, eps=1e-6):
    """Generate number of words to match length and p(word) from ngram model using a pool of pre-generated words"""
    x = NgramModel(n, corpus, 1, homo)
    poolxLengths = nltk.defaultdict(list)
    poolxP = nltk.defaultdict(float)

    for item in pool:
            item = item.strip().split(",")
            poolxLengths[int(item[0])].append(item[1])
            poolxP[item[1]] = float(item[2])
        
    same_length = nltk.defaultdict(int)
    for i in range(20):
            same_length[i] = dict([(k, poolxP[k]) for k in poolxLengths[i] if k in poolxP])
    newwords = []
    exist =0
    hom_count =0
    tot_p=0
    for i, w in enumerate(corpus):
        p_match = x.evaluate(w)
        sample = get_range(same_length[len(w)], p_match-eps, p_match+eps)
        while len(sample) == 0:
                eps = eps*2
                sample = get_range(same_length[len(w)], p_match-eps, p_match+eps)
        eps = 1e-6
        nw = random.choice(sample.keys())
        while nw in newwords and homo != 1:
            nw = random.choice(sample.keys())
        if nw in corpus:
            exist +=1
        if nw in newwords:
            hom_count +=1
        del same_length[len(w)][nw]
        tot_p += poolxP[nw] + log(1/float(len(sample)))
        newwords += [nw]
Example #8
def extractNE(tok, pnouns, dic):
    names = LinkedList.LinkedList()
    nameprofs = defaultdict(lambda: defaultdict(lambda: 0))
    #bag of words model
    tent = []
    prevword = "" #usually names are not preceded by an article - filters out some other named entities and some other cases
    for i in range(len(tok)):
        if STok.isuc(tok[i][0]) and (tok[i].lower() in pnouns) and (prevword not in articles):
            tent.append(tok[i])
        else:
            if len(tent) > 0 and type(tent): #will add if the named entity 
                match = matchNE(names, tent) #matches to most recent matching occurrence
                
                for j in range(0, len(weights)):
                    try:
                        word = tok[i+j].lower()
                        if (word not in pnouns) and len(word) > 3 and ("," not in word) and (word in dic):
                            nameprofs[match][word] += weights[j]
                    except:
                        break
                for j in range(0, len(weights)):
                    try:
                        word = tok[i-len(tent)-j-1].lower()
                        if (word not in pnouns) and len(word) > 3 and ("," not in word) and (word in dic):
                            nameprofs[match][word] += weights[j]
                    except:
                        break
                
            tent = []
            prevword = tok[i]
    return [names, nameprofs]
Example #9
    def make_mentions(self, words):
        ret = []
        self.get_names()  # get names any time mentions are made
        starts = defaultdict(list)
        ends = defaultdict(list)
        for w in words:
            start_ids = [
                int(id.replace(')', '').replace('(', ''))
                for id in w.mention.split('|') if id.startswith('(')
            ]
            for id in start_ids:
                starts[id].append(w)
            end_ids = [
                int(id.replace(')', '').replace('(', ''))
                for id in w.mention.split('|') if id.endswith(')')
            ]
            for id in end_ids:
                ends[id].append(w)

        for id in starts:
            while starts[id]:
                s = starts[id].pop()
                e = ends[id].pop()
                ret.append(
                    Mention(words[s.index:e.index + 1], s.index, e.index,
                            self.index, self.tree, id))
                ret.reverse()  # reverse so in sentence order
        return ret  #Should these mentions be in breadth first tree search order?
Example #10
    def format_grammar_old(self, f):
        #print ">>> Loading cfg counts ..."
        f_out = open("grammars/grammar_formated.txt", 'w')
        gram = nltk.defaultdict(lambda: nltk.defaultdict(int))
        tot = nltk.defaultdict(int)
        for line in f:
            freq, rule = line.split("\t")
            A, B = rule.split(" --> ")
            B = B.rstrip('\n')
            self.nonterminal_counts[A] += float(freq)
            #print A, B, len(B), freq
            if " " in B:  #BINARY RULE
                B1, B2 = B.split()
                self.binary_rule_counts[(A, B1, B2)] = float(freq)
                gram[A][B1 + "+" + B2] = float(freq)
                tot[A] += float(freq)
            else:
                gram[A][B] = float(freq)
                tot[A] += float(freq)
                if len(B) == 1:  #UNARY RULE
                    self.unary_rule_counts[(A, B)] = float(freq)

        for i in gram.keys():
            f_out.write(i + "\t")
            for j in sorted(gram[i].keys()):
                if j == sorted(gram[i].keys())[-1]:
                    f_out.write(j + ":" + str(float(gram[i][j]) / tot[i]))
                else:
                    f_out.write(j + ":" + str(float(gram[i][j]) / tot[i]) +
                                " | ")
            f_out.write("\n")
Example #11
    def format_grammar(self, f, smoothing):
        tot = nltk.defaultdict(int)
        gram = nltk.defaultdict(lambda: nltk.defaultdict(int))
        lines = f.readlines()[1:-1]
        for line in lines:
            p, rule = line.split("\t")
            A, B = rule.split(" --> ")
            B = B.rstrip('\n')
            gram[A][B] = float(p)
            tot[A] += float(p)


#            self.prod[A].append((B, float(p)))
#            self.nonterminal_counts[A] += float(p)
#print A, B, len(B), freq
#            if " " in B: #BINARY RULE
#                B1, B2 = B.split()
#                self.binary_rule_counts[(A,B1,B2)] = float(p)
#            else:
#                if len(B) == 1: #UNARY RULE
#                    self.unary_rule_counts[(A,B)] = float(p)
        for i in gram.keys():
            for j in gram[i].keys():
                p_new = (gram[i][j] + smoothing) / float(tot[i] + smoothing *
                                                         len(gram[i].keys()))
                self.prod[i].append((j, p_new))
                self.nonterminal_counts[i] += float(p_new)
                if " " in j:  #BINARY RULE
                    B1, B2 = j.split()
                    self.binary_rule_counts[(i, B1, B2)] = float(p_new)
                else:
                    if len(j) == 1:  #UNARY RULE
                        self.unary_rule_counts[(i, j)] = float(p_new)
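The add-smoothing step above rescales each right-hand side as (weight + smoothing) / (total + smoothing * number of expansions of that left-hand side), so every nonterminal's expansions still sum to 1. A minimal worked sketch with made-up counts:

gram = {"NP": {"DT+NN": 6.0, "NN": 3.0, "PRP": 1.0}}
tot = {"NP": 10.0}
smoothing = 0.5

for lhs, rules in gram.items():
    denom = tot[lhs] + smoothing * len(rules)      # 10 + 0.5 * 3 = 11.5
    for rhs, weight in rules.items():
        p_new = (weight + smoothing) / denom
        print(lhs, "->", rhs, round(p_new, 4))
# (6.5 + 3.5 + 1.5) / 11.5 == 1.0, so the smoothed rules still sum to one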
Example #12
def perm(lexicon, lang):
	iter =1000
	lex = [i.strip().split("\t") for i in open(lexicon, "r").readlines()[1:]]
	
	#select only words length 3 to 7
	selected_lex = []
	for c in lex:
		if int(c[15]) >= 3 and int(c[15]) <=10:
			selected_lex += [c]

	print len(selected_lex)
	o = "celex_perm/sim_perm_" +lang +".txt"
	outfile  = open(o, "w")
	outfile.write(",".join(["lex","word","count","prob","mps","length","lang"]) + "\n")
	freqs = nltk.defaultdict(list)
		
	for c in selected_lex:
        	outfile.write(",".join(str(x) for x in [-1,c[0],c[1],c[2],c[3],c[15], lang]) + "\n")
		freqs[len(c[0])] += [c[1]]

	for i in range(iter):
    		temp_freqs = nltk.defaultdict(list)
		for k in freqs.keys():
			temp_freqs[k] = list(freqs[k])
			random.shuffle(temp_freqs[k])
	
		for c in selected_lex: 
			freq_w = temp_freqs[len(c[0])][-1]
			del temp_freqs[len(c[0])][-1] 
			outfile.write(",".join(str(x) for x in [i,c[0],freq_w,c[2],c[3],c[15], lang]) + "\n")
        	print "generated lexicon: ", str(i)
	outfile.close()
Example #13
def generate_correct_number(corpus, homo, lm):
    """Generate number of words to match length, handle homophones being generated"""
    lengths = nltk.defaultdict(lambda: nltk.defaultdict(int))
    lengths_needed = nltk.defaultdict(int)

    for item in corpus:
        lengths[get_cv(item)][len(item.split("-"))] += 1
        lengths_needed[get_cv(item)] += 1
    newwords = []
    newwords2 = []
    exist = nltk.defaultdict(int)
    while True:
        print(lengths_needed)
        words = lm.generate()
        for w in words:
            if lengths_needed[get_cv(w)] > 0:
                if homo == 1 or (w not in newwords
                                 and re.sub("-", "", w) not in newwords2):
                    lengths_needed[get_cv(w)] += -1
                    newwords += [w]
                    newwords2 += [re.sub("-", "", w)]
                    if w in corpus:
                        exist[len(w)] += 1
            elif sum([lengths_needed[j] for j in lengths_needed.keys()]) == 0:
                print "nb of real words", sum(exist.values())
                return newwords
Example #14
def analysis_using_word_and_prev_pos():
    from nltk.corpus import brown
    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news",
                                           simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    print pos[("DET", "right")]
Example #15
def analysis_using_word_and_prev_pos():
    from nltk.corpus import brown

    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    print pos[("DET", "right")]
Example #16
 def __init__(self):
     self.set_attributes = []
     self.labels = []
     self.count_wl = dict()
     self.count_l = defaultdict(lambda: 0)
     self.probability_wl = dict()
     self.probability_l = defaultdict(lambda: 0)
     pass
Example #17
def multiple_label_intersection_score(y_pred, y_gold):
    """
    this function will compute an intersection-based score for each position.
    Example 1:
    y_pred=[{1,3},{2,8},{3,7,9}]
    y_gold=[{4,5,6},{2,4,8},{1,3,7}]
    we can see that for position 0, the intersection is none, so intersection size is 0,
    and the size of pred is 2, the size of gold is 3, thus the score is (0/2+0/3)/2=0.
    Similarly, position 1 scores (2/2+2/3)/2=5/6 and position 2 scores (2/3+2/3)/2=2/3

    Example 2:
    it can also compute the score same way but for each label
    y_pred=[{1,6},{2,4},{3,9},{2,8},{3,4}]
    y_gold=[{4,5},{2,4},{1,3,7},{2,6},{4,5,10}]
    in this case we see that label "1" appears once in y_pred but not in y_gold;
    since a spurious appearance carries a penalty, we subtract from the score, so the score for "1" is -1.
    Similarly, label "2" appears twice in y_pred, and each time it is in y_gold,
    thus the accuracy for "2" is (1+1)/2=1.
    In the same way, "3" is (1-1)/2=0, "4" is (1+1)/2=1, "6" is (-1-1)/2=-1, "8" is -1/2=-0.5
    and "9" is -1/2=-0.5

    :param y_pred: a list of sets of predicted labels, must be same length as y_gold
    :param y_gold: a list of sets of labels, must be same length as y_pred
    :return:
    total_score: float of the total score calculated by example 1
    label_wise_accuracy: a dictionary,where keys are labels, values are float score of the label
                        calculated by example 2
    """
    assert len(y_pred) == len(
        y_gold), 'y_pred and y_gold need to have same length'
    label_wise_score = nltk.defaultdict(lambda: nltk.defaultdict(int))
    All_score = []
    for index, pred in enumerate(y_pred):
        pred = set(pred)
        gold = set(y_gold[index])
        Intersection = pred.intersection(gold)
        forward_score = len(Intersection) / len(pred)
        backward_score = len(Intersection) / len(gold)
        score = (backward_score + forward_score) / 2
        All_score.append(score)
        all = pred.union(gold)
        for label in all:
            if label in Intersection:
                label_wise_score[label]['total'] += 1
                label_wise_score[label]['correct'] += 1
            else:
                label_wise_score[label]['total'] += 1
                label_wise_score[label]['correct'] -= 1
    label_wise_accuracy = dict()
    for label in label_wise_score.keys():
        try:
            rate = label_wise_score[label]['correct'] / label_wise_score[
                label]['total']
        except:
            rate = 0
        label_wise_accuracy[label] = rate
    total_score = sum(All_score) / len(All_score)
    return total_score, label_wise_accuracy
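The per-position score from Example 1 of the docstring is just the average of |intersection|/|pred| and |intersection|/|gold|. A minimal sketch reproducing those numbers directly:

y_pred = [{1, 3}, {2, 8}, {3, 7, 9}]
y_gold = [{4, 5, 6}, {2, 4, 8}, {1, 3, 7}]

for pred, gold in zip(y_pred, y_gold):
    inter = pred & gold
    score = (len(inter) / len(pred) + len(inter) / len(gold)) / 2
    print(round(score, 4))   # 0.0, then 0.8333 (5/6), then 0.6667 (2/3)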
Example #18
def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifer.train(...)
    method to allow inspection of what the method is actually doing
    and how long it's taking"""
    """ 
    @param labeled_featuresets: A list of classified featuresets, 
             i.e., a list of tuples C{(featureset, label)}. 
          """ 
    label_freqdist = nltk.FreqDist() 
    feature_freqdist = nltk.defaultdict(nltk.FreqDist) 
    feature_values = nltk.defaultdict(set) 
    fnames = set() 

    print 'There are ' + str(len(labeled_featuresets)) + ' labeled featuresets'
    # Count up how many times each feature value occurred, given
    # the label and featurename.
    print 'Counting feature value occurence'
    i = 0
    for featureset, label in labeled_featuresets: 
        label_freqdist.inc(label)
        for fname, fval in featureset.items(): 
            # Increment freq(fval|label, fname) 
            feature_freqdist[label, fname].inc(fval) 
            # Record that fname can take the value fval. 
            feature_values[fname].add(fval) 
            # Keep a list of all feature names. 
            fnames.add(fname)
        print 'At featureset...' + str(i)
        i+=1
   
    # If a feature didn't have a value given for an instance, then 
    # we assume that it gets the implicit value 'None.'  This loop 
    # counts up the number of 'missing' feature values for each 
    # (label,fname) pair, and increments the count of the fval 
    # 'None' by that amount. 
    for label in label_freqdist: 
        num_samples = label_freqdist[label] 
        for fname in fnames: 
            count = feature_freqdist[label, fname].N() 
            feature_freqdist[label, fname].inc(None, num_samples-count) 
            feature_values[fname].add(None) 
   
    # Create the P(label) distribution
    print 'Making the P(label) distribution...'
    label_probdist = estimator(label_freqdist) 

   
    # Create the P(fval|label, fname) distribution
    print 'Making the P(fval|label, fname) distribution from '\
    + str(len(feature_freqdist.items()))\
    + ' feature freqs...'
    feature_probdist = {} 
    for ((label, fname), freqdist) in feature_freqdist.items(): 
        probdist = estimator(freqdist, bins=len(feature_values[fname])) 
        feature_probdist[label,fname] = probdist 
                 
    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
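NBCtrain expects the same input shape as nltk.NaiveBayesClassifier.train: a list of (featureset, label) tuples, where each featureset maps feature names to values. A minimal sketch of that structure (toy data; actually calling NBCtrain assumes the old NLTK 2.x API used above, with FreqDist.inc and ELEProbDist):

labeled_featuresets = [
    ({"contains(great)": True, "length": "long"}, "pos"),
    ({"contains(awful)": True, "length": "short"}, "neg"),
]
# classifier = NBCtrain(labeled_featuresets)   # only works against NLTK 2.x
# classifier.classify({"contains(great)": True, "length": "short"})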
Example #19
def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifer.train(...)
    method to allow inspection of what the method is actually doing
    and how long it's taking"""
    """ 
    @param labeled_featuresets: A list of classified featuresets, 
             i.e., a list of tuples C{(featureset, label)}. 
          """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print "There are " + str(len(labeled_featuresets)) + " labeled featuresets"
    # Count up how many times each feature value occurred, given
    # the label and featurename.
    print "Counting feature value occurence"
    i = 0
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)
        print "At featureset..." + str(i)
        i += 1

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution
    print "Making the P(label) distribution..."
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    print "Making the P(fval|label, fname) distribution from " + str(
        len(feature_freqdist.items())
    ) + " feature freqs..."
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
Example #20
def word_valency():
    table = nltk.defaultdict(lambda: nltk.defaultdict(set))
    entries = nltk.corpus.ppattach.attachments("training")
    for entry in entries:
        #    print entry
        key = entry.noun1 + "-" + entry.prep + "-" + entry.noun2
        table[key][entry.attachment].add(entry.verb)
    for key in sorted(table):
        if len(table[key]) > 1:
            print key, "N:", sorted(table[key]["N"]), "V:", sorted(table[key]["V"])
Example #21
def word_valency():
    table = nltk.defaultdict(lambda: nltk.defaultdict(set))
    entries = nltk.corpus.ppattach.attachments("training")
    for entry in entries:
        #    print entry
        key = entry.noun1 + "-" + entry.prep + "-" + entry.noun2
        table[key][entry.attachment].add(entry.verb)
    for key in sorted(table):
        if len(table[key]) > 1:
            print key, "N:", sorted(table[key]["N"]), "V:", sorted(
                table[key]["V"])
Example #22
 def __init__(self, grammar, ngram=None):
     self.__dict__.update(locals())
     self.n = 0
     self.ngram = ngram
     self.grammar_file = grammar
     self.nonterminal_counts = nltk.defaultdict(int)
     self.binary_rule_counts = nltk.defaultdict(int)
     self.unary_rule_counts = nltk.defaultdict(int)
     self.prod = nltk.defaultdict(list)
     self.reject = 0
     LM.__init__(self)
Example #23
def build_real_lex(path, lemma, language, mono, homo, minlength, maxlength, freq, celex_list):
    celex_path = get_celex_path(path, lemma, language)
    print celex_path
    corpus = build_celex_corpus(celex_path, language, lemma, model, mono)
    print "number of monomorphemes:", len(corpus)
    corpus = [c for c in corpus if float(c[1]) > 0]
    print "number of words in lex after selecting words frequency > 0:", len(corpus)
    corpus = [(clean_word(c[0]), c[1], c[2]) for c in corpus] #reduce celex to just pronunciation
    corpus =  [(celex_diphthong_sub(c[0]), c[1], c[2]) for c in corpus if "c" not in c[0] and "q" not in c[0] and "0" not in c[0] and "~" not in c[0] and "^" not in c[0] and "*" not in c[0] and "<" not in c[0] and ((language == 'english') | ("_" not in c[0]))]
    corpus = [c for c in corpus if (len(re.sub("-", "", c[0])) >= minlength and len(re.sub("-", "", c[0])) <= maxlength)]
    print "number of words in lex after cleaning pronunciation:", len(set(corpus))
    ortho2ph = nltk.defaultdict(str)
    ortho = nltk.defaultdict(list)
    count = nltk.defaultdict(int)
    for c in corpus:
        ortho2ph[c[2]] = c[0]
        ortho[c[0]].append(c[2])
        count[c[0]] += 1
    dict_corpus = nltk.defaultdict(int)
#    print "nb of distinct orthographic forms", len(ortho2ph.keys())
#    print "nb of distinct phonemic forms", len(ortho.keys())
#    print "nb of words that share their phonological form with at least one other word", sum([i for i in count.values() if i > 1])
    n = 0
    for c in ortho.keys():
        if len(set(ortho[c])) > 1:
            n += len(set(ortho[c]))
    for c in corpus:
        if not c[0] in dict_corpus:
            dict_corpus[c[0]] = float(c[1])
        else:
            dict_corpus[c[0]] += float(c[1])
    
    if homo == 0:
        corpus = [(x, y) for x, y in dict_corpus.iteritems()]
    else:
        corpus = [(y, x) for x, y in ortho2ph.iteritems()]
    print ">>>TOTAL NB OF WORDS", len(corpus)   
    f = open("celexes/" + "_".join([str(i) for i in celex_list]) + ".txt", "w")
    if freq == 0:
        corpus = [c[0] for c in corpus]
        if args.pcfg == 0:
            for line in corpus: f.write(line+"\n")
        else:
            for line in corpus: 
                l = list(line)
                for k in l:
                    f.write(k + " ")
                f.write("\n")
    else:
        corpus = [(''.join(ortho[c[0]][0]),c[1]) for c in corpus]
        corpus = sorted(corpus, key=lambda x: abs(float(x[1])))
        for c in corpus: f.write(c[0] + "\n")
    f.close()
    return corpus
Example #24
        def probability(c_wl, c_l, alpha=0.1):
            prob_wl = defaultdict(lambda: 0)
            prob_l = defaultdict(lambda: 0)

            for wl in c_wl.keys():
                l = wl.split("_")[1]
                prob_wl[wl] = float(c_wl[wl]) / (c_l[l] + alpha * len(c_wl))

            for l in c_l.keys():
                prob_l[l] = float(c_l[l]) / np.sum(list(c_l.values()))

            return prob_wl, prob_l
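probability() applies add-alpha smoothing: each word/label count is divided by its label count plus alpha times the number of word/label keys, while label probabilities are plain relative frequencies. A minimal worked sketch of the same arithmetic, assuming keys follow the "word_label" convention the function splits on:

import numpy as np
from collections import defaultdict

c_wl = {"hello_greet": 3, "bye_farewell": 1}   # (word, label) counts
c_l = {"greet": 3, "farewell": 1}              # label counts
alpha = 0.1

prob_wl = defaultdict(float)
for wl, c in c_wl.items():
    label = wl.split("_")[1]
    prob_wl[wl] = float(c) / (c_l[label] + alpha * len(c_wl))

prob_l = {l: float(c) / np.sum(list(c_l.values())) for l, c in c_l.items()}
print(prob_wl["hello_greet"], prob_l["greet"])   # 3/(3+0.2)=0.9375 and 3/4=0.75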
Example #25
def find_key_words_by_w2v_with_cc_titles(
    projectccdf,
    pretrained_map_dict,
    remain=.9,
    projectID_column='ProjectID',
    StatmentOfWork_column='StatementOfWork',
    cc_title_column='Title'
):
    """
    This function is trying to find keywords in sow that have similar meaning of their cc titles using method of
    word2vec

    :param projectccdf: target dataframe, need to have projectID column, StatmentOfWork column and cc_title_column
    :param pretrained_map_dict: a dictionary where the keys are strings(words), and values are array of word vectors
    :param remain: a float, tell percentage of of words remain for each sow
    :param projectID_column: string, the name of projectID_column
    :param StatmentOfWork_column: string, the name of StatmentOfWork_column
    :param cc_title_column: string, the name of cc_title_column
    :return:
    a filtered dataframe, and a dict of filtered words for each projectID
    """
    df = projectccdf.copy()
    OBJ = CreateDIYdictFromDataFrame(projectccdf)
    PID2TITLEDICT = OBJ.DIY_dict([projectID_column, cc_title_column], convert_to=set)
    PID2SOWDICT = OBJ.DIY_dict([projectID_column, StatmentOfWork_column], convert_to=set)
    removed_dict = nltk.defaultdict(lambda: nltk.defaultdict(dict))
    pretrained = pretrained_map_dict
    dim = len(pretrained[list(pretrained.keys())[0]])
    PID_filter_sow_dict = dict()
    for PID, title_set in tqdm(PID2TITLEDICT.items()):
        sow1 = list(PID2SOWDICT[PID])[0].split(' ')
        sow = [w for w in sow1 if w in pretrained]
        removed_not_in_pretrained = [w for w in sow1 if w not in pretrained]
        title = ' '.join(title_set).split(' ')
        title = [w for w in title if w in pretrained]
        word_dist_dict = nltk.defaultdict(lambda: 0)
        for sow_word in sow:
            sow_word_vec = pretrained[sow_word].reshape(1, dim)
            for title_word in title:
                title_word_vec = pretrained[title_word].reshape(1, dim)
                dist = cosine_distances(sow_word_vec, title_word_vec)[0][0]
                word_dist_dict[sow_word] += dist
        ranked_sow_word = [k for k, v in sorted(word_dist_dict.items(), key=lambda item: item[1])]
        ranked_sow_word_set = set(ranked_sow_word[:int(len(ranked_sow_word) * remain)])
        removed_words_by_rank = set(ranked_sow_word).difference(ranked_sow_word_set)
        sow_string = ' '.join([w for w in sow if w in ranked_sow_word_set])
        PID_filter_sow_dict[PID] = sow_string
        removed_dict[PID]['removed_not_in_pretrained'] = set(removed_not_in_pretrained)
        removed_dict[PID]['removed_words_by_dist'] = removed_words_by_rank
    for pid, text in tqdm(PID_filter_sow_dict.items()):
        df.loc[df[projectID_column] == pid, StatmentOfWork_column] = text
    return df, removed_dict
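The filtering step ranks every statement-of-work word by its summed cosine distance to the title words and keeps only the closest fraction given by remain. A minimal numpy-only sketch of that ranking (the words and 2-d vectors below are invented; the original relies on pretrained embeddings and a cosine_distances helper, presumably sklearn's):

import numpy as np

def cosine_distance(u, v):
    return 1.0 - np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

pretrained = {"engine": np.array([1.0, 0.0]), "turbine": np.array([0.9, 0.1]),
              "lunch": np.array([0.0, 1.0]), "design": np.array([0.8, 0.3])}
sow, title, remain = ["engine", "lunch", "design"], ["turbine"], 0.67

dist = {w: sum(cosine_distance(pretrained[w], pretrained[t]) for t in title) for w in sow}
ranked = sorted(sow, key=lambda w: dist[w])
kept = ranked[:int(len(ranked) * remain)]
print(kept)   # ['engine', 'design'] -- 'lunch' is farthest from the title word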
Example #26
def main():
    # accumulate words and word frequency distributions
    lines = []
    unigramFD = nltk.FreqDist()
    st = LancasterStemmer()
    fin = open("tripadvisor_palazzo_reviews.txt", 'rb')
    for line in fin:
        line = nltk.clean_html(line)
        words = nltk.word_tokenize(line.strip().lower())
        words = [w for w in words if not w in stopwords.words('english')]
        words = [
            w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        ]
        words = [st.stem(w) for w in words]
        words = filter(lambda x: isValid(x), words)
        [unigramFD.inc(x) for x in words]
        lines.append(words)
    fin.close()
    # identify likely phrases using a multi-pass algorithm based
    # on the LLR approach described in the Building Search Applications
    # Lucene, LingPipe and GATE book, except that we treat n-gram
    # collocations beyond 2 as n-1 gram plus a unigram.
    phrases = nltk.defaultdict(float)
    prevGramFD = None
    for i in range(2, 5):
        ngramFD = nltk.FreqDist()
        for words in lines:
            nextGrams = nltk.ngrams(words, i)
            nextGrams = filter(lambda x: isLikelyNGram(x, phrases), nextGrams)
            [ngramFD.inc(x) for x in nextGrams]
        for k, v in ngramFD.iteritems():
            if v > 1:
                c1 = unigramFD[k[0]] if prevGramFD == None else prevGramFD[
                    k[:-1]]
                c2 = unigramFD[k[1]] if prevGramFD == None else unigramFD[k[
                    len(k) - 1]]
                c12 = ngramFD[k]
                n = unigramFD.N() if prevGramFD == None else prevGramFD.N()
                phrases[k] = llr(c1, c2, c12, n)
        # only consider bigrams where LLR > 0, ie P(H1) > P(H0)
        likelyPhrases = nltk.defaultdict(float)
        likelyPhrases.update([(k, v) for (k, v) in phrases.iteritems()
                              if len(k) == i and v > 0])
        print "==== #-grams = %d ====" % (i)
        sortedPhrases = sorted(likelyPhrases.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        for k, v in sortedPhrases:
            print k, v
        prevGramFD = ngramFD
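llr() itself is not shown in this snippet; the comments point to the log-likelihood-ratio collocation test from the Building Search Applications book. A hedged sketch of one common formulation of that statistic (Dunning's LLR), under the assumption that c1 and c2 are the component counts, c12 the joint count, and n the total number of tokens; the author's actual llr() may differ:

from math import log

def logl(k, n, x):
    # binomial log-likelihood with a small epsilon to avoid log(0)
    eps = 1e-12
    return k * log(x + eps) + (n - k) * log(1.0 - x + eps)

def llr(c1, c2, c12, n):
    p = c2 / float(n)                 # H0: same rate everywhere
    p1 = c12 / float(c1)              # H1: rate of c2 right after c1
    p2 = (c2 - c12) / float(n - c1)   # H1: rate of c2 elsewhere
    return 2.0 * (logl(c12, c1, p1) + logl(c2 - c12, n - c1, p2)
                  - logl(c12, c1, p) - logl(c2 - c12, n - c1, p))

print(llr(110, 2442, 24, 1000000) > 0)   # a strongly associated pair scores > 0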
Example #27
def generate_correct_number(corpus, homo, lm):
    """Generate number of words to match length, handle homophones being generated"""
    lengths = nltk.defaultdict(lambda: nltk.defaultdict(int))
    lengths_needed = nltk.defaultdict(int)

    for item in corpus:
        lengths[get_cv(item)][len(item.split("-"))] += 1
        lengths_needed[get_cv(item)] += 1
    newwords = []
    newwords2 = []
    exist = nltk.defaultdict(int)
#    print lengths_needed
#    hom = dict((i,corpus.count(i)) for i in corpus if corpus.count(i) > 1)
#    ratio = sum(hom.values())/float(len(hom)+1)

#    for i in lengths_needed.keys():
#        while lengths_needed[i] > 0:
#            words = lm.generate(i)
#            for w in words:
#                if homo == 1 or (w not in newwords and re.sub("-","",w) not in newwords2):
#                    lengths_needed[i] -= 1
#                    newwords += [w]
#                    newwords2 += [re.sub("-", "", w)]
#                    if re.sub("-","",w) in corpus:
#                        exist[len(re.sub("-","",w))] +=1
#    print exist
#    return newwords

    while True:
        words = lm.generate()
        for w in words:
            if lengths_needed[get_cv(w)] > 0:
                if homo == 1 or (w not in newwords and re.sub("-","",w) not in newwords2):
#                    temp = newwords + [w]
#                    hom_new = Set([(i,temp.count(i)) for i in temp if temp.count(i) > 1])
#                    if len(hom_new)!= 0: 
#                        ratio_temp = sum([h[1] for h in hom_new])/float(len(hom_new))
#                    else:
#                        ratio_temp = 0
#                    if (ratio_temp <= (ratio + 0.1) and len(hom_new) <= len(hom)) or w not in newwords:
                    lengths_needed[get_cv(w)] += -1
#                    if sum([lengths_needed[j] for j in lengths_needed.keys()]) %1000 == 0:
#                        print sum([lengths_needed[j] for j in lengths_needed.keys()])
                    newwords += [w]
                    newwords2 += [re.sub("-", "", w)]
                    if w in corpus:
                        exist[len(w)] +=1
            elif sum([lengths_needed[j] for j in lengths_needed.keys()]) == 0: 
                print "nb of real words", sum(exist.values())
                return newwords
Example #28
def single_label_included_score(y_pred, y_gold):
    """
    this function will compute the score by checking whether each predicted label is included in the gold set.
    Example 1:
    y_pred=[1,2,3]
    y_gold=[{4,5},{2,4},{1,3,7}]
    we can see that for position 0, label "1" is not in {4,5}, but for position 1 and 2
    labels "2", "3" are in {2,4} and {1,3,7} respectively, in this case, the overall
    score is 2/3

    Example 2:
    it can also compute the score same way but for each label
    y_pred=[1,2,3,2,3]
    y_gold=[{4,5},{2,4},{1,3,7},{2,6},{4,5}]
    in this case we see that for label "1", it appears once in y_pred, and not in y_gold
    thus accuracy for "1" is 0.
    Similarly, label "2" appears twice in y_pred, and each time it is in y_gold,
    thus accuracy for "2" is 1
    Same way, for "3" is 1/2

    :param y_pred: a list of predicted labels, must be same length as y_gold
    :param y_gold: a list of sets of labels, must be same length as y_pred
    :return:
    total_score: float of the total score calculated by example 1
    label_wise_accuracy: a dictionary,where keys are labels, values are float score of the label
                        calculated by example 2
    """
    assert len(y_pred) == len(
        y_gold), 'y_pred and y_gold need to have same length'
    count = 0
    label_wise_score = nltk.defaultdict(lambda: nltk.defaultdict(int))
    for index, pred in enumerate(y_pred):
        gold = set(y_gold[index])
        if pred in gold:
            count += 1
            label_wise_score[pred]['total'] += 1
            label_wise_score[pred]['correct'] += 1
        else:
            label_wise_score[pred]['total'] += 1
    label_wise_accuracy = dict()
    for label in label_wise_score.keys():
        try:
            rate = label_wise_score[label]['correct'] / label_wise_score[
                label]['total']
        except:
            rate = 0
        label_wise_accuracy[label] = rate
    total_score = count / len(y_gold)
    return total_score, label_wise_accuracy
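The total score from Example 1 of the docstring is simply the fraction of positions whose predicted label appears in the corresponding gold set. A minimal sketch reproducing the 2/3 result:

y_pred = [1, 2, 3]
y_gold = [{4, 5}, {2, 4}, {1, 3, 7}]

hits = sum(1 for pred, gold in zip(y_pred, y_gold) if pred in gold)
print(hits / len(y_gold))   # 0.666..., positions 1 and 2 hit, position 0 misses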
Example #29
    def _getInputDocs(self, Stops):
        '''
        Takes an input of docs provided by the user and performs the
        necessary calcs in order to make use of them in Kernel operations.
        '''
        stops = self._getStopwords(Stops, lang=lang)
        main_dict = nltk.defaultdict(list)
        weight_dict = nltk.defaultdict(int)
        
        # need to include various options to allow for feeding in file names
        # or actual documents; if they are providing sets of documents per-
        # taining to particular categories, and they want these averaged to-
        # gether for comparison purposes; etc.

        self._vecs = main_dict
Example #30
    def __init__(self, words_seq, unigram_freqs, bigram_freqs, d):
        self._unigrams = unigram_freqs
        self._bigrams = bigram_freqs
        self._cache = {}

        self._after = nltk.defaultdict(float)
        self._before = nltk.defaultdict(float)
        self._col_sums = nltk.defaultdict(float)
        self._length = 0  #len(words_seq)
        self._d = d

        for word_prev, word in self._bigrams:
            self._after[word_prev] += 1  # equiv. to adjacency matrix row sums
            self._length += 1  # equiv. to adjacency matrix NNZ
            self._before[word] += 1  # equiv. to adjacency matrix col sums.
Example #31
 def _getBrownDocs(self, Stops):
     '''
     Create reference distributions from the Brown corpus,
     separated by categories.  Non-tagged data (i.e., tags are
     removed).  We assume that each category of docs is actually a single
     document of that category.  This instance is mainly for test cases,
     and likely of little use to a user, except for basic comparisons.
     Also, this really only needs to be done once (unless stopwords are
     changed), since we can just pickle it and re-use it later.
     '''
     stops = self._getStopwords(Stops, lang=lang)
     main_dict = nltk.defaultdict(list)
     weight_dict = nltk.defaultdict(int)
     # first, load the files from brown topics (minus the two small ones)
     for category in set(brown.categories()).\
         difference(set(['humor', 'science_fiction'])):
         cat_files = brown.fileids(categories=category)
         key_list = []           # misleading; list of words encountered
         temp_weight_dict = nltk.defaultdict(int)
         for f in cat_files:
             temp = brown.open(f).read().split()
             # brown files are tagged, so get rid of that info, for now
             temp = [entry.split('/')[0] for entry in temp]
             temp = [entry for entry in temp if entry not in stops]
             main_dict[category].append(self._FDtoDIC(nltk.FreqDist(temp)))
             # update the weight dict for this category
             temp_weight_dict['__NUM__'] += 1
             for entry in main_dict[category][-1].keys():
                 temp_weight_dict[entry] += 1
             key_list.extend(main_dict[category][-1].keys())
             key_list = set(key_list)
             cat_avg_dict = {}
             for word in key_list:
                 score = 0.0
                 for fdd in main_dict[category]:
                     score += float(fdd[word])/fdd['N']
                 cat_avg_dict[word] = float(score) / \
                                      len(main_dict[category].keys())
             main_dict[category].append(cat_avg_dict)
         # get weights for current category
         self._R_[category] = self._calcWeights(temp_weight_dict)
         # update the main weight dict for all docs
         for key in temp_weight_dict.keys():
             weight_dict[key] += temp_weight_dict[key]
         self._R_['__ALL__'] = self._calcWeights(weight_dict)
         ## need to add this...
         main_dict['__ALL__'] = ...
     self._vecs = main_dict
Example #32
def ch05_34_num_words_with_1to10_distinct_tags():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  # number of distinct tags and number of words in corpus for this
  dd = nltk.defaultdict(set)
  for w,t in tagged_words:
    dd[w].add(t)
  for i in range(1,10):
    print i, len(filter(lambda x: len(dd[x]) == i, dd.keys()))
  # for the word with greatest number of tags, print out concordance
  # one for each tag
  maxtags = 6
  word = None
  tags = None
  for w in dd.keys():
    if len(dd[w]) >= maxtags:
      word = w
      tags = dd[w]
      break
  poss = []
  pos = 0
  for w, t in tagged_words:
    if w == word and t in tags:
      poss.append((t, pos))
      tags.remove(t)
    pos += 1
  for t, pos in poss:
    print t, " ".join(w for w,t in tagged_words[pos-10:pos+10])
Example #33
def getSimWordNet(word, text=None, num_word=40, num=5):
    """
    This function uses the nltk.similar(word) function to create a dictionary
    with keys being the top "num_word" words in nltk.similar("word").  Entries
    are a list of tuples consisting of the top "num" word for nltk.similar(word)
    for each key and the corresponding inverse ranking (1/1, 1/2, ..., 1/num)
    of the word in the nltk.similar(key) list.  Instances of "word" in any of
    these lists are ignored.
    """
    text = text if text else nltk.Text(brown.words())
    text = textprocesses.TextProcess(text)
    num += 1                            # accounts for pos of 'word' in sim
    simWords = text.getsimilar(word, num_word)
    simWords = removePunc(simWords)     # need to make this mod
    wordNetDict = nltk.defaultdict(list)
    for w in simWords:
        wSim = text.getsimilar(w, num)
        # remove word from sim. list if present
        try:
            wSim.remove(word)
        except: pass
        # create entry for w using the first num words in wSim
        for s in wSim[:num-1]:
            wordNetDict[w].append((s,1.0/num))
    return wordNetDict
Example #34
def nMostFreq(N, words):
    wCounts = nltk.defaultdict(int)
    nCounts = nltk.defaultdict(int)
    for word in words:
        wCounts[word.lower()] += 1
    for key in wCounts.keys():
        nCounts[wCounts[key]] += 1
    tot = 0
    numStop = []
    while tot<N:
        numStop.append(max(nCounts.keys()))
        tot += nCounts.pop(max(nCounts.keys()))
    revWCounts = getReverseDict(wCounts)
    wordsN = []
    for num in numStop:
        wordsN.extend(revWCounts[num])
Example #35
def analyze_simple(trie, sent, connect_func=lambda x, y: True):
    """
    trie 構造から形態素が接続できるかどうかで node を作成し、
    作成した node から形態素の接続可能なすべての組み合わせを返す
    """
    bos_node = {'next':[], 'entry': _BOS_ENTRY}  # ... (1)
    end_node_list = nltk.defaultdict(list)  # ... (2)
    end_node_list[0].append(bos_node)
    for i in range(0, len(sent)+1):  # ... (6)
        if i < len(sent):
            cps_results = common_prefix_search(trie, sent[i:].encode('utf-8'))
        else:
            # EOS
            cps_results = [_EOS_ENTRY]

        for centry in cps_results:
            cnode = {'next': [], 'entry': centry}
            for bnode in end_node_list[i]:
                if connect_func(bnode, cnode):  # ... (3)
                    bnode['next'].append(cnode)  # ... (5)
                    end_nodes = end_node_list[i+centry['length']]
                    if not cnode in end_nodes:
                        end_nodes.append(cnode)  # ... (4)

    print('-' * 72)
    pprint(bos_node)
    print('-' * 72)
    return enum_solutions(bos_node)  # ... (7)
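enum_solutions() is not defined in this snippet. A hedged sketch of one way the lattice built above could be walked, assuming each entry dict carries a 'surface' string (a hypothetical field name) and that the EOS node has an empty 'next' list:

def enum_paths(node, prefix=None):
    # depth-first enumeration of every BOS-to-EOS path through the lattice
    prefix = (prefix or []) + [node['entry'].get('surface', '')]
    if not node['next']:          # no successors: this path is complete
        return [prefix]
    paths = []
    for nxt in node['next']:
        paths.extend(enum_paths(nxt, prefix))
    return paths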
Example #36
File: load.py  Project: yochananmkp/clir
def load(path_to_dict):
    # Although pystardict.Dictionary is a child class of dict, it doesn't
    # implement quite a few important basic methods such as keys(), iterkeys()
    # etc., so we cannot just simply iterate through it.
    raw_dict = Dictionary(path_to_dict);
   
    new_dict = defaultdict(tuple)
    size = float(len(raw_dict))
    count = 0
   
    # This is a workaround to iterate through the keys
    # NB Since the idx stores the offset-size pairs, its keys must be sorted in 
    # order to read the dictionary data linearly and gain the best performance
    for tuple_key in sorted(raw_dict.idx._idx):
        key = ''.join(tuple_key)
        value = raw_dict[key]
        # Convert value to set of French words
        value = re.sub(r'\d\. ?', '', value);
        value = re.split(r', | \n ', value);
        new_dict[key] = value;
        # Show a nice progress report
        count += 1
        print 'Loading dictionary...       %5.2f%%\r' % ((count / size) * 100),
        sys.stdout.flush() # this must be flushed to see the latest result
        
    print
    return new_dict
Example #37
def invert_dict(d):
    from nltk import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        for term in d[key]:
            inverted_dict[term].append(key)
    return inverted_dict
def parseLexicon(lex_str):
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first line is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(',')
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, catstr) = reLex.match(line).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)
            if sep == '::':
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(cat)
    return CCGLexicon(primitives[0], primitives, families, entries)
Example #39
def _w_b(word, overview):
    pos_forms = defaultdict(list)
    words = word.split(',')
    words = [w.strip() for w in words]
    for pos_str in ['noun', 'verb', 'adj', 'adv']:
        for w in words:
            '''
            if overview:
                pos_forms[pos_str].append(w)
            else:
                for form in _morphy(w, pos=pos_str):
                    if form not in pos_forms[pos_str]:
                        pos_forms[pos_str].append(form)
            '''
            for form in _morphy(w, pos=pos_str):
                if form not in pos_forms[pos_str]:
                    pos_forms[pos_str].append(form)
    body = ''
    for pos,pos_str,name in \
        ((N,'noun','Noun'), (V,'verb','Verb'),
         (ADJ,'adj','Adjective'), (ADV,'adv','Adverb')):
        if pos_str in pos_forms:
            if not overview:
                body += _hlev(3, name) + '\n'
            for w in pos_forms[pos_str]:
                # Not all words of exc files are in the database, so:
                try:
                    body += _collect_all(w, pos)
                except KeyError:
                    pass
    if not body:
        word = None
    return word,body
Example #40
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within 
    a given input sentence).
    
    @param pairs: a pair of list(str) and L{Tree}, as generated by 
    @param window: a threshold for the number of items to include in the left and right context
    @type window: C{int}
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', objclass', objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    result = []
    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = _join(pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].node
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
        reldict['objclass'] = pairs[1][1].node
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            print "(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass'])
        result.append(reldict)
        pairs = pairs[1:]
    return result
Example #41
def extract_real_lex(path, lemma, language, mono, hom,minlength, maxlength, minsyll, maxsyll, match, celex_list):
    celex_path = get_celex_path(path, lemma, language)
    lengths = nltk.defaultdict(int)
    print celex_path
    corpus = build_celex_corpus(celex_path, language, lemma, mono)
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [c for c in corpus if c[1] > 0] #freq greater than 0
    corpus = [clean_word(c[0]) for c in corpus] #reduce celex to just pronunciation
    corpus =  [celex_diphthong_sub(c) for c in corpus if "c" not in c and "q" not in c and "0" not in c and "~" not in c]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [i for i in corpus if (len(i.split("-")) > minsyll and len(i.split("-")) < maxsyll)]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    corpus = [i for i in corpus if (len(re.sub("-", "", i)) > minlength and len(re.sub("-", "", i)) < maxlength)]
    print ">>>TOTAL NB OF WORDS", len(corpus)
    if match == "length":
        corpus = [clean_syll(c) for c in corpus] #reduce celex to just pronunciation
    print ">>>TOTAL NB OF WORDS", len(corpus)
    if hom == 0: corpus = list(set(corpus))
    print ">>>TOTAL NB OF WORDS", len(corpus)
    f = open("kyle_celexes/" + "_".join([str(i) for i in celex_list]) + ".txt", "w")
    for line in corpus:
    	lengths[len(re.sub("-", "", line))] +=1
    	f.write(line + "\n")
    f.close()
    print ">>> Word-Length frequencies of lexicon to match"
    for k in lengths.keys():
		print k, lengths[k]
    return corpus
Example #42
def word_count():
    from nltk.corpus import brown
    counts = nltk.defaultdict(int)
    for (word, tag) in brown.tagged_words(categories="news"):
        counts[tag] += 1
    from operator import itemgetter
    print sorted(counts.items(), key=itemgetter(1), reverse=True)
Example #43
def avg_tf_idf(question, supporting_facts, rest):
    tf_idf_scores = tf_idf(question, supporting_facts, rest)
    sf_scores = defaultdict(int)
    rest_scores = defaultdict(int)
    for w, scores in tf_idf_scores.items():
        for i, score in enumerate(scores[0]):
            sf_scores[i] += score
        for i, score in enumerate(scores[1]):
            rest_scores[i] += score
    num_query_terms = len(tf_idf_scores.keys())
    for i, s in sf_scores.items():
        sf_scores[i] = s / num_query_terms

    for i, s in rest_scores.items():
        rest_scores[i] = s / num_query_terms
    return sf_scores, rest_scores
def parseLexicon(lex_str):
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first line is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(',')
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, catstr) = reLex.match(line).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)
            if sep == '::':
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(cat)
    return CCGLexicon(primitives[0], primitives, families, entries)
Example #45
def find_all_names(stoplist):
    ROOT = ['nltk']
    logger._verbosity = 0
    docindex = epydoc.docbuilder.build_doc_index(ROOT, add_submodules=True)
    valdocs = sorted(docindex.reachable_valdocs(
        imports=False,
        #packages=False, bases=False, submodules=False,
        #subclasses=False,
        private=False))
    logger._verbosity = 5
    names = nltk.defaultdict(list)
    n = 0
    for valdoc in valdocs:
        name = valdoc.canonical_name
        if (name is not epydoc.apidoc.UNKNOWN and
            name is not None and name[0] == 'nltk'):
            n += 1
            for i in range(len(name)):
                key = str(name[i:])
                if len(key) == 1: continue
                if key in stoplist: continue
                names[key].append(valdoc)

    log.info('Found %s names from %s objects' % (len(names), n))

    return names
Example #46
def normalize(text):
    remove_punct_dict = dict((ord(punct), None)
                             for punct in string.punctuation)
    # word tokenization
    word_token = nltk.word_tokenize(text.lower().translate(remove_punct_dict))

    # remove ascii
    new_words = []
    for word in word_token:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)

    # Remove tags
    rmv = []
    for w in new_words:
        text = re.sub("&lt;/?.*?&gt;", "&lt;&gt;", w)
        rmv.append(text)

    # pos tagging and lemmatization
    tag_map = nltk.defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    lemmatizer = nltk.WordNetLemmatizer()
    lemma_list = []
    rmv = [i for i in rmv if i]
    for token, tag in nltk.pos_tag(rmv):
        lemma = lemmatizer.lemmatize(token, tag_map[tag[0]])
        lemma_list.append(lemma)
    return lemma_list
def get_similar_groups(word_list, minimum):
    tri_list=get_all_pairs_similarity(word_list)
    tri_filtered=filter_pairs_similarity(tri_list, minimum)
    neighbor=nltk.defaultdict(set)
    for tri in tri_filtered:
        neighbor[tri[0]].add(tri[1])
        neighbor[tri[1]].add(tri[0])

    def bors_kerbosch_v1(R, P, X, G, C): #CODE FROM ONLINE RESOURCE
        if len(P) == 0 and len(X) == 0:
            if len(R) > 2:
                C.append(sorted(R))
            return    
        for v in P.union(set([])):
            bors_kerbosch_v1(R.union(set([v])), P.intersection(G[v]), X.intersection(G[v]), G, C)
            P.remove(v)
            X.add(v)

    def bors_kerbosch_v2(R, P, X, G, C): #CODE FROM ONLINE RESOURCE
        if len(P) == 0 and len(X) == 0:
            if len(R) > 2:
                C.append(sorted(R))
            return
        (d, pivot) = max([(len(G[v]), v) for v in P.union(X)])                  
        for v in P.difference(G[pivot]):
            bors_kerbosch_v2(R.union(set([v])), P.intersection(G[v]), X.intersection(G[v]), G, C)
            P.remove(v)
            X.add(v)
    C = []
    bors_kerbosch_v2(set([]),set(neighbor.keys()),set([]),neighbor,C)
    return C
Example #48
def ch05_33_list_pos_of_word_given_word_and_pos():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  tagged_word_bigrams = nltk.bigrams(tagged_words)
  dd = nltk.defaultdict(dict)
  for (w1,t1), (w2,t2) in tagged_word_bigrams:
    dd[w1][t1] = t2
  print dd
Example #49
def get_vector(stem_list):

	vector = nltk.defaultdict(int)
	
	for stem in stem_list:
		vector[stem] += 1
	
	return vector
Example #50
def add_unk(text, textlist, freq_dist, threshold_num=0):
    mapping = nltk.defaultdict(lambda: 'UNK')
    for v in textlist:
        if freq_dist[v] > threshold_num:
            mapping[v] = v
    text_with_unk = [mapping[v] for v in textlist]
    text_unk_set = set(text_with_unk)
    return text_with_unk
Example #51
def buildPronounciationDictionary():
	pron_entries = nltk.corpus.cmudict.entries()
	pron = nltk.defaultdict(list)
	for entry in pron_entries:
		pron[entry[0]] = entry[1]
	pron['syrians'] = [] # word syrians has weird pronounciation

	return pron
Example #52
def compDiv1(lid, ust, leg_dict, fam):
	import math
	div = 0.0
	cnt_genus = nltk.defaultdict(float)
	ust = ust.replace(";","")
	ust = ust.replace("/","_")
	ust = ust.replace(" ","_")
	ust = ust.replace("-","_")
#	ust = ust.replace(")","_")
	ust = ust.replace("\n","")
#	print ust
	k = get_bipartition(ust)
	parts = k[0]
	lang_set = k[1]
	denom = math.log(len(lang_set))
	for bipart1 in parts:
		bipart2 = lang_set - bipart1			
		l1 = len(bipart1)*1.0
		l2 = len(bipart2)*1.0
		cnt_genus = nltk.defaultdict()
		for lang_no in bipart1:
			lang = leg_dict[fam][lang_no]
			genus = lid[lang]
			if genus in cnt_genus:
				cnt_genus[lid[lang]] += 1.0
			else:
				cnt_genus[lid[lang]] = 1.0
		for key in cnt_genus.iterkeys():
			p = cnt_genus[key]/l1
			div += p*math.log(p)
		cnt_genus = nltk.defaultdict()
		for lang_no in bipart2:
			lang = leg_dict[fam][lang_no]
			genus = lid[lang]			
			if genus in cnt_genus:
				cnt_genus[lid[lang]] += 1.0
			else:
#				print lang
				cnt_genus[lid[lang]] = 1.0
		for key in cnt_genus.iterkeys():
			p = cnt_genus[key]/l2
#			print key, p
			div += p*math.log(p)
	ediv = -div/denom
	return str(ediv)
Example #53
def main():
    # accumulate words and word frequency distributions
    lines = []
    unigramFD = nltk.FreqDist()
    st = LancasterStemmer()
    fin = open("tripadvisor_palazzo_reviews.txt", 'rb')
    for line in fin:
        line = nltk.clean_html(line)
        words = nltk.word_tokenize(line.strip().lower())
        words = [w for w in words if not w in stopwords.words('english')]
        words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        words = [st.stem(w) for w in words]
        words = filter(lambda x: isValid(x), words)
        [unigramFD.inc(x) for x in words]
        lines.append(words)
    fin.close()
    # identify likely phrases using a multi-pass algorithm based
    # on the LLR approach described in the Building Search Applications
    # Lucene, LingPipe and GATE book, except that we treat n-gram
    # collocations beyond 2 as n-1 gram plus a unigram.
    phrases = nltk.defaultdict(float)
    prevGramFD = None
    for i in range(2, 5):
        ngramFD = nltk.FreqDist()
        for words in lines:
            nextGrams = nltk.ngrams(words, i)
            nextGrams = filter(lambda x: isLikelyNGram(x, phrases), nextGrams)
            [ngramFD.inc(x) for x in nextGrams]
        for k, v in ngramFD.iteritems():
            if v > 1:
                c1 = unigramFD[k[0]] if prevGramFD == None else prevGramFD[k[:-1]]
                c2 = unigramFD[k[1]] if prevGramFD == None else unigramFD[k[len(k) - 1]]
                c12 = ngramFD[k]
                n = unigramFD.N() if prevGramFD == None else prevGramFD.N()
                phrases[k] = llr(c1, c2, c12, n)
        # only consider bigrams where LLR > 0, ie P(H1) > P(H0)
        likelyPhrases = nltk.defaultdict(float)
        likelyPhrases.update([(k, v) for (k, v)
            in phrases.iteritems() if len(k) == i and v > 0])
        print "==== #-grams = %d ====" % (i)
        sortedPhrases = sorted(likelyPhrases.items(),
            key=operator.itemgetter(1), reverse=True)
        for k, v in sortedPhrases:
            print k, v
        prevGramFD = ngramFD
Example #54
def tag_most_frequent_words():
    alice = nltk.corpus.gutenberg.words("carroll-alice.txt")
    vocab = nltk.FreqDist(alice)
    v1000 = list(vocab)[:1000]
    mapping = nltk.defaultdict(lambda: "UNK")
    for v in v1000:
        mapping[v] = v
    alice2 = [mapping[v] for v in alice]
    print alice2[:100]
Example #55
def getTagsPerWord(tagged=[], wordTags=None, opts=0):
    wordTags = wordTags if wordTags else getWordTagTypes(tagged)
    tagCounts = nltk.defaultdict(int)
    for key in wordTags.keys():
        tagCounts[key] = len(wordTags[key])
    if not opts:
        return tagCounts
    else:
        return wordCounts, tagCounts
Example #56
def incrementally_update():
    counts = nltk.defaultdict(int)
    for (word, tag) in brown.tagged_words(categories='news', simplify_tags=True):
        counts[tag] += 1
    print counts['N']
    print list(counts)
    aa = sorted(counts.items(), key=itemgetter(1), reverse=True)
    print aa
    print [t for t, c in aa]
Example #57
def word_count():
    from nltk.corpus import brown

    counts = nltk.defaultdict(int)
    for (word, tag) in brown.tagged_words(categories="news"):
        counts[tag] += 1
    from operator import itemgetter

    print sorted(counts.items(), key=itemgetter(1), reverse=True)
Example #58
    def train(labeled_featuresets, estimator=ELEProbDist):
        """
        @param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples C{(featureset, label)}.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                feature_freqdist[label, fname].inc(None, num_samples-count)
                feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label,fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)