Example #1
def create_word_scores(posWords, negWords):

    word_fd = FreqDist()  # counts the frequency of every word across both classes
    print(type(word_fd))
    cond_word_fd = ConditionalFreqDist()  # counts word frequencies separately for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of words in positive texts
    neg_word_count = cond_word_fd['neg'].N()  # number of words in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square score of the word for the positive class; other measures
        # such as mutual information could be used here instead
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores  # maps each word to its information score
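The snippets in this collection assume the standard NLTK imports. A minimal, hypothetical driver for the function above might look like this (the toy word lists are invented for illustration):

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramAssocMeasures

pos_words = ['good', 'great', 'good', 'fine']
neg_words = ['bad', 'awful', 'bad']
scores = create_word_scores(pos_words, neg_words)
best = sorted(scores, key=scores.get, reverse=True)[:2]  # two most informative words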
Example #2
def get_high_information_words(labelled_words,
                               score_fn=BigramAssocMeasures.chi_sq,
                               min_score=5):
    '''
    Gets the high-information words using the chi-square measure
    '''
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [
            word for word, score in word_scores.items() if score >= min_score
        ]
        high_info_words |= set(bestwords)

    return high_info_words
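A hypothetical call, assuming labelled_words is an iterable of (label, word_list) pairs as the function expects, with the FreqDist, ConditionalFreqDist, BigramAssocMeasures and collections imports in place:

labelled = [
    ('pos', ['good', 'great', 'good', 'fine']),
    ('neg', ['bad', 'awful', 'bad']),
]
informative = get_high_information_words(labelled, min_score=3)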
Example #3
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        # Prepare the data
        data = []
        tags = []

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
        for s in train_data:
            start = ["<s>"]
            start.extend([tag for (word, tag) in s])
            start.extend(["</s>"])
            tags.extend(start)

        for i in range(len(tags) - 1):
            data.append((tags[i], tags[i + 1]))

        # Compute the transition model
        transition_FD = ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.transition_PD = ConditionalProbDist(transition_FD,
                                                 lidstone_estimator)

        return self.transition_PD
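The returned ConditionalProbDist can be queried directly once training has run; for example (the tagger instance name is hypothetical):

p = tagger.transition_PD['<s>'].prob('NOUN')  # P(first tag is NOUN | sentence start)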
Example #4
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        #raise NotImplementedError('HMM.emission_model')

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences

        data = []
        for sent in train_data:  #for each sentence
            for tuples in sent:  #for each pair of (word,tag) in every sentence
                data.append(
                    (tuples[1], tuples[0].lower()))  #list of tuples(tag,word)

        emission_FD = ConditionalFreqDist(data)
        # this is the estimator used for the probability distribution
        est = lambda emission_FD: LidstoneProbDist(emission_FD, 0.01,
                                                   emission_FD.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, est)
        self.states = list(emission_FD.keys())  # the tags observed in training
        #print(self.states[0])

        return self.emission_PD, self.states
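Likewise, the emission distribution can be queried per tag after training; for example (the instance name is hypothetical):

p = hmm.emission_PD['VERB'].prob('is')  # P(word 'is' | tag VERB)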
Example #5
def create_word_scores():
    posWords = json.load(open('p.json', 'r'))
    negWords = json.load(open('n.json', 'r'))

    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a single word list
    negWords = list(itertools.chain(*negWords))  # likewise for the negative words

    word_fd = FreqDist()  # counts the frequency of every word across both classes
    cond_word_fd = ConditionalFreqDist()  # counts word frequencies separately for positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of words in positive texts
    neg_word_count = cond_word_fd['neg'].N()  # number of words in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)  # chi-square score for the positive class; mutual information etc. would also work
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)  # likewise for the negative class
        word_scores[word] = pos_score + neg_score  # a word's information score is the sum of both chi-square scores

    return word_scores  # maps each word to its information score
Example #6
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        #raise NotImplementedError('HMM.transition_model')
        # TODO: prepare the data
        data = []

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>

        for s in train_data:
            assert (len(s) > 0)
            data.append(('<s>', s[0][1]))  # start symbol to first tag
            for i in range(len(s) - 1):
                data.append((s[i][1], s[i + 1][1]))
            data.append((s[len(s) - 1][1], '</s>'))  # last tag to end symbol

        # TODO compute the transition model
        cfdist = ConditionalFreqDist(data)
        cpdist = ConditionalProbDist(cfdist, MyProbDist, 13)
        transition_FD = cpdist
        self.transition_PD = transition_FD
        #print(self.tlprob('VERB','VERB'))
        #exit()
        return self.transition_PD
Example #7
def generate_conditional_prob_dist(training_passage, n):
    """Builds conditional probability models over the next word for every n-gram order from n down to 2 (n >= 2)."""
    
    ## remove special characters and symbols and convert to lower case
    training_passage = re.sub(r"[^\w\'\?]", ' ', training_passage).lower()
    
    ## tokenizing the sanitized passage
    words = nltk.word_tokenize(training_passage)
    
    cfdist_list = []
    cpdist_list = []
    
    ## generate cfdist/cpdist for each order, from n-grams down to bigrams
    for i in range(n, 1, -1):
        ## generate the i-grams and convert the generator into a list
        n_grams_generated = list(ngrams(words, i))

        ## convert each i-gram into a (context, next word) pair for prediction
        n_grams_for_predict = [(n_gram[:-1], n_gram[-1]) for n_gram in n_grams_generated]

        ## conditional frequency of the next word given each context
        cfdist = ConditionalFreqDist(n_grams_for_predict)

        ## conditional probability of the next word given each context
        cpdist = ConditionalProbDist(cfdist, MLEProbDist)
        
        cfdist_list.append(cfdist)
        cpdist_list.append(cpdist)
    
    return cpdist_list
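An illustrative call on a toy passage (the text is invented; the nltk, re, ngrams, ConditionalFreqDist, ConditionalProbDist and MLEProbDist imports are assumed, and NLTK's punkt tokenizer data must be available):

cpdists = generate_conditional_prob_dist("the cat sat on the mat and the cat slept", 3)
bigram_model = cpdists[-1]           # the last entry is the bigram model (1-word contexts)
print(bigram_model[('the',)].max())  # most likely word following 'the'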
Example #8
File: db.py  Project: danfairs/synt
    def store_freqdists(self):
        """
        Build NLTK frequency distributions based on feature counts and store them to Redis.
        """
        #TODO: this step and the above may possibly be combined

        word_fd = FreqDist()
        label_word_freqdist = ConditionalFreqDist()

        pos_words = self.r.zrange('positive_wordcounts',
                                  0,
                                  -1,
                                  withscores=True,
                                  desc=True)
        neg_words = self.r.zrange('negative_wordcounts',
                                  0,
                                  -1,
                                  withscores=True,
                                  desc=True)

        assert pos_words and neg_words, 'Requires wordcounts to be stored in redis.'

        #build a conditional freqdist with the feature counts per label
        #(FreqDist.inc() was removed in NLTK 3; use item assignment instead)
        for word, count in pos_words:
            word_fd[word] += count
            label_word_freqdist['positive'][word] += count

        for word, count in neg_words:
            word_fd[word] += count
            label_word_freqdist['negative'][word] += count

        self.pickle_store('word_fd', word_fd)
        self.pickle_store('label_fd', label_word_freqdist)
Example #9
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'rb'))
                print('Loaded from cache')
                print('bestwords count = %d' % (len(bestwords)))
                return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    neg_contents = contents[labels == 0]

    pos_words = set()
    neg_words = set()

    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))

    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))

    for word in pos_words:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

    for word in neg_words:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:limit]
    bestwords = set([w for w, s in best])

    print('all words count = %d' % (len(word_scores)))
    print('bestwords count = %d' % (len(bestwords)))

    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            f = open(cache_path, 'wb')
            pickle.dump(bestwords, f)
            print('Dumped to cache')

    return bestwords
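A hypothetical call; contents and labels are assumed to be NumPy arrays so that the boolean indexing (labels == 1) in the function body works:

import numpy as np

contents = np.array(["a great movie with a great cast", "an awful, boring plot"])
labels = np.array([1, 0])
best = get_bestwords(contents, labels, limit=100, n=None, cache=False)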
Example #10
 def findName(self, mostCommon=5):
     if self.name != 0:
         self.cfdName = ConditionalFreqDist(
             (word.lower(), tag) for (word, tag) in self.wsj)
         return [self.name, self.cfdName[self.name].most_common(mostCommon)]
     else:
         print("invalid method")
Example #11
def create_word_scores(pos_words, neg_words, pos_tag, neg_tag):
    pos_words = list(itertools.chain(*pos_words))
    neg_words = list(itertools.chain(*neg_words))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos_words:
        word_fd[word] += 1
        cond_word_fd[pos_tag][word] += 1
    for word in neg_words:
        word_fd[word] += 1
        cond_word_fd[neg_tag][word] += 1

    pos_word_count = cond_word_fd[pos_tag].N()
    neg_word_count = cond_word_fd[neg_tag].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[pos_tag][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[neg_tag][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #12
 def findAllTags(self, mostCommon=5):
     self.cfdTagAll = ConditionalFreqDist(
         (tag, word) for (word, tag) in self.wsj)
     for tag in sorted(self.cfdTagAll):
         print(tag, self.cfdTagAll[tag].most_common())
         #print(self.cfdTagAll)
     return dict(self.cfdTagAll)
Example #13
def create_word_scores():
    posNegDir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    posdata = tp.seg_fil_senti_excel(posNegDir + '/pos_review.xlsx', 1, 1,
                                        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    negdata = tp.seg_fil_senti_excel(posNegDir + '/neg_review.xlsx', 1, 1,
                                        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #14
def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)

    #build a frequency distribution of all words, plus per-label frequency distributions
    #(note: this snippet adds every word to both 'pos' and 'neg', so the two label counts are identical)
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #15
def high_information_words(labelled_words,
                           score_fn=BigramAssocMeasures.chi_sq,
                           min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(
                n_ii, (n_ix, n_xi), n_xx
            )  # n_ii: occurrences of the word under this label, n_ix: occurrences of the word overall,
            # n_xi: total words under this label, n_xx: total words overall
            word_scores[word] = score

        bestwords = [
            word for word, score in word_scores.items() if score >= min_score
        ]

        high_info_words |= set(bestwords)  # set union with this label's best words

    return high_info_words
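For reference, the chi-square call used above can be exercised in isolation with toy counts (the numbers are invented): n_ii occurrences of the word under the label, n_ix occurrences of the word overall, n_xi words under the label, n_xx words overall.

from nltk.collocations import BigramAssocMeasures

score = BigramAssocMeasures.chi_sq(8, (10, 500), 1000)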
Example #16
def most_informative_words(corpus, categories=['dem', 'rep'], count=2500):
    fd = FreqDist()
    cond_fd = ConditionalFreqDist()
    word_counts = {}

    for cat in categories:
        for word in corpus.words(categories=[cat]):
            word = word.lower().strip(".!?:,/ ")
            if not word.isalpha() or word in stopset:
                continue
            fd[word] += 1
            cond_fd[cat][word] += 1

        word_counts[cat] = cond_fd[cat].N()

    total_word_count = sum(word_counts.values())

    word_scores = collections.defaultdict(int)
    for word, freq in fd.items():
        for cat in categories:
            cat_word_score = BigramAssocMeasures.chi_sq(
                cond_fd[cat][word], (freq, word_counts[cat]), total_word_count)
            word_scores[word] += cat_word_score

    informative_words = sorted(word_scores.items(),
                               key=lambda ws: ws[1],
                               reverse=True)[:count]
    return set([w for w, s in informative_words])
Example #17
 def Ae_kappa(self, cA, cB):
     Ae = 0.0
     nitems = float(len(self.I))
     label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
     for k in label_freqs.conditions():
         Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
     return Ae
Example #18
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        data = []
        #[[(tag, word.lower()) for (word, tag) in sent]for sent in train_data]
        for sent in train_data:
            for (word, tag) in sent:
                data.append((tag, word.lower()))
                self.states.append(tag)

        emission_FD = ConditionalFreqDist(data)

        lidstone_estimator = lambda emission_FD: LidstoneProbDist(
            emission_FD, 0.01,
            emission_FD.B() + 1)

        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)
        self.states = list(set(self.states))

        return self.emission_PD, self.states
Example #19
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # raise NotImplementedError('HMM.emission_model')
        # TODO prepare data

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences
        data = []
        for sent in train_data:
            sent_parsed = list(map(lambda x: (x[1], x[0].lower()), sent))
            data.extend(sent_parsed)

        # TODO compute the emission model

        #print('pair num:', len(data))
        cfdist = ConditionalFreqDist(data)
        #print(cfdist.conditions())
        #print(len(dict(cfdist['ADP'])))
        cpdist = ConditionalProbDist(cfdist, myProbDist1, 0.01)
        emission_FD = cpdist
        self.emission_PD = emission_FD
        self.states = list(cfdist.conditions())
        #print(self.elprob('VERB','is'))
        #exit()
        return self.emission_PD, self.states
Example #20
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        for idx, s in enumerate(train_data):
            train_data[idx].insert(0, ('<s>', '<s>'))
            train_data[idx].append(('</s>', '</s>'))

        tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                         for s in train_data)
        data = itertools.chain.from_iterable(tagGenerators)

        transition_FD = ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)

        self.transition_PD = ConditionalProbDist(transition_FD,
                                                 lidstone_estimator)

        return self.transition_PD
Example #21
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        #raise NotImplementedError('HMM.transition_model')

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
        data = []
        for sent in train_data:
            data.append(("<s>", sent[0][1]))  #start symbol
            for i in range(len(sent) - 1):
                data.append((sent[i][1], sent[i + 1][1]))
            data.append((sent[len(sent) - 1][1], "</s>"))  #end symbol

        transition_FD = ConditionalFreqDist(data)
        #same estimator used for emission_model
        est = lambda transition_FD: LidstoneProbDist(transition_FD, 0.01,
                                                     transition_FD.B() + 1)
        self.transition_PD = ConditionalProbDist(transition_FD, est)

        return self.transition_PD
Example #22
    def __init__(self, labeled_sequence, states, transform, alpha1, alpha2,
                 gammaPrior, gammaEmission):
        self.init = FreqDist()
        self.transition_bigram = ConditionalFreqDist()
        self.transition_unigram = FreqDist()
        self.emission = ConditionalFreqDist()

        # hyper-parameters for smoothing
        self.alpha1 = alpha1
        self.alpha2 = alpha2
        self.gammaPrior = gammaPrior
        self.gammaEmission = gammaEmission

        self.states = states
        self.symbols = []
        self.labeled_sequence = transform(labeled_sequence)
Example #23
def create_word_bigram_scores():
    posdata = get_word('static/pos.txt')
    negdata = get_word('static/neg.txt')
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    posbigram_finder = BigramCollocationFinder.from_words(posWords)
    negbigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = posbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)
    negBigrams = negbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)
    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
Example #24
    def __setTermsCHISQUARE__(self, size):
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()

        for word in self.reader.words(categories=['pos']):
            word_fd[word.lower()] += 1
            label_word_fd['pos'][word.lower()] += 1

        for word in self.reader.words(categories=['neg']):
            word_fd[word.lower()] += 1
            label_word_fd['neg'][word.lower()] += 1

        pos_word_count = label_word_fd['pos'].N()
        neg_word_count = label_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        wordScores = {}

        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                                   (freq, pos_word_count),
                                                   total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            wordScores[word] = pos_score + neg_score

        termScore = sorted(wordScores.items(),
                           key=lambda ws: ws[1],
                           reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
Example #25
def main():
    train_file_path = sys.argv[1]  # Get the training file path from arguments
    tokens = get_training_data(train_file_path)  # get all the word tag pair
    tag_freq_dist = FreqDist(
        tag for (word, tag) in tokens)  # get the frequency of all tags
    word_tag_freq_dist = ConditionalFreqDist(
        (word, tag) for word, tag in tokens)  # frequency of each tag conditioned on the word
    proximity_pairs = nltk.bigrams(tokens)  # bigrams of adjacent (word, tag) pairs
    tag_tag_confidence = nltk.ConditionalFreqDist(
        (a[1], b[1]) for (a, b) in proximity_pairs
    )  # frequency of each tag given the previous tag
    test_file_path = sys.argv[2]  # Get the test file path from arguments
    test_tokens = get_test_data(test_file_path)  # get all the words of test
    result_tagged = []  # store each word and their associated tags
    for i in range(len(test_tokens)):
        if i == 0:  # when there is no previous tag
            result_tagged.append(
                (test_tokens[i],
                 get_tagged(test_tokens[i],
                            prev_tag=None,
                            word_tag_freq_dist=word_tag_freq_dist,
                            tag_tag_confidence=tag_tag_confidence,
                            tag_freq_dist=tag_freq_dist)))
        else:
            result_tagged.append(
                (test_tokens[i],
                 get_tagged(test_tokens[i],
                            result_tagged[i - 1][1],
                            word_tag_freq_dist=word_tag_freq_dist,
                            tag_tag_confidence=tag_tag_confidence,
                            tag_freq_dist=tag_freq_dist)))
            # Rule 1: Tag every word that contains number to "CD"
            if re.match("(\d+(\.\d+)?)", test_tokens[i]) is not None:
                result_tagged[-1] = (result_tagged[-1][0], "CD")
            # Rule 2: If current tag is DT and previous word is "all", change the tag of "all" to "PDT" (Pre Determiner)
            if (result_tagged[-1][1] == "DT" and test_tokens[i - 1] == "all"):
                result_tagged[-2] = ("all", "PDT")
            # Rule 3: If current word is tagged NN i.e. singular noun and the word is capitalized, change the tag to "NNP" i.e. Proper Noun
            if (result_tagged[-1][1] == "NN" and test_tokens[i][0].isupper()):
                result_tagged[-1] = (result_tagged[-1][0], "NNP")
            # Rule 4: If current word is VBN i.e. past participle verb and previous word was capitalized then change the current tag to VBD i.e. past tense verb
            if (result_tagged[-1][1] == "VBN"
                    and test_tokens[i - 1][0].isupper()):
                result_tagged[-1] = (result_tagged[-1][0], "VBD")
            if len(result_tagged) >= 2:
                # Rule 5: If current tag is VB and previous word was tagged determiner, then change the current tag to NN i.e. singular noun
                if (result_tagged[-1][1] == "VB"
                        and (result_tagged[-2][1] == "DT")):
                    result_tagged[-1] = (result_tagged[-1][0], "NN")
                # Rule 6: If current tag is NN and previous word was tagged TO i.e. to, then change the current tag to VB i.e. verb
                if (result_tagged[-1][1] == "NN"
                        and (result_tagged[-2][1] == "TO")):
                    result_tagged[-1] = (result_tagged[-1][0], "VB")
                # Rule 7: If current tag is NN and previous word was tagged MD i.e. Modal, then change the current tag to VB i.e. verb
                if (result_tagged[-1][1] == "NN"
                        and (result_tagged[-2][1] == "MD")):
                    result_tagged[-1] = (result_tagged[-1][0], "VB")

    for word, tag in result_tagged:  # print all the result to STDOUT
        print("%s %s" % (word, tag))
Example #26
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        # raise NotImplementedError('HMM.transition_model')
        # prepare the data
        train_data = [[('<s>', '<s>')] + sentence + [('</s>', '</s>')]
                      for sentence in train_data]
        tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                         for s in train_data)
        data = itertools.chain.from_iterable(tagGenerators)

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>

        # compute the transition model

        transition_FD = ConditionalFreqDist(data)
        self.transition_PD = ConditionalProbDist(transition_FD,
                                                 self.lidstone_estimator)

        return self.transition_PD
Example #27
def create_word_bigram_scores(pos_corpus, neg_corpus):

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for corpus in pos_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
    for corpus in neg_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #28
        def GetHighInformationWordsChi(num_bestwords):
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()

            for word in movie_reviews.words(categories=['pos']):
                word_fd[word.lower()] += 1
                label_word_fd['pos'][word.lower()] += 1

            for word in movie_reviews.words(categories=['neg']):
                word_fd[word.lower()] += 1
                label_word_fd['neg'][word.lower()] += 1

            pos_word_count = label_word_fd['pos'].N()
            neg_word_count = label_word_fd['neg'].N()
            total_word_count = pos_word_count + neg_word_count

            word_scores = {}

            for word, freq in word_fd.items():
                pos_score = BigramAssocMeasures.chi_sq(
                    label_word_fd['pos'][word], (freq, pos_word_count),
                    total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(
                    label_word_fd['neg'][word], (freq, neg_word_count),
                    total_word_count)
                word_scores[word] = pos_score + neg_score

            best = sorted(word_scores.items(),
                          key=lambda ws: ws[1],
                          reverse=True)[:num_bestwords]
            bestwords = set([w for w, s in best])
            return bestwords
Example #29
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # Prepare the data

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences
        data = [(tag, word.lower()) for pairs in train_data
                for (word, tag) in pairs]

        # Compute the emission model
        emission_FD = ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)

        for tag, word in data:
            if tag not in self.states:
                self.states.append(tag)

        return self.emission_PD, self.states
Example #30
    def __init__(self, n, alpha=0.1, brown_categories=None):
        '''
        Initializes NgramBase with a list of conditional frequency distributions representing
        N-grams, (N-1)-grams, ...., bigrams, unigrams from the Brown corpus.
        '''
        self.n = n

        if brown_categories is None:
            brown_categories = brown.categories()
        # one independent sample list per n-gram order ([[]] * n would alias the same list object)
        samples = [[] for _ in range(n)]
        sents = self._get_sentences(brown_categories)
        for sent in sents:
            sent = [word.lower() for word in sent]
            if sent[-1].isalpha():
                sent += ['.']
            for index, m in enumerate(range(n, 0, -1)):
                igrams = ngrams(sent, m)
                igrams = [(igram[0:m - 1], igram[-1])
                          for igram in list(igrams)]
                samples[index] += igrams
        # list of N-grams with descending values of N
        self.grams = []
        for sample in samples:
            self.grams += [ConditionalFreqDist(sample)]

        # multiplier for each level of backoff
        self.alpha = alpha