示例#1
0
def setup_module():
    """Populate the module-level fixtures from TRAIN_FILE.

    Extends the existing global ``allwords`` in place with every key found
    in the per-entry counters returned by ``most_common.get_tags``, then
    rebinds the globals ``weights_nb`` (result of
    ``naivebayes.learnNBWeights``) and ``alltags`` (result of
    ``preproc.getAllTags``).
    """
    global weights_nb, alltags, allwords
    tag_counters = most_common.get_tags(TRAIN_FILE)
    # allwords is mutated, not rebound, so it must already exist at module level.
    for word_counts in tag_counters.values():
        allwords.update(set(word_counts.keys()))
    weights_nb = naivebayes.learnNBWeights(
        tag_counters,
        most_common.get_class_counts(tag_counters),
        allwords,
    )
    alltags = preproc.getAllTags(TRAIN_FILE)
示例#2
0
def setup_module():
    """Build the shared globals used by this module from TRAIN_FILE.

    Side effects:
    allwords   -- updated in place with the keys of every counter
    weights_nb -- rebound to the weights from naivebayes.learnNBWeights
    alltags    -- rebound to the result of preproc.getAllTags
    """
    global weights_nb, alltags, allwords

    counters = most_common.get_tags(TRAIN_FILE)
    # Collect the keys of each counter into the shared vocabulary set.
    for key in counters:
        allwords.update(set(counters[key].keys()))

    per_class_totals = most_common.get_class_counts(counters)
    weights_nb = naivebayes.learnNBWeights(counters, per_class_totals,
                                           allwords)
    alltags = preproc.getAllTags(TRAIN_FILE)
示例#3
0
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Emission weights are copied from the naive Bayes weights learned on
    ``trainfile``; transition weights are the log relative frequency of
    each (previous tag -> current tag) pair read from the file.

    Parameters:
    trainfile -- The name of the file to train weights on. The tag is read
                 from the second whitespace-separated field of each line;
                 a whitespace-only line ends the current sequence.

    Returns:
    weights -- defaultdict (default -1000.) holding (tag, word, EMIT) and
               (cur_tag, prev_tag, TRANS) log-prob features
    """
    # Naive Bayes weights provide the emission scores.
    tag_counters = most_common.get_tags(trainfile)
    vocabulary = set()
    for word_counts in tag_counters.values():
        vocabulary.update(set(word_counts.keys()))
    class_counts = most_common.get_class_counts(tag_counters)
    emission_weights = naivebayes.learnNBWeights(
        tag_counters, class_counts, vocabulary, 0.001)

    # Count tag-to-tag transitions, bracketing each sequence with
    # START_TAG and END_TAG.
    transition_counts = defaultdict(Counter)
    with open(trainfile) as instances:
        previous = START_TAG
        for raw_line in instances:
            fields = raw_line.rstrip().split()
            if not fields:
                # Whitespace-only line: close the sequence and start anew.
                transition_counts[previous][END_TAG] += 1
                previous = START_TAG
                continue

            # Lines with a single field carry no tag.
            current = fields[1] if len(fields) > 1 else UNKNOWN
            transition_counts[previous][current] += 1
            previous = current
        # Close a final sequence that was not followed by a blank line.
        if previous != START_TAG:
            transition_counts[previous][END_TAG] += 1

    hmm_weights = defaultdict(lambda: -1000.)
    for feature in emission_weights:
        hmm_weights[(feature[0], feature[1], EMIT)] = emission_weights[feature]

    for previous, followers in transition_counts.items():
        log_total = np.log(sum(followers.values()))
        for current, count in followers.items():
            hmm_weights[(current, previous, TRANS)] = np.log(count) - log_total

    return hmm_weights
示例#4
0
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    weights -- Weights dict with log-prob of transition and emit features.
               Emit features are keyed (tag, word, EMIT), transition
               features (cur_tag, prev_tag, TRANS); any other key reads
               as -1000.
    """
    # Step 1: naive Bayes weights, later reused as emission weights.
    counters = most_common.get_tags(trainfile)
    vocab = set()
    for key in counters:
        vocab.update(set(counters[key].keys()))
    class_counts = most_common.get_class_counts(counters)
    nb_weights = naivebayes.learnNBWeights(counters, class_counts, vocab, 0.001)

    # Step 2: tally every (previous tag -> next tag) pair in the file.
    bigram_counts = defaultdict(Counter)
    with open(trainfile) as handle:
        last_tag = START_TAG
        for text in handle:
            stripped = text.rstrip()
            if stripped:
                tokens = stripped.split()
                # The tag is the second field; a lone field has no tag.
                next_tag = UNKNOWN if len(tokens) <= 1 else tokens[1]
            else:
                # A whitespace-only line terminates the sequence.
                next_tag = END_TAG
            bigram_counts[last_tag][next_tag] += 1
            last_tag = next_tag if stripped else START_TAG
        # The file may end without a trailing blank line.
        if last_tag != START_TAG:
            bigram_counts[last_tag][END_TAG] += 1

    # Step 3: assemble the weights; unseen features default to -1000.
    hmm_weights = defaultdict(lambda: -1000.)
    hmm_weights.update(
        ((feature[0], feature[1], EMIT), nb_weights[feature])
        for feature in nb_weights
    )

    for source_tag in bigram_counts:
        successors = bigram_counts[source_tag]
        denominator = sum(successors.values())
        hmm_weights.update(
            ((target, source_tag, TRANS),
             np.log(successors[target]) - np.log(denominator))
            for target in successors
        )

    return hmm_weights
示例#5
0
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Emission weights are the naive Bayes weights (learned with
    ``alpha=0.001``) re-keyed as ``(tag, word, EMIT)``. Transition weights
    are keyed ``(tag2, tag1, TRANS)`` and hold
    ``log count(tag1, tag2) - log count(tag1)``, e.g.

        (end_tag, 'N', TRANS)  =>  q(stop | N) = count(N, stop) / count(N)

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    weights -- defaultdict with log-prob of transition and emit features;
               any feature that was never set reads as -1000.0
    """
    # Compute naive Bayes weights.
    counters = most_common.get_tags(trainfile)
    class_counts = most_common.get_class_counts(counters)
    allwords = set()
    for counts in counters.values():
        allwords.update(counts.keys())

    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords, alpha=0.001)

    # Convert NB weights to HMM emission weights.
    # .items() is used instead of .iteritems() so this runs on Python 3 too.
    hmm_weights = defaultdict(lambda: -1000.0)
    for (tag, word), weight in nb_weights.items():
        hmm_weights[(tag, word, EMIT)] = weight

    unigram = preproc.getNgrams(trainfile)
    bigram = preproc.getNgrams(trainfile, 2)
    unigramCount = preproc.getAllCounts(unigram)
    bigramCount = preproc.getAllCounts(bigram)

    for (tag1, tag2), pair_count in bigramCount.items():
        context_count = unigramCount.get(tag1, 0)
        if not context_count:
            # Without a count for tag1, log(0) is -inf and the difference
            # would be +inf, i.e. the best possible score. Leave the
            # feature unset so the -1000.0 default applies instead.
            continue
        hmm_weights[(tag2, tag1, TRANS)] = np.log(pair_count) - np.log(context_count)

    return hmm_weights