Example #1
File: hmm.py Project: iliakur/NLP
def hmm(file):
    """Given an open FILE, e.g. from the open(filename) function,
    read pre-tagged sentences of WSJ, one per line.  Return an HMM,
    here represented as a tuple containing (1) the transition probabilities,
    and (2) the emission probabilities."""
    transitions = DefaultDict(DefaultDict(0))
    emissions = DefaultDict(DefaultDict(0))
    wordcounts = DefaultDict(0)
    # For each sentence (one per line)
    for line in file:
        # for each word in the sentence (space separated)
        prevtag = 'START'  # Before each sentence, begin in START state
        for taggedword in line.split():
            (word, tag) = taggedword.split('/')
            transitions[prevtag][tag] += 1
            emissions[tag][word] += 1
            wordcounts[word] += 1
    # At test time we will need estimates for "unknown words"---the words
    # that never occurred in the training data.  One recommended
    # way to do this is to turn all training words occurring just once
    # into '<UNKNOWN>' and use this as the stand-in for all "unknown words"
    # at test time.  Below we make all the necessary transformations
    # to '<UNKNOWN>'.
    for tag, worddict in emissions.items():
        for word, count in list(worddict.items()):
            if wordcounts[word] == 1:
                del emissions[tag][word]
                emissions[tag]['<UNKNOWN>'] += 1
    # Here you need to add code that will turn these dictionaries
    # of counts into dictionaries of smoothed conditional probabilities
    return (transitions, emissions)
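The comment above leaves the count-to-probability conversion as an exercise. A minimal sketch of that step, as a hypothetical helper that is not part of the original hmm.py; it only does plain relative-frequency normalization, and the smoothing the comment asks for is still left to the reader:

def normalize_counts(transitions, emissions):
    """Hypothetical helper: turn the nested count dictionaries into
    relative-frequency estimates in place.  Smoothing (e.g. add-one)
    would still need to be layered on top, per the comment in hmm()."""
    for prevtag in transitions:
        total = float(sum(transitions[prevtag].values()))
        for tag in transitions[prevtag]:
            transitions[prevtag][tag] /= total
    for tag in emissions:
        total = float(sum(emissions[tag].values()))
        for word in emissions[tag]:
            emissions[tag][word] /= total
    return (transitions, emissions)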
Example #2
def bigrams(words):
    """Given an array of words, returns a dictionary of dictionaries,
    containing occurrence counts of bigrams."""
    d = DefaultDict(DefaultDict(0))
    for (w1, w2) in zip([None] + words, words + [None]):
        d[w1][w2] += 1
    return d
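Both examples above lean on a DefaultDict class imported from a local dicts module rather than collections.defaultdict. A minimal sketch of what that helper is assumed to look like (a value-based default that is copied for missing keys, so DefaultDict(DefaultDict(0)) acts as a two-level counter; Example #7 below uses a factory-based variant that this sketch does not cover):

import copy

class DefaultDict(dict):
    """Dictionary that fills in a copy of `default` for missing keys."""
    def __init__(self, default):
        self.default = default
    def __getitem__(self, key):
        if key in self:
            return dict.__getitem__(self, key)
        # Store and return a fresh copy so nested defaults are never shared.
        return self.setdefault(key, copy.deepcopy(self.default))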
Example #3
def parse(sentence):
    global grammar
    # Create the table; index j for rows, i for columns
    length = len(sentence)
    score = [None] * length
    prob_table = DefaultDict(float)
    trace = {}

    for j in range(length):
        score[j] = [None] * (length+1)
        for i in range(length+1):
            score[j][i] = []
            
    # Fill the diagonal of the table with the parts-of-speech of the words
    for k in range(1,length+1):
        results = producers(sentence[k-1])
        for item in results:
            try:
                prob = grammar[item][sentence[k-1],]
            except KeyError:
                prob = grammar[item]['<unk>',]
            prob_table[k-1,k, item] = prob
        score[k-1][k].extend(results)

    # Weighted CYK
    for width in range(2, length+1):
        for start in range(0, length+1-width):
            end = start + width
            for mid in range(start, end):
                args = None
                for x in score[start][mid]: 
                    for y in score[mid][end]:
                        results = producers((x,y))
                        for item in results:
                            prob1 = grammar[item][(x,y)]
                            prob2 = prob1 + prob_table[start, mid, x] + prob_table[mid, end, y]
                            check = start, end, item
                            if check in prob_table:
                                if prob2 > prob_table[start, end, item]:
                                    prob_table[start, end, item] = prob2
                            else:
                                prob_table[start, end, item] = prob2
                            if check in trace:
                                if prob2 >= prob_table[start, end, item]:
                                    args = x, y, mid
                                    trace[start, end, item] = args
                            else:
                                args = x, y, mid
                                trace[start, end, item] = args
                            if item not in score[start][end]:
                                score[start][end].append(item)

    
    try:
        if prob_table[0, length, 'TOP']:
            return get_tree(sentence, trace, 0, length, 'TOP')
    except:
        print "",
Example #4
def files2countdict(files):
    """Given an array of filenames, return a dictionary with keys
    being the space-separated, lower-cased words, and the values being
    the number of times that word occurred in the files."""
    d = DefaultDict(0)
    for file in files:
        for word in open(file).read().split():
            d[word.lower()] += 1
    return d
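Typical usage, borrowing the directory layout of the MaxEnt example below (the 'spam' directory name is only illustrative):

import glob

spam_counts = files2countdict(glob.glob('spam/*'))
nigerian_count = spam_counts['nigerian']  # 0 if the word never occurred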
Example #5
def train_maxent (dirs):
    """Train and return a MaxEnt classifier.  
    The data structure returned is a dictionary whose keys are
    ('classname','word') tuples.  The values in the dictionary are
    the parameters (lambda weights) of the classifier.
    Note that this method does not return the list of classnames, 
    but the caller has those available already, since it is exactly the
    'dirs' argument.  

    If you need to recover the classnames from the dictionary itself,
    you'd need to do something like:
    maxent = train_maxent(dirs)
    classes = list(set([c for (c,v) in maxent.keys()]))

    Some typical usage:
    dirs = ['spam','ham'] # where these are sub-directories of the CWD
    maxent = train_maxent(dirs)
    # interested in seeing the weight of "nigerian" in the "spam" class?
    lambda_spam_nigerian = maxent[('spam','nigerian')]
    # to classify a document
    scores = classify(maxent,dirs,"spam/file123")
    """
    classes = dirs
    maxent = DefaultDict(0)
    # Gather the "constraints" and initialize all-zero maxent dictionary
    constraints = DefaultDict(0)
    for cls in classes:
        maxent[(cls,'DEFAULT')] = 0
        print cls
        for file in glob.glob(cls+"/*"):
            for word in open(file).read().split():
                word = word.lower()
                constraints[(cls,word)] += 1
                for clss in classes:
                    maxent[(clss,word)] = 0
    # Remember the maxent features, and get the starting point for optimization
    features = maxent.keys()
    lambda0 = maxent.values()
    # Here call an optimizer to find the best lambdas
    lambdaopt = optimize.fminNCG(value, lambda0, gradient, args=(features,dirs), printmessg=1)
    # Put the final optimal parameters in the returned dictionary
    assert maxent.keys() == features # Make sure the keys have not changed order
    maxent2 = dict([(k,v) for (k,v) in zip(maxent.keys(),lambdaopt)])
    return maxent2
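The docstring above mentions a classify function that is not shown here. A minimal sketch of what such a call might do with the returned weights: sum each class's lambda weights over the document's words (the assignment's real classify may exponentiate and normalize these into class probabilities):

def classify(maxent, classes, filename):
    """Hypothetical sketch: linear score for each class on one document."""
    words = [w.lower() for w in open(filename).read().split()]
    scores = {}
    for cls in classes:
        total = maxent.get((cls, 'DEFAULT'), 0)
        for word in words:
            total += maxent.get((cls, word), 0)
        scores[cls] = total
    return scores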
Example #6
    def __init__(self,
                 positive,
                 negative,
                 neutral,
                 pos,
                 start=0,
                 finish=None,
                 weight=0.2):
        self.positive = positive
        self.negative = negative
        self.neutral = neutral
        self.pos = pos
        self.weight = weight
        self.s = {}
        self.s0 = {}
        self.initialize_s()
        self.lemmas = sorted(self.s.keys())
        self.lemma_count = len(self.lemmas)
        self.start = start
        self.finish = finish
        if self.finish is None or self.finish > self.lemma_count:
            self.finish = self.lemma_count
        self.a = DefaultDict(DefaultDict(0.0))
        self.initialize_a()
Example #7
File: search.py Project: maslab-ufrgs/ivc
    def search(self, net, origin, destination, accept_single_edge=False):
        """Performs a search from origin to destination.
        """
        # Initialize necessary structures/data
        self.__priority_queue = PriorityDict()
        self.__edges = DefaultDict(lambda id: EdgeData(net.getEdge(id)))

        self.__destination = self.__edges[destination.getID()]

        first = self.__edges[origin.getID()]
        first.previous_edge = None
        first.heuristic_cost = self.heuristic_cost(origin)

        if origin.getID() == destination.getID():
            # If the origin and destination are the same,
            # they must still be changed from the original
            # (in this case meaningless) zero-cost
            first.state = EdgeData.UNVISITED
        else:
            first.state = EdgeData.OPEN

        if accept_single_edge:
            # Insert the first edge into the queue
            self.__priority_queue[first.getID()] = first.estimated_cost
        else:
            # Insert the neighbors of the first edge into the queue
            self.__visit_neighbors_of(first)

        # Main search body
        found_result = self.__search()

        # Reconstruct the result, if found
        if found_result:
            return self.__destination.reconstruct_path()
        else:
            return None
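The EdgeData.reconstruct_path call above is not defined in this snippet. A minimal sketch of what it is assumed to do, written as a free function over the previous_edge backpointers the search sets up (the project's real method may return the underlying network edges rather than these wrapper objects):

def reconstruct_path(destination_edge_data):
    """Hypothetical stand-in for EdgeData.reconstruct_path: follow the
    previous_edge backpointers from the destination back to the origin,
    then reverse into origin-to-destination order."""
    path = []
    edge_data = destination_edge_data
    while edge_data is not None:
        path.append(edge_data)
        edge_data = edge_data.previous_edge
    path.reverse()
    return path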
Example #8
import sys
import time
import threading
import Pyro4
import commands
from dicts import DefaultDict
from lock import Lock
import operator

Pyro4.config.SERIALIZERS_ACCEPTED.add('pickle')  # pickle serializer for data transmission
Pyro4.config.SERVERTYPE = "thread"  # pre-spawned pool of server threads
Pyro4.config.THREADPOOL_SIZE = 10  # number of threads spawned

clients = DefaultDict(DefaultDict(0))  # clients[clientID][event]: registered clients
cache_scores = DefaultDict(DefaultDict(-1))  # score[team][event]: cached scores for a given event
cache_medals = DefaultDict(DefaultDict(-1))  # medals[team][medalType]: cached medal count for a team

rwl = Lock()

global database  #database server object
idNum = 1  #id of the server
clientRegistry = {}  #store the registered clients
global cacophonix

push = 0  #decide whether to use pull or push based cache
pull = 0
Example #9
def bigrams(words):
    d = DefaultDict(DefaultDict(0))
    for (w1, w2) in zip([None] + words, words + [None]):
        d[w1][w2] += 1
    return d
Example #10
#!/usr/bin/python3
#Jeannelle Alford
#jkalfor2

#A language ID program using a letter bigram model

from dicts import DefaultDict
import re
import math

file = open("LangId.train.English", "r")
data = file.read()
file.close()
enDictUni = DefaultDict(0)
enDictBi = DefaultDict(DefaultDict(1))
enDictUni[data[0]] += 1
for i in range(1, len(data)):
    enDictUni[data[i]] += 1
    enDictBi[data[i-1]][data[i]] += 1

file = open("LangId.train.French", "r")
data = file.read()
file.close()
frDictUni = DefaultDict(0)
frDictBi = DefaultDict(DefaultDict(1))
frDictUni[data[0]] += 1
for i in range(1, len(data)):
    frDictUni[data[i]] += 1
    frDictBi[data[i-1]][data[i]] += 1

file = open("LangId.train.Italian", "r")
Example #11
# Generates the db.p database file (the pickle file).

import threading
import Pyro4
import commands
from dicts import DefaultDict
from lock import Lock
import time
import pickle

Pyro4.config.SERIALIZERS_ACCEPTED.add('pickle')  # pickle serializer for data transmission
Pyro4.config.SERVERTYPE = "thread"  # pre-spawned pool of server threads
Pyro4.config.THREADPOOL_SIZE = 100  # number of threads spawned

scores = DefaultDict(DefaultDict(0))  # score[team][event]: scores for a given event
medals = DefaultDict(DefaultDict(0))  # medals[team][medalType]: medal count for a team
medalTime = DefaultDict(0)  # time stamp corresponding to the medal tally

rwl = Lock()
idNum = 0

databaseFile = None


class dataUpdate(object):
    def __init__(self):

        self.events = ['skating', 'curling', 'snowboard']  #events list
        self.teams = ['Gauls', 'Romans']  #teams list