Example #1
    def _train(cls, labeled_sequence, test_sequence=None,
                    unlabeled_sequence=None, transform=_identity,
                    estimator=None, **kwargs):

        if estimator is None:
            def estimator(fd, bins):
                return LidstoneProbDist(fd, 0.1, bins)

        labeled_sequence = LazyMap(transform, labeled_sequence)
        symbols = unique_list(word for sent in labeled_sequence
            for word, tag in sent)
        tag_set = unique_list(tag for sent in labeled_sequence
            for word, tag in sent)

        trainer = HiddenMarkovModelTrainer(tag_set, symbols)
        hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
        hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs,
                  hmm._priors, transform=transform)

        if test_sequence:
            hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

        if unlabeled_sequence:
            max_iterations = kwargs.get('max_iterations', 5)
            hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm,
                max_iterations=max_iterations)
            if test_sequence:
                hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

        return hmm
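
This `_train` is the internal classmethod behind NLTK's `HiddenMarkovModelTagger`; you normally reach it through the public `train` classmethod. A minimal usage sketch, assuming NLTK is installed and the Brown corpus has been downloaded:

import nltk
from nltk.tag.hmm import HiddenMarkovModelTagger

# A small slice of the Brown corpus keeps the demo fast
sents = nltk.corpus.brown.tagged_sents(categories='news')[:200]
train_sents, test_sents = sents[:180], sents[180:]

# The public classmethod delegates to the _train() shown above;
# verbose=True prints accuracy on test_sents via hmm.test()
tagger = HiddenMarkovModelTagger.train(train_sents, test_sequence=test_sents,
                                       verbose=True)
print(tagger.tag('The jury said it was fair'.split()))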
Example #2
File: hmm.py Project: FY-KHM/AI-Project
    def _train(cls, labeled_sequence, test_sequence=None,
                    unlabeled_sequence=None, transform=_identity,
                    estimator=None, **kwargs):

        if estimator is None:
            def estimator(fd, bins):
                return LidstoneProbDist(fd, 0.1, bins)

        labeled_sequence = LazyMap(transform, labeled_sequence)
        symbols = unique_list(word for sent in labeled_sequence
            for word, tag in sent)
        tag_set = unique_list(tag for sent in labeled_sequence
            for word, tag in sent)

        trainer = HiddenMarkovModelTrainer(tag_set, symbols)
        hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
        hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs,
                  hmm._priors, transform=transform)

        if test_sequence:
            hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

        if unlabeled_sequence:
            max_iterations = kwargs.get('max_iterations', 5)
            hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm,
                max_iterations=max_iterations)
            if test_sequence:
                hmm.test(test_sequence, verbose=kwargs.get('verbose', False))

        return hmm
Example #3
def train_unsupervised(labeled_sents, unlabeled_sents, max_iterations=3):
    symbols = unique_list(word for sent in labeled_sents for word, tag in sent)
    # Extend symbols with those in the unlabeled set
    symbols = unique_list(symbols + unique_list(word
                                                for sent in unlabeled_sents
                                                for word in sent))
    tag_set = unique_list(tag for sent in labeled_sents for word, tag in sent)

    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    print("Supervised training for initialization ({} sentences)".format(
        len(labeled_sents)))
    hmm = trainer.train_supervised(
        labeled_sents,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    # The unlabeled sentences are expected to have tags, which are ignored
    unlabeled_sents = [[(word, None) for word in sent]
                       for sent in unlabeled_sents]
    print(
        "Unsupervised training ({} sentences) for up to {} iterations".format(
            len(unlabeled_sents), max_iterations))
    hmm = trainer.train_unsupervised(unlabeled_sents,
                                     model=hmm,
                                     max_iterations=max_iterations,
                                     verbose=True)
    return hmm
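
A quick smoke test for the helper above, using made-up toy sentences (illustrative only; real use needs a substantial corpus, and the helper's own imports of unique_list, HiddenMarkovModelTrainer, and LidstoneProbDist must be in scope):

labeled = [[('the', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')],
           [('a', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')]]
unlabeled = [['the', 'cat', 'barks'],
             ['a', 'dog', 'sleeps']]

model = train_unsupervised(labeled, unlabeled, max_iterations=2)
print(model.tag(['the', 'dog', 'sleeps']))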
Example #4
File: hmm.py Project: haadkhan/cerebri
 def __init__(self, symbols, states, transitions, outputs, priors, transform=_identity):
     self._symbols = unique_list(symbols)
     self._states = unique_list(states)
     self._transitions = transitions
     self._outputs = outputs
     self._priors = priors
     self._cache = None
     self._transform = transform
Example #5
 def __init__(self, symbols, states, transitions, outputs, priors,
              transform=_identity):
     self._symbols = unique_list(symbols)
     self._states = unique_list(states)
     self._transitions = transitions
     self._outputs = outputs
     self._priors = priors
     self._cache = None
     self._transform = transform
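
All of these examples lean on `unique_list` from `nltk.util`, which deduplicates while preserving first-seen order; a plain `set` would scramble the state/symbol ordering. A functionally equivalent sketch (not NLTK's exact source):

def unique_list(xs):
    """Return the unique items of xs, keeping first-seen order."""
    seen = set()
    return [x for x in xs if not (x in seen or seen.add(x))]

print(unique_list('abacab'))  # ['a', 'b', 'c']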
Example #6
File: hmm.py Project: dario-wat/hmm-nlg
def hmmFactory(trainType, string):
	"""
	Creates an HMM according to its training type and the given corpus.

	:type trainType: str
	:type string: str
	:rtype: HmmWrapper
	"""

	if trainType == 'super':			# supervised
		
		logging.info('Supervised')
		
		logging.info('Filtering corpus...')		
		corpus = tag(string)
		states = unique_list(tag for sent in corpus for (_,tag) in sent)
		symbols = unique_list(word for sent in corpus for (word, _) in sent)
		
		logging.info('Training hmm...')
		trainer = HmmWrapper(states, symbols)
		trainer.trainSupervised(corpus)

		return trainer

	elif trainType == 'unsuper':		# unsupervised
		
		logging.info('Unsupervised')

		logging.info('Filtering corpus')
		corpus = tagEmpty(string)
		states = range(5)
		symbols = unique_list(word for sent in corpus for (word, _) in sent)
		
		logging.info('Training hmm...')
		trainer = HmmWrapper(states, symbols)
		trainer.trainUnsupervised(corpus)
		
		return trainer

	else:							# chunked supervised
		
		logging.info('Chunked supervised')

		logging.info('Training chunk parser...')
		chunker = ChunkWrapper()
		
		logging.info('Chunking corpus...')
		corpus = tagChunk(string, chunker)
		states = unique_list(tag for sent in corpus for (_, tag) in sent)
		symbols = unique_list(word for sent in corpus for (word, _) in sent)
		
		logging.info('Training hmm...')
		trainer = HmmWrapper(states, symbols)
		trainer.trainSupervised(corpus)

		return trainer
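
`HmmWrapper`, `ChunkWrapper`, and the `tag`/`tagEmpty`/`tagChunk` helpers are internal to the hmm-nlg project, so the factory is not runnable standalone. Assuming those are importable, usage would look like this (the corpus file name is hypothetical):

with open('corpus.txt') as f:
    model = hmmFactory('super', f.read())  # or 'unsuper'; anything else selects chunked mode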
Example #7
File: views.py Project: bolvano/hmm-ap
def index(request):
    if request.method == "POST":
        if request.POST.get("tokens"):
            with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)

            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item,request.POST.get("token_"+str(i))))
                i += 1
            if tagged not in corpus:
                corpus.append(tagged)
                with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
                symbols = unique_list(word for sent in corpus for (word,tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)

            return render(request, 'tagger/index.html', {'corpus': corpus})

        else:
            if request.POST.get("random") == 'true':
                address = get_random_address()
                if not address:
                    return render(request, 'tagger/index.html', {'error_message': 'No random addresses left'})

            else:
                address = request.POST.get("address")

            tokens = regexp_tokenize(address, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')

            if tokens:
                pkl_file = open(settings.BASE_DIR+"/data/hmm.pkl", 'rb')
                hmm = pickle.load(pkl_file)
                pkl_file.close()

                tagged = hmm.tag(tokens)

                tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
                reader = codecs.getreader("utf-8")
                tags = json.load(reader(tags_file))
                tags_file.close()

                return render(request, 'tagger/index.html', {'address': address,
                                                              'tokens': tokens,
                                                              'tagged': tagged,
                                                              'tags': sorted(tags.items(), key=operator.itemgetter(1)) })

    return render(request, 'tagger/index.html', {})
Example #8
def bigramTags(a):
    #create list of all tags
    tags = [x[1] for x in a]
    #create list of tag bigrams
    btags = [(tags[i], tags[i + 1]) for i in range(len(tags) - 1)]
    #create frequency distribution of bigram tags
    btagsf = FreqDist(btags)
    #create list of unique bigram tags
    btagscombo = [(x, y) for x in unique_list(tags) for y in unique_list(tags)]
    out = []
    #loop through unique bigram tags
    for i in range(len(btagscombo)):
        #add bigram tag with frequency probability to list
        out.append((btagscombo[i], btagsf.freq(btagscombo[i])))
    return out
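
A worked example on a tiny tagged sentence, assuming `FreqDist` (from `nltk`) and `unique_list` (from `nltk.util`) are imported:

sent = [('the', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')]
for pair, p in bigramTags(sent):
    print(pair, p)
# ('DET', 'NOUN') and ('NOUN', 'VERB') each come out at 0.5;
# the other seven DET/NOUN/VERB combinations come out at 0.0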
Example #9
def unigramtrainingset(a):
    # create frequency distribution of word, tag pairs in the training set
    fd = FreqDist(a)
    # separate words from tags
    x = [y[0] for y in a]
    # create frequency distribution of words in the training set
    fd2 = FreqDist(x)
    # create list of unique words
    words = unique_list([x[0] for x in fd])
    # create list of unique tags (all possible tags)
    tags = [
        'NOUN', 'ADP', 'ADV', 'NUM', 'VERB', '.', 'PRON', 'DET', 'ADJ', 'PRT',
        'CONJ'
    ]
    # initialise output list
    out = []

    # loop through each unique word
    for word in words:
        # reinitialise tagso list
        tagso = []
        # store frequency of current word
        denom = fd2.freq(word)
        # loop through each tag
        for tag in tags:
            # compute probability of current tag being paired with current word
            prob = fd.freq((word, tag)) / denom
            # create list of tag, probability pairs
            tagso.append((tag, prob))
        # append word, tag-probabilities to out list
        out.append((word, tagso))
    return out
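
Same idea, a tiny worked example (again assuming FreqDist and unique_list are imported):

train = [('the', 'DET'), ('dog', 'NOUN'), ('the', 'DET')]
for word, dist in unigramtrainingset(train):
    print(word, [(t, p) for t, p in dist if p > 0])
# the -> [('DET', 1.0)]
# dog -> [('NOUN', 1.0)]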
Example #10
def train_hmm_word(sent, train_data):
  symbols = unique_list(word for sent in train_data
              for word, tag in sent)

  # List of tags: gives back ["B", "I", "O"]
  tag_set = unique_list(tag for sent in train_data
              for word, tag in sent)

  # Pass symbols as well; they were computed above but otherwise unused
  trainer = hmm.HiddenMarkovModelTrainer(tag_set, symbols)
  tagger = trainer.train_supervised(train_data)

  sent_lst = sent.split()
  test_tag = []
  # Tag the sentence in windows of five tokens at a time
  # (slicing past the end of the list is safe, so no bounds check is needed)
  for i in range(0, len(sent_lst), 5):
    test_tag += tagger.tag(sent_lst[i:i + 5])

  # print(test_tag)
  return test_tag
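
Tagging in fixed five-token windows bounds each Viterbi pass, but it also resets the tag context at every window boundary, so entity spans can be cut mid-phrase. A hypothetical call, assuming `from nltk.tag import hmm` and `unique_list` are in scope and BIO-style training data:

train_data = [[('New', 'B'), ('York', 'I'), ('is', 'O'), ('big', 'O')],
              [('I', 'O'), ('love', 'O'), ('San', 'B'), ('Francisco', 'I')]]
print(train_hmm_word('San Francisco is big', train_data))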
Example #11
File: views.py Project: bolvano/hmm-ap
def corpus(request):
    with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
        corpus = pickle.load(handle)

    if request.method == "POST":
        if request.POST.get("tokens") and request.POST.get("true_index"):
            true_index = int(request.POST.get("true_index"))
            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item,request.POST.get("token_"+str(i))))
                i += 1
            del corpus[true_index]
            corpus.append(tagged)
            with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                pickle.dump(corpus, handle)
            tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
            symbols = unique_list(word for sent in corpus for (word,tag) in sent)
            trainer = HiddenMarkovModelTrainer(tag_set, symbols)
            hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
            with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                pickle.dump(hmm, handle)
            return render(request, 'tagger/index.html', {'corpus': corpus})
        elif request.POST.get("index"):

            tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
            reader = codecs.getreader("utf-8")
            tags = json.load(reader(tags_file))
            true_index = len(corpus) - int(request.POST.get("index"))
            corpus_item = corpus[true_index]
            return render(request, 'tagger/corpus.html', {
                'tokens' : [t[0] for t in corpus_item],
                'corpus_item': corpus_item,
                'true_index': true_index,
                'tags': sorted(tags.items(), key=operator.itemgetter(1))})
    else:
        return render(request, 'tagger/corpus.html', {'corpus': corpus})
Example #12
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        def lidstone_est(f):
            return nltk.probability.LidstoneProbDist(f, 0.01, f.B() + 1)

        # Lowercase each observation, otherwise it will not match the test data
        # Do NOT add <s> or </s> to the input sentences
        data = [(tag, word.lower())
                for (word,
                     tag) in list(itertools.chain.from_iterable(train_data))]
        emission_FD = nltk.probability.ConditionalFreqDist(data)
        self.emission_PD = nltk.probability.ConditionalProbDist(
            emission_FD, lidstone_est)
        self.states = unique_list(tag for sent in train_data
                                  for (word, tag) in sent)

        return self.emission_PD, self.states
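
Once fitted, the conditional distribution can be queried per state. A small sketch with toy data, where `t` stands for an instance of whatever class hosts `emission_model` (hypothetical name):

train_data = [[('the', 'DET'), ('Dog', 'NOUN'), ('barks', 'VERB')]]
emission_PD, states = t.emission_model(train_data)

# Observations were lowercased during training, so query 'dog', not 'Dog'
print(emission_PD['NOUN'].prob('dog'))
print(states)  # ['DET', 'NOUN', 'VERB']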
Example #13
import nltk
cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
print(len(cor))
from nltk.util import unique_list
tag_set = unique_list(tag for sent in cor for (word, tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in cor for (word, tag) in sent)
print(len(symbols))

trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
print(tag_set)
print(symbols)
train_corpus = []
test_corpus = []
for i in range(len(cor)):
    if i % 10:
        train_corpus += [cor[i]]
    else:
        test_corpus += [cor[i]]
print(len(train_corpus))
print(len(test_corpus))
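
The snippet stops after the 90/10 split; a natural continuation trains on train_corpus and scores on test_corpus:

from nltk.probability import LidstoneProbDist

hmm_tagger = trainer.train_supervised(
    train_corpus,
    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
# Older NLTK releases call this method evaluate() instead of accuracy()
print('accuracy: {:.2%}'.format(hmm_tagger.accuracy(test_corpus)))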
Example #14
File: views.py Project: bolvano/hmm-ap
def test(request):

    if request.method == "POST":
        session_id = request.session['session_id']
        order = request.session['order']

        from_corpus = request.session['from_corpus']
        from_raw = request.session['from_raw']

        tagged_valid = True
        i = 1
        for item in from_corpus:
            if order == 1:
                tag = request.POST.get("token_first_"+str(i))
            else:
                tag = request.POST.get("token_second_"+str(i))
            if tag != from_corpus[i-1][1]:
                tagged_valid = False
            i += 1

        if tagged_valid:

            with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)


            new_tagged = []
            i = 1
            for item in from_raw:
                if order == 1:
                    new_tagged.append((item[0],request.POST.get("token_second_"+str(i))))
                else:
                    new_tagged.append((item[0],request.POST.get("token_first_"+str(i))))
                i += 1
            if new_tagged not in corpus:
                corpus.append(new_tagged)
                with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
                symbols = unique_list(word for sent in corpus for (word,tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)

            return render(request, 'tagger/index.html', {'corpus': corpus})


    request.session['order'] = order = int(random.getrandbits(1))

    with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
        corpus = pickle.load(handle)
        from_corpus = corpus.pop(random.randrange(len(corpus)))

    from_corpus_str = ' '.join([t[0] for t in from_corpus])
    from_corpus_str = from_corpus_str.replace(' , ',', ')

    from_raw_str = get_random_address()
    if not from_raw_str:
        # corpus.pop() returns a list of (word, tag) pairs; rejoin the words into a raw string
        from_raw_str = ' '.join(t[0] for t in corpus.pop(random.randrange(len(corpus))))

    pkl_file = open(settings.BASE_DIR+"/data/hmm.pkl", 'rb')
    hmm = pickle.load(pkl_file)
    pkl_file.close()

    tokens = regexp_tokenize(from_raw_str, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')

    from_raw = hmm.tag(tokens)

    tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
    reader = codecs.getreader("utf-8")
    tags = json.load(reader(tags_file))
    tags_file.close()

    session_id = uuid.uuid1()
    request.session['session_id'] = str(session_id)

    request.session['from_corpus'] = from_corpus
    request.session['from_raw'] = from_raw

    from_corpus_distorted = distort(from_corpus)

    data = [from_corpus_distorted, from_corpus_str, from_raw, from_raw_str]

    return render(request, 'tagger/test.html', {
        'data': data,
        'session_id': session_id,
        'order': order,
        'tags': sorted(tags.items(), key=operator.itemgetter(1))})
Example #15
import nltk
cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
print(len(cor))
from nltk.util import unique_list
tag_set = unique_list(tag for sent in cor for (word,tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in cor for (word,tag) in sent)
print(len(symbols))
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
train_corpus = []
test_corpus = []
for i in range(len(cor)):
    if i % 10:
        train_corpus += [cor[i]]
    else:
        test_corpus += [cor[i]]
print(len(train_corpus))
print(len(test_corpus))