import random
import textwrap
from itertools import chain, islice, izip


def generate_chomsky(times=5, line_length=72):
    # leadins, subjects, verbs and objects are assumed to be module-level
    # strings holding one phrase per line.
    parts = []
    for part in (leadins, subjects, verbs, objects):
        phraselist = list(map(str.strip, part.splitlines()))
        random.shuffle(phraselist)
        parts.append(phraselist)
    # izip(*parts) yields one (leadin, subject, verb, object) tuple per
    # round; islice keeps the first `times` tuples and chain flattens them.
    output = chain(*islice(izip(*parts), 0, times))
    print(textwrap.fill(" ".join(output), line_length))
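# Illustrative sketch (not from the original source): the
# chain(*islice(izip(*parts), ...)) idiom above interleaves one item
# from each list per round, shown here with toy lists under Python 2.
from itertools import chain, islice, izip

parts = [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3']]
# izip(*parts) -> ('a1', 'b1', 'c1'), ('a2', 'b2', 'c2'), ...
print(list(chain(*islice(izip(*parts), 0, 2))))
# ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']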
# Classifier-wrapper method; assumes `from nltk import compat`.
def accuracy(self, feature_sets):
    # Unzip (features, label) pairs into parallel X and y sequences.
    X, y = list(compat.izip(*feature_sets))
    classifications = self.classify_many(X)
    count = 0
    print 'Y LENGTH: ' + str(len(y))
    print 'CLASSIFICATIONS LENGTH: ' + str(len(classifications))
    for i in range(len(classifications)):
        if classifications[i] == y[i]:
            count += 1
    accuracy = (count * 1.0) / len(y)
    return accuracy
# Learner method; assumes `from nltk import compat` and
# `from scipy import sparse`.
def train(self, feature_sets):
    X, y = list(compat.izip(*feature_sets))
    X = self.vectorizer.fit_transform(X)
    y = self.encoder.fit_transform(y)
    # Initialise the parameter vector randomly on the first call.
    if self.thetas is None:
        self.thetas = sparse.rand(X.shape[1], 1, density=1, format='csr')
    cost = self.compute_cost(X, y)
    print cost
    self.gradient_descent(X, y)
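# compute_cost() and gradient_descent() are not shown above. A minimal
# standalone sketch (an assumption, using dense NumPy arrays and the
# standard binary logistic-regression cost) might look like this:
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def compute_cost(thetas, X, y):
    # Mean cross-entropy loss of the current parameters.
    h = sigmoid(X.dot(thetas))
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

def gradient_step(thetas, X, y, alpha=0.1):
    # One batch gradient-descent update.
    h = sigmoid(X.dot(thetas))
    grad = X.T.dot(h - y) / X.shape[0]
    return thetas - alpha * grad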
# Method of HiddenMarkovModelTagger; assumes `import itertools`,
# `from itertools import imap, izip`, an `accuracy()` helper such as
# nltk.metrics.accuracy, and `from __future__ import print_function`.
def test(self, test_sequence, verbose=False, **kwargs):
    """
    Tests the HiddenMarkovModelTagger instance.

    :param test_sequence: a sequence of labeled test instances
    :type test_sequence: list(list)
    :param verbose: boolean flag indicating whether testing should be
        verbose or include printed output
    :type verbose: bool
    """
    def words(sent):
        return [word for (word, tag) in sent]

    def tags(sent):
        return [tag for (word, tag) in sent]

    def flatten(seq):
        return list(itertools.chain(*seq))

    test_sequence = self._transform(test_sequence)
    predicted_sequence = list(imap(self._tag, imap(words, test_sequence)))

    if verbose:
        for test_sent, predicted_sent in izip(test_sequence,
                                              predicted_sequence):
            print('Test:',
                  ' '.join('%s/%s' % (token, tag)
                           for (token, tag) in test_sent))
            print()
            print('Untagged:',
                  ' '.join('%s' % token for (token, tag) in test_sent))
            print()
            print('HMM-tagged:',
                  ' '.join('%s/%s' % (token, tag)
                           for (token, tag) in predicted_sent))
            print()
            print('Entropy:',
                  self.entropy([(token, None)
                                for (token, tag) in predicted_sent]))
            print()
            print('-' * 60)

    test_tags = flatten(imap(tags, test_sequence))
    predicted_tags = flatten(imap(tags, predicted_sequence))

    acc = accuracy(test_tags, predicted_tags)
    count = sum(len(sent) for sent in test_sequence)
    print('accuracy over %d tokens: %.2f' % (count, acc * 100))
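# Usage sketch (assumes NLTK and its treebank sample are installed):
# train a supervised HMM tagger, then call test() on held-out sentences.
from nltk.corpus import treebank
from nltk.tag.hmm import HiddenMarkovModelTrainer

train_sents = treebank.tagged_sents()[:300]
test_sents = treebank.tagged_sents()[300:320]
hmm_tagger = HiddenMarkovModelTrainer().train_supervised(train_sents)
hmm_tagger.test(test_sents)  # prints 'accuracy over N tokens: ...'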
# Method of nltk.classify.scikitlearn.SklearnClassifier;
# assumes `from nltk import compat`.
def train(self, labeled_featuresets):
    """
    Train (fit) the scikit-learn estimator.

    :param labeled_featuresets: A list of ``(featureset, label)``
        where each ``featureset`` is a dict mapping strings to either
        numbers, booleans or strings.
    """
    X, y = list(compat.izip(*labeled_featuresets))
    X = self._vectorizer.fit_transform(X)
    y = self._encoder.fit_transform(y)
    self._clf.fit(X, y)
    return self
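# Usage sketch, assuming this is NLTK's SklearnClassifier wrapper
# (toy data; predictions depend on the wrapped estimator):
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

train_data = [({'a': 4, 'b': 1, 'c': 0}, 'ham'),
              ({'a': 5, 'b': 2, 'c': 1}, 'ham'),
              ({'a': 0, 'b': 3, 'c': 4}, 'spam'),
              ({'a': 1, 'b': 4, 'c': 3}, 'spam')]
classifier = SklearnClassifier(BernoulliNB()).train(train_data)
print(classifier.classify_many([{'a': 3, 'b': 2, 'c': 1},
                                {'a': 0, 'b': 3, 'c': 7}]))
# e.g. ['ham', 'spam']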
from itertools import izip


def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal.  In particular, return the fraction of indices
    ``0<=i<len(test)`` such that ``test[i] == reference[i]``.

    :type reference: list
    :param reference: An ordered list of reference values.
    :type test: list
    :param test: A list of values to compare against the corresponding
        reference values.
    :raise ValueError: If ``reference`` and ``test`` do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    # float() guards against integer floor division under Python 2.
    return float(sum(x == y for x, y in izip(reference, test))) / len(test)
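# Quick check of the corrected behaviour: two of three positions match,
# so the result is ~0.67 rather than the 0 that Python 2 floor division
# would give without the float() cast.
print(accuracy(['det', 'noun', 'verb'], ['det', 'noun', 'adj']))
# 0.666666666667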
from itertools import izip


def are_files_identical(filename1, filename2, debug=False):
    """
    Compare two files, ignoring carriage returns.
    """
    with open(filename1, "rb") as fileA:
        with open(filename2, "rb") as fileB:
            result = True
            # Note: izip stops at the shorter file, so trailing extra
            # lines in the longer file are silently ignored.
            for lineA, lineB in izip(sorted(fileA.readlines()),
                                     sorted(fileB.readlines())):
                if lineA.strip() != lineB.strip():
                    if debug:
                        print("Error while comparing files. "
                              "First difference at line below.")
                        print("=> Output file line: {0}".format(lineA))
                        print("=> Refer. file line: {0}".format(lineB))
                    result = False
                    break
            return result
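# Usage sketch (illustrative file names): two files that differ only
# in line endings compare as identical.
with open("a.txt", "wb") as f:
    f.write("one\r\ntwo\r\n")
with open("b.txt", "wb") as f:
    f.write("one\ntwo\n")
print(are_files_identical("a.txt", "b.txt"))  # True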
from itertools import izip


def log_likelihood(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    probability distributions, return the average log likelihood of
    the reference values, given the probability distributions.

    :param reference: A list of reference values
    :type reference: list
    :param test: A list of probability distributions over values to
        compare against the corresponding reference values.
    :type test: list(ProbDistI)
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    # Return the average value of dist.logprob(val).
    total_likelihood = sum(dist.logprob(val)
                           for (val, dist) in izip(reference, test))
    return total_likelihood / len(reference)
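# Usage sketch, assuming NLTK's DictionaryProbDist (whose logprob()
# returns a base-2 log probability):
from nltk.probability import DictionaryProbDist

dists = [DictionaryProbDist({'pos': 0.9, 'neg': 0.1}),
         DictionaryProbDist({'pos': 0.4, 'neg': 0.6})]
print(log_likelihood(['pos', 'neg'], dists))
# (log2(0.9) + log2(0.6)) / 2, roughly -0.44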
# Variant of SklearnClassifier.train() that also dumps the design
# matrix to disk for inspection; assumes `from itertools import izip`.
def train(self, labeled_featuresets):
    X, y = list(izip(*labeled_featuresets))
    X = self._vectorizer.fit_transform(X)
    y = self._encoder.fit_transform(y)
    row, column = X.shape
    print 'row:' + str(row)
    print 'column:' + str(column)
    print y.shape
    print X
    print y
    # Write one "label feature1 feature2 ..." line per instance; the
    # `with` block closes the file, so no explicit close() is needed.
    with open('matrix', 'w') as f:
        for i in xrange(row):
            f.write(str(y[i]) + ' ')
            for j in xrange(column):
                f.write(str(int(X[i, j])) + ' ')
            f.write('\n')
    self._clf.fit(X, y)
    return self
# Method of HiddenMarkovModelTagger; pairs each token with the tag
# chosen by Viterbi decoding. Assumes `from itertools import izip`.
def _tag(self, unlabeled_sequence):
    path = self._best_path(unlabeled_sequence)
    return list(izip(unlabeled_sequence, path))
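# The pairing step itself is just izip over the tokens and the decoded
# tag path (toy values shown):
from itertools import izip

tokens = ['the', 'cat', 'sleeps']
path = ['DT', 'NN', 'VBZ']
print(list(izip(tokens, path)))
# [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]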
from nltk import compat
from sklearn.feature_extraction import DictVectorizer


def matrix_dimension(feature_set):
    vectorizer = DictVectorizer(dtype=float, sparse=True)
    X, _ = list(compat.izip(*feature_set))
    X = vectorizer.fit_transform(X)
    # A sparse matrix knows its own shape; converting with toarray()
    # first would densify the whole matrix for no benefit.
    return X.shape
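# Usage sketch (toy feature set): DictVectorizer one-hot encodes the
# string-valued 'word' feature and keeps the numeric 'len' feature as
# a single column, giving three columns for two instances.
feature_set = [({'word': 'the', 'len': 3}, 'DT'),
               ({'word': 'cat', 'len': 3}, 'NN')]
print(matrix_dimension(feature_set))  # (2, 3)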
# Driver script comparing a backoff tagger with an entropy-voting
# tagger; assumes `train` and `dev` are tagged corpora and that
# `unigram_affix_backoff` is built elsewhere.
from itertools import izip
from nltk.tag import untag

affix_tagger = EntropyAffixTagger(train)
unigram_tagger = EntropyUnigramTagger(train)
taggers = [unigram_tagger, affix_tagger]
tagger = EntropyVotingTagger(taggers, max_entropy=80)

untagged_test = [untag(x) for x in dev]
tagged_sents_uni_affix = unigram_affix_backoff.tag_sents(untagged_test)
tagged_sents_entr = tagger.tag_sents(untagged_test)

affix_mistake = 0
unigram_mistake = 0
overall_mistakes = 0
print "len of dev: ", len(dev)
for tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent in izip(
        dev, tagged_sents_uni_affix, tagged_sents_entr):
    for tagged_reference, tagged_uni_affix, tagged_entropy in izip(
            tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent):
        # Report each token on which the two taggers disagree.
        if tagged_uni_affix[1] != tagged_entropy[1]:
            overall_mistakes += 1
            print "WE GOT MATCH!"
            print "Word = ", tagged_reference[0]
            print "real tag ", tagged_reference[1]
            print "backoff tag ", tagged_uni_affix[1]
            print "entropy tag ", tagged_entropy[1]
            for t in tagger._taggers:
                print "Entropy for tagger ", t.__class__.__name__, " ", \
                    t.entropy(tagged_reference[0])