def generate_chomsky(times=5, line_length=72):
    """
    Print a wrapped paragraph assembled from the module's phrase tables.

    Each of ``leadins``, ``subjects``, ``verbs`` and ``objects`` is split
    into whitespace-stripped lines.  The lists are then read in lockstep:
    row ``i`` contributes the ``i``-th phrase of every list, in that order.

    :param times: maximum number of rows to emit.
    :type times: int
    :param line_length: wrap width handed to ``textwrap.fill``.
    :type line_length: int
    """
    parts = []
    for part in (leadins, subjects, verbs, objects):
        phraselist = list(map(str.strip, part.splitlines()))
        # Collect the list: if ``parts`` stays empty, ``izip(*parts)``
        # yields nothing and only an empty line is printed.
        parts.append(phraselist)
    output = chain.from_iterable(islice(izip(*parts), 0, times))
    print(textwrap.fill(" ".join(output), line_length))
 def accuracy(self, feature_sets):
     """
     Return the fraction of predictions that equal the expected values.

     ``feature_sets`` is unzipped into two parallel sequences: the first
     is passed to ``self.classify_many`` and the second holds the values
     the predictions are compared against.  Both lengths are printed.

     :param feature_sets: iterable of two-item entries
     :return: number of matching predictions divided by ``len(y)``
     :rtype: float
     """
     X, y = list(compat.izip(*feature_sets))
     predicted = self.classify_many(X)
     print('Y LENGTH: ' + str(len(y)))
     print('CLASSIFICATIONS LENGTH: ' + str(len(predicted)))
     hits = 0
     for index, label in enumerate(predicted):
         if label == y[index]:
             hits += 1
     return float(hits) / len(y)
    def train(self, feature_sets):
        """
        Vectorize the training pairs, print the initial cost, and run
        gradient descent.

        ``feature_sets`` is unzipped into two parallel sequences; the first
        is transformed by ``self.vectorizer`` and the second by
        ``self.encoder``.  If ``self.thetas`` has not been set yet it is
        initialised to a random column with one entry per column of the
        transformed matrix.

        :param feature_sets: iterable of two-item entries
        """
        X, y = list(compat.izip(*feature_sets))
        X = self.vectorizer.fit_transform(X)
        y = self.encoder.fit_transform(y)

        # Identity test on purpose: ``== None`` on a matrix-like ``thetas``
        # is an equality comparison, not an "is it unset?" check.
        if self.thetas is None:
            # Arguments are (rows, columns, density, format); presumably
            # ``sparse`` is ``scipy.sparse`` - confirm against the imports.
            self.thetas = sparse.rand(X.shape[1], 1, 1, 'csr')

        cost = self.compute_cost(X, y)
        print(cost)

        self.gradient_descent(X, y)
    def test(self, test_sequence, verbose=False, **kwargs):
        """
        Tests the HiddenMarkovModelTagger instance.

        Tags the words of every test sentence with ``self._tag``,
        optionally prints a per-sentence report, and finally prints the
        token-level accuracy of the predicted tags against the reference
        tags.

        :param test_sequence: a sequence of labeled test instances
        :type test_sequence: list(list)
        :param verbose: boolean flag indicating whether testing should be
            verbose or include printed output
        :type verbose: bool
        :param kwargs: accepted but not used in this method
        """

        def words(sent):
            return [word for (word, tag) in sent]

        def tags(sent):
            return [tag for (word, tag) in sent]

        def flatten(seq):
            return list(itertools.chain.from_iterable(seq))

        test_sequence = self._transform(test_sequence)
        predicted_sequence = list(imap(self._tag, imap(words, test_sequence)))

        if verbose:
            for test_sent, predicted_sent in izip(test_sequence,
                                                  predicted_sequence):
                print('Test:',
                      ' '.join('%s/%s' % (token, tag)
                               for (token, tag) in test_sent))
                print()
                print('Untagged:',
                      ' '.join("%s" % token for (token, tag) in test_sent))
                print()
                print('HMM-tagged:',
                      ' '.join('%s/%s' % (token, tag)
                               for (token, tag) in predicted_sent))
                print()
                # Entropy is computed over the tokens with their tags
                # replaced by None.
                print('Entropy:',
                      self.entropy([(token, None) for
                                    (token, tag) in predicted_sent]))
                print()
                print('-' * 60)

        test_tags = flatten(imap(tags, test_sequence))
        predicted_tags = flatten(imap(tags, predicted_sequence))

        acc = accuracy(test_tags, predicted_tags)
        count = sum(len(sent) for sent in test_sequence)
        print('accuracy over %d tokens: %.2f' % (count, acc * 100))
    def train(self, labeled_featuresets):
        """
        Train (fit) the scikit-learn estimator.

        The feature sets are transformed by ``self._vectorizer`` and the
        labels by ``self._encoder`` before ``self._clf`` is fitted on them.

        :param labeled_featuresets: A list of ``(featureset, label)``
            where each ``featureset`` is a dict mapping strings to either
            numbers, booleans or strings.
        :return: this instance
        """

        X, y = list(compat.izip(*labeled_featuresets))
        X = self._vectorizer.fit_transform(X)
        y = self._encoder.fit_transform(y)
        self._clf.fit(X, y)

        return self
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal.  In particular, return the fraction of indices
    ``0<=i<len(test)`` such that ``test[i] == reference[i]``.

    :type reference: list
    :param reference: An ordered list of reference values.
    :type test: list
    :param test: A list of values to compare against the corresponding
        reference values.
    :rtype: float
    :raise ValueError: If ``reference`` and ``test`` do not have the
        same length.
    :raise ZeroDivisionError: If both lists are empty.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    matches = sum(x == y for x, y in izip(reference, test))
    # float() keeps the quotient fractional even where ``/`` is integer
    # division (Python 2 without ``from __future__ import division``).
    return float(matches) / len(test)
def are_files_identical(filename1, filename2, debug=False):
    """
    Compare two files, ignoring carriage returns.

    Both files are read in binary mode and their lines are sorted, so the
    comparison is insensitive to line order.  Lines are compared after
    stripping surrounding whitespace, which is what makes ``\\r\\n`` and
    ``\\n`` endings equivalent.

    :param filename1: path of the first (output) file.
    :param filename2: path of the second (reference) file.
    :param debug: when true, print a description of the first difference.
    :type debug: bool
    :return: True if the files hold the same lines, False otherwise.
    :rtype: bool
    """
    with open(filename1, "rb") as fileA:
        with open(filename2, "rb") as fileB:
            linesA = sorted(fileA.readlines())
            linesB = sorted(fileB.readlines())

    for lineA, lineB in izip(linesA, linesB):
        if lineA.strip() != lineB.strip():
            if debug:
                print("Error while comparing files. " +
                      "First difference at line below.")
                print("=> Output file line: {0}".format(lineA))
                print("=> Refer. file line: {0}".format(lineB))
            # Stop at the first mismatch, as the debug message promises.
            return False

    # izip stops at the shorter list, so surplus lines in either file
    # must be detected separately.
    if len(linesA) != len(linesB):
        if debug:
            print("Error while comparing files. " +
                  "Files have a different number of lines.")
            print("=> Output file lines: {0}".format(len(linesA)))
            print("=> Refer. file lines: {0}".format(len(linesB)))
        return False
    return True
def log_likelihood(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    probability distributions, return the average log likelihood of
    the reference values, given the probability distributions.

    :param reference: A list of reference values
    :type reference: list
    :param test: A list of probability distributions over values to
        compare against the corresponding reference values.
    :type test: list(ProbDistI)
    :raise ValueError: If ``reference`` and ``test`` do not have the
        same length.
    :raise ZeroDivisionError: If both lists are empty.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    # Return the average value of dist.logprob(val).
    total_likelihood = sum(
        dist.logprob(val) for (val, dist) in izip(reference, test)
    )
    return total_likelihood / len(reference)
	def train(self, labeled_featuresets):
            X, y = list(izip(*labeled_featuresets))
            X = self._vectorizer.fit_transform(X)
            y = self._encoder.fit_transform(y)
            row, column = X.shape
            print 'row:'+str(row)
            print 'column:'+str(column)
            print y.shape
            print X
            print y
            with open('matrix', 'w') as f:
                for i in xrange(0, row):
                    f.write(str(y[i]) + ' ')         
                    for j in xrange(0, column):
                        f.write(str(int(X[i,j])) + ' ')


            self._clf.fit(X, y)
            return self
 def _tag(self, unlabeled_sequence):
     """
     Pair each item of ``unlabeled_sequence`` with the entry at the same
     position in the result of ``self._best_path(unlabeled_sequence)``.

     :return: list of ``(item, path_entry)`` tuples
     """
     best_path = self._best_path(unlabeled_sequence)
     return [(symbol, state)
             for symbol, state in izip(unlabeled_sequence, best_path)]
def matrix_dimension(feature_set):
    """
    Return the shape of the matrix obtained by vectorizing ``feature_set``.

    ``feature_set`` is unzipped into two parallel sequences; only the first
    is vectorized, the second is ignored.

    :param feature_set: iterable of two-item entries whose first items are
        accepted by ``DictVectorizer.fit_transform``.
    :return: the ``shape`` of the vectorized matrix
    :rtype: tuple
    """
    vectorizer = DictVectorizer(dtype=float, sparse=True)
    X, _ = list(compat.izip(*feature_set))
    # Read the shape straight off the sparse result; densifying it with
    # ``toarray()`` would allocate rows * columns floats for nothing.
    return vectorizer.fit_transform(X).shape
    # NOTE(review): this span has no enclosing ``def`` in view; it relies on
    # ``train``, ``dev`` and ``unigram_affix_backoff`` being defined elsewhere.
    # Build an affix tagger and a unigram tagger from ``train`` and combine
    # them in a voting tagger.
    affix_tagger = EntropyAffixTagger(train)
    unigram_tagger = EntropyUnigramTagger(train)
    taggers = [unigram_tagger, affix_tagger]
    tagger = EntropyVotingTagger(taggers, max_entropy=80)

    from nltk.tag import untag

    # Strip the tags from ``dev`` and tag the bare sentences with both the
    # backoff tagger and the voting tagger.
    untagged_test = [untag(x) for x in dev]
    tagged_sents_uni_affix = unigram_affix_backoff.tag_sents(untagged_test)
    tagged_sents_entr = tagger.tag_sents(untagged_test)
    # NOTE(review): ``affix_mistake`` and ``unigram_mistake`` are never
    # updated in the lines visible here.
    affix_mistake = 0
    unigram_mistake = 0
    overall_mistakes = 0
    print "len of dev: ", len(dev)
    # NOTE(review): the ``izip(`` call on the next line is cut off in this
    # view - its remaining arguments and the closing ``):`` are missing.
    for tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent in izip(dev, tagged_sents_uni_affix,
        # import pdb;pdb.set_trace()
        # NOTE(review): this ``izip(`` call is truncated in the same way.
        for tagged_reference, tagged_uni_affix, tagged_entropy in izip(tagged_reference_sent, tagged_uni_affix_sent,
            # Counts tokens where the two taggers disagree with each other;
            # the reference tag is printed but not part of the comparison.
            if tagged_uni_affix[1] != tagged_entropy[1]:
                overall_mistakes += 1

                print "WE GOT MATCH!"
                print "Word = ", tagged_reference[0]
                print "real tag ", tagged_reference[1]
                print "backoff tag ", tagged_uni_affix[1]
                print "entropy tag ", tagged_entropy[1]
                # Report each component tagger's entropy for the word.
                for t in tagger._taggers:
                    # import pdb
                    # pdb.set_trace()
                    print "Entropy for tagger ", t.__class__.__name__, " ", t.entropy(tagged_reference[0])