Пример #1
0
def collect_counts(data, lex):
    """Collect tag-pair and tag-word co-occurrence counts for one sentence.

    Args:
        data: sequence of (word, tag) pairs (one sentence).
        lex: dict with "words" and "tags" sub-dicts mapping strings to ids.

    Returns:
        dict with per-offset tag-pair Counters ("tag"), per-window-position
        tag-word Counters ("word"), and the sentence length ("total").

    NOTE(review): L and M are assumed to be module-level constants (number
    of tag-pair offsets and word-window width) -- confirm at module scope.
    """
    tag_pair_counts = [Counter() for _ in range(L)]
    tag_word_pair_counts = [Counter() for _ in range(M)]

    sent = data

    def get_word(pos):
        # Out-of-range positions map to the sentence-boundary word token.
        if pos < 0 or pos >= len(sent):
            word = "<S>"
        else:
            word = sent[pos][0]
        return lex["words"][word]

    def get_tag(pos):
        # Out-of-range positions map to the sentence-boundary tag token.
        if pos < 0 or pos >= len(sent):
            tag = "<T>"
        else:
            tag = sent[pos][1]
        return lex["tags"][tag]

    for i in range(len(sent)):
        for l in range(L):
            tag_pair_counts[l][get_tag(i - l - 1), get_tag(i)] += 1

        # BUG FIX: floor division keeps `start` an int; under true division
        # the float offset would break sequence indexing in get_word().
        start = i - ((M - 1) // 2)
        for m in range(M):
            tag_word_pair_counts[m][get_tag(i), get_word(start + m)] += 1

    return {
        "tag": tag_pair_counts,
        "word": tag_word_pair_counts,
        "total": len(sent)
    }
Пример #2
0
 def test_copying(self):
     # Counters must be copyable, deepcopyable, picklable (all protocols),
     # and survive a repr/eval round-trip.
     words = Counter('which witch had which witches wrist watch'.split())
     update_test = Counter()
     update_test.update(words)
     duplicates = [
         words.copy(),
         copy.copy(words),
         copy.deepcopy(words),
         pickle.loads(pickle.dumps(words, 0)),
         pickle.loads(pickle.dumps(words, 1)),
         pickle.loads(pickle.dumps(words, 2)),
         pickle.loads(pickle.dumps(words, -1)),
         cPickle.loads(cPickle.dumps(words, 0)),
         cPickle.loads(cPickle.dumps(words, 1)),
         cPickle.loads(cPickle.dumps(words, 2)),
         cPickle.loads(cPickle.dumps(words, -1)),
         eval(repr(words)),
         update_test,
         Counter(words),
     ]
     for i, dup in enumerate(duplicates):
         msg = (i, dup, words)
         self.assertTrue(dup is not words)
         self.assertEqual(dup, words)
         self.assertEqual(len(dup), len(words))
         self.assertEqual(type(dup), type(words))
Пример #3
0
 def test_conversions(self):
     """Verify Counter converts cleanly to set, list, and dict."""
     phrase = 'she sells sea shells by the sea shore'
     counted = Counter(phrase)
     self.assertEqual(sorted(counted.elements()), sorted(phrase))
     self.assertEqual(sorted(counted), sorted(set(phrase)))
     self.assertEqual(dict(counted), dict(counted.items()))
     self.assertEqual(set(counted), set(phrase))
Пример #4
0
    def results(self):
        """Build the per-host rate results dict, or an error dict.

        Returns {"error": ...} when processing failed or the job is too
        short; otherwise timestamps plus per-host (and per-device) rate
        series, summarised to min/max/med hosts when there are more than
        64 hosts.
        """
        if self._error is not None:
            return {"error": self._error}

        values = self._data.get()

        # Need at least 3 samples for meaningful rates (diff loses one).
        if len(values[0, :, 0]) < 3:
            return {"error": ProcessingError.JOB_TOO_SHORT}

        # Rate = d(counter)/d(time) per host/timestep.
        rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0])

        if len(self._hostdata) > 64:
            # Too many hosts to ship everything: keep only the hosts that
            # are min, max or median at some timestep.
            sortarr = numpy.argsort(rates.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], rates),
                "max": self.collatedata(sortarr[:, -1], rates),
                # BUG FIX: floor division -- a float index breaks numpy
                # integer indexing.
                "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2],
                                        rates),
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] // 2])
            includelist = uniqhosts.keys()
        else:
            # Few hosts: save data for all of them.
            retdata = {
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            hostkey = str(hostidx)
            retdata['hosts'][hostkey] = {}
            retdata['hosts'][hostkey]['all'] = rates[hostidx, :].tolist()
            retdata['hosts'][hostkey]['dev'] = {}

            # Iterating the dict directly replaces Python-2-only iterkeys();
            # int() replaces numpy.int (removed in modern numpy).
            for devid in self._hostdevnames[hostidx]:
                dpnts = len(values[hostidx, :, 0])
                devrates = (numpy.diff(self._hostdata[hostidx][:dpnts, int(devid)]) /
                            numpy.diff(values[hostidx, :, 0]))
                retdata['hosts'][hostkey]['dev'][devid] = devrates.tolist()

            retdata['hosts'][hostkey]['names'] = self._hostdevnames[hostidx]

        return retdata
Пример #5
0
 def test_subtract(self):
     """Counter.subtract with kwargs, another Counter, and an iterable."""
     expected = Counter(a=-6, b=-2, c=8, d=0, e=-5, f=-30, g=40, h=50)
     c = Counter(a=-5, b=0, c=5, d=10, e=15, g=40)
     c.subtract(a=1, b=2, c=-3, d=10, e=20, f=30, h=-50)
     self.assertEqual(c, expected)
     c = Counter(a=-5, b=0, c=5, d=10, e=15, g=40)
     c.subtract(Counter(a=1, b=2, c=-3, d=10, e=20, f=30, h=-50))
     self.assertEqual(c, expected)
     c = Counter('aaabbcd')
     c.subtract('aaaabbcce')
     self.assertEqual(c, Counter(a=-1, b=0, c=-1, d=1, e=-1))
Пример #6
0
    def results(self):
        """Build per-host memory usage results.

        Returns timestamps plus per-host (and per-device) memory series,
        summarised to min/max/med hosts when there are more than 64 hosts.
        """
        values = self._data.get()

        if len(self._hostdata) > 64:
            # Too many hosts to ship everything: keep only the hosts that
            # are min, max or median at some timestep.
            memdata = values[:, :, 1]
            sortarr = numpy.argsort(memdata.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], memdata),
                "max": self.collatedata(sortarr[:, -1], memdata),
                # BUG FIX: floor division -- a float index breaks numpy
                # integer indexing.
                "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2],
                                        memdata),
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] // 2])
            includelist = uniqhosts.keys()
        else:
            # Few hosts: save data for all of them.
            retdata = {
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            hostkey = str(hostidx)
            retdata['hosts'][hostkey] = {}
            retdata['hosts'][hostkey]['all'] = values[hostidx, :, 1].tolist()
            retdata['hosts'][hostkey]['dev'] = {}

            # Iterating the dict directly replaces Python-2-only iterkeys().
            for devid in self._hostdevnames[hostidx]:
                dpnts = len(values[hostidx, :, 0])
                retdata['hosts'][hostkey]['dev'][devid] = \
                    self._hostdata[hostidx][:dpnts, int(devid)].tolist()

            retdata['hosts'][hostkey]['names'] = self._hostdevnames[hostidx]

        return retdata
Пример #7
0
def write_pseudo_ngrams(in_file, out_file, K):
    """Construct and write the counts for the p-ngram model.

    For every token position, collects the symmetric window of K words on
    each side plus the centre word, and counts every suffix of that
    context tuple.

    Args:
        in_file: path to a whitespace-tokenised file, one sentence per line.
        out_file: path where "<context words> <count>" lines are written.
        K: half-window size.
    """
    ngrams = Counter()
    # BUG FIX: the input file was opened without ever being closed; the
    # with-block guarantees both handles are released.
    with open(out_file, 'w') as f, open(in_file) as src:
        for line in src:
            words = line.strip().split()
            # Pad with K start symbols and K+1 end symbols so every real
            # token has a complete window.
            words = ["<s>"] * K + words + ["</s>"] * (K + 1)
            for i in range(K, len(words) - K):
                context = []
                for k in range(1, K + 1):
                    context.append(words[i - k])
                    context.append(words[i + k])
                context.reverse()
                context = context + [words[i]]
                # Count every suffix of the context (pseudo-ngram backoff).
                for j in range(len(context)):
                    ngrams[tuple(context[j:])] += 1
        for gram in ngrams:
            # Portable replacement for the Python-2-only ``print >> f``.
            f.write(" ".join(gram) + " " + str(ngrams[gram]) + "\n")
Пример #8
0
def make_moments(lintext, vocounts, vocab, out_file, K, N):
    """Write pairwise co-occurrence moments for offsets 1..K.

    Args:
        lintext: linearised token list.
        vocounts: vocabulary Counter (kept for interface compatibility;
            not used by this function).
        vocab: dict mapping word -> integer id.
        out_file: destination path.
        K: maximum pair offset.
        N: token count written in the header.
    """
    # paircounts[k-1] counts (word, word-at-offset-k) pairs.
    paircounts = [Counter() for _ in range(K)]
    for i, wrd in enumerate(lintext[:len(lintext) - K]):
        for k in range(1, K + 1):
            paircounts[k - 1][(wrd, lintext[i + k])] += 1

    # Write the moments; str.format + write replaces the Python-2-only
    # ``print >> f`` so the function also runs under Python 3.
    with open(out_file, 'w') as f:
        f.write("{} {}\n".format(N, K + 1))

        for _ in range(K + 1):
            f.write("{}\n".format(len(vocab)))

        for k in range(K):
            f.write("{}\n".format(len(paircounts[k])))
            for (a, b), count in paircounts[k].most_common():
                f.write("{} {} {}\n".format(vocab[a], vocab[b], count))
Пример #9
0
    def get_minmaxmed_data(self, j, indices, settings):
        """Summarise a per-host metric as min/max/median series.

        Args:
            j: job object whose ``hosts`` dict is iterated.
            indices: passed through to getvalues()/getdevicevalues().
            settings: dict with 'metric', 'formula', 'interfaces' and an
                optional 'devicebased' flag.

        Returns:
            dict with "min"/"max"/"med" collated series, "times", and data
            for the extreme/median hosts, or None when any host lacks data.
        """
        d = numpy.zeros((len(j.hosts), len(self.times) - 1))

        # .values() works on both Python 2 and 3 (itervalues() was 2-only).
        for hostidx, host in enumerate(j.hosts.values()):
            data = self.getvalues(settings['metric'], settings['formula'],
                                  settings['interfaces'], j, host, indices)
            if data is None:
                return None
            d[hostidx, :] = data

        sortarr = numpy.argsort(d, axis=0)

        results = {
            "min": self.collatedata(sortarr[0, :], d),
            "max": self.collatedata(sortarr[-1, :], d),
            # BUG FIX: floor division -- a float row index breaks numpy
            # integer indexing.
            "med": self.collatedata(sortarr[sortarr.shape[0] // 2, :], d),
            "times": self.times[1:].tolist(),
            "hosts": {}
        }

        # Ensure the head node (host 0) is always in the include set.
        uniqhosts = Counter([0])
        uniqhosts.update(sortarr[0, :])
        uniqhosts.update(sortarr[-1, :])
        uniqhosts.update(sortarr[sortarr.shape[0] // 2, :])

        for hostidx, host in enumerate(j.hosts.values()):
            # Counter membership is O(1); no need to materialise keys().
            if hostidx not in uniqhosts:
                continue

            results['hosts'][str(hostidx)] = {}
            # ``== True`` kept deliberately: only an explicit True value
            # enables device-based collection.
            if settings.get('devicebased') == True:
                devicedata, devnames = self.getdevicevalues(
                    settings['metric'], settings['formula'],
                    settings['interfaces'], j, host, indices)
                results['hosts'][str(hostidx)]["dev"] = devicedata
                results['hosts'][str(hostidx)]["names"] = devnames

            results['hosts'][str(hostidx)]["all"] = d[hostidx].tolist()

        return results
Пример #10
0
    def results(self):
        """Build per-host memory results, or an error dict.

        Returns {"error": ...} when raw counters are unavailable or the CPU
        set is unknown; otherwise timestamps plus per-host memory series,
        summarised to min/max/med hosts when there are more than 64 hosts.
        """
        if len(self._hostdata) != self._job.nodecount:
            return {'error': ProcessingError.RAW_COUNTER_UNAVAILABLE}

        # .values() works on both Python 2 and 3 (itervalues() was 2-only).
        for hcount in self._hostcounts.values():
            if hcount['missing'] > hcount['present']:
                return {'error': ProcessingError.CPUSET_UNKNOWN}

        values = self._data.get()

        if len(self._hostdata) > 64:
            # Too many hosts to ship everything: keep only the hosts that
            # are min, max or median at some timestep.
            memdata = values[:, :, 1]
            sortarr = numpy.argsort(memdata.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], memdata),
                "max": self.collatedata(sortarr[:, -1], memdata),
                # BUG FIX: floor division -- a float index breaks numpy
                # integer indexing.
                "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2],
                                        memdata),
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] // 2])
            includelist = uniqhosts.keys()
        else:
            # Few hosts: save data for all of them.
            retdata = {"times": values[0, :, 0].tolist(), "hosts": {}}
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :,
                                                           1].tolist()

        return retdata
Пример #11
0
    def results(self):
        """Build per-host rate results, or an error dict.

        Returns {"error": ...} when host data is incomplete or the job is
        too short; otherwise timestamps plus per-host rate series,
        summarised to min/max/med hosts when there are more than 64 hosts.
        """
        if len(self._hostdata) != self._job.nodecount:
            return {"error": ProcessingError.INSUFFICIENT_HOSTDATA}

        values = self._data.get()

        # Need at least 3 samples for meaningful rates (diff loses one).
        if len(values[0, :, 0]) < 3:
            return {"error": ProcessingError.JOB_TOO_SHORT}

        # Rate = d(counter)/d(time) per host/timestep.
        rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0])

        if len(self._hostdata) > 64:
            # Too many hosts to ship everything: keep only the hosts that
            # are min, max or median at some timestep.
            sortarr = numpy.argsort(rates.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], rates),
                "max": self.collatedata(sortarr[:, -1], rates),
                # BUG FIX: floor division -- a float index breaks numpy
                # integer indexing.
                "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2],
                                        rates),
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] // 2])
            includelist = uniqhosts.keys()
        else:
            # Few hosts: save data for all of them.
            retdata = {"times": values[0, 1:, 0].tolist(), "hosts": {}}
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist()

        return retdata
Пример #12
0
def make_vocab(data, val_data, test_data):
    """Build word/tag id dictionaries and training word counts.

    Ids are assigned sequentially; slot 0 is reserved for the boundary
    symbols "<S>" (words) and "<T>" (tags).

    Returns:
        dict with "tags", "words" (string -> id maps) and "word_counts"
        (a Counter over the training words only).
    """
    train_counts = Counter(word for word, _ in data)

    combined = data + val_data + test_data
    word_set = {word for word, _ in combined}
    tag_set = {tag for _, tag in combined}

    word_dict = {"<S>": 0}
    tag_dict = {"<T>": 0}

    for word in word_set:
        if word not in word_dict:
            word_dict[word] = len(word_dict)

    for tag in tag_set:
        if tag not in tag_dict:
            tag_dict[tag] = len(tag_dict)

    return {"tags": tag_dict, "words": word_dict, "word_counts": train_counts}
Пример #13
0
def main(arguments):
    """Build vocabulary, moment, text and pseudo-ngram files.

    Args:
        arguments: argv-style list parsed by argparse
            (-K window size, -t/--train path, -v/--valid path).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-K', '--K', help="Window size.", type=int)
    parser.add_argument('-t', '--train', help="Training file.", type=str)
    parser.add_argument('-v', '--valid', help="Validation file", type=str)

    args = parser.parse_args(arguments)

    def out_name(base, kind):
        # All derived file names share the "<base>_<kind>_K<K>.dat" pattern.
        return base + '_' + kind + '_K' + str(args.K) + '.dat'

    vocab_file = out_name(args.train, 'vocab')
    train_moments_file = out_name(args.train, 'moments')
    valid_moments_file = out_name(args.valid, 'moments')
    train_text_file = out_name(args.train, 'text')
    valid_text_file = out_name(args.valid, 'text')
    pngram_file = out_name(args.train, 'pngram')

    txt, lintext, N = get_text(args.train, args.K)
    valid_txt, valid_lintext, valN = get_text(args.valid, args.K)

    # '<unk>' gets an explicit zero count so it always appears in the vocab.
    vocounts = Counter()
    vocounts['<unk>'] = 0
    vocounts.update(lintext)

    # Word ids follow frequency order (most common word gets id 0).
    vocab = dict((w, i) for i, (w, _) in enumerate(vocounts.most_common()))

    with open(vocab_file, 'w') as f:
        for i, (w, c) in enumerate(vocounts.most_common()):
            # Portable replacement for the Python-2-only ``print >> f``.
            f.write("{} {} {}\n".format(i, w, c))

    make_moments(lintext, vocounts, vocab, train_moments_file, args.K, N)
    make_moments(valid_lintext, vocounts, vocab, valid_moments_file, args.K,
                 valN)
    write_words(txt, train_text_file, vocab, args.K)
    write_words(valid_txt, valid_text_file, vocab, args.K)
    write_pseudo_ngrams(args.train, pngram_file, args.K)
Пример #14
0
    def test_multiset_operations(self):
        """Exercise +, -, |, & against scalar and set equivalents."""
        # Adding a zero counter strips zeros and negatives.
        c = Counter(a=10, b=-2, c=0) + Counter()
        self.assertEqual(dict(c), dict(a=10))

        elements = 'abcd'
        for _ in range(1000):
            # Random multiset pairs with extra positive/negative/zero
            # entries mixed in via update().
            p = Counter({elem: randrange(-2, 4) for elem in elements})
            p.update(e=1, f=-1, g=0)
            q = Counter({elem: randrange(-2, 4) for elem in elements})
            q.update(h=1, i=-1, j=0)
            scalar_ops = [
                (Counter.__add__, lambda x, y: max(0, x + y)),
                (Counter.__sub__, lambda x, y: max(0, x - y)),
                (Counter.__or__, lambda x, y: max(0, x, y)),
                (Counter.__and__, lambda x, y: max(0, min(x, y))),
            ]
            for counterop, numberop in scalar_ops:
                result = counterop(p, q)
                for x in elements:
                    self.assertEqual(numberop(p[x], q[x]), result[x],
                                     (counterop, x, p, q))
                # Results should exclude non-positive counts.
                self.assertTrue(x > 0 for x in result.values())

        elements = 'abcdef'
        for _ in range(100):
            # Random multisets with no repeats behave exactly like sets.
            p = Counter({elem: randrange(0, 2) for elem in elements})
            q = Counter({elem: randrange(0, 2) for elem in elements})
            set_ops = [
                (Counter.__sub__, set.__sub__),
                (Counter.__or__, set.__or__),
                (Counter.__and__, set.__and__),
            ]
            for counterop, setop in set_ops:
                counter_result = counterop(p, q)
                set_result = setop(set(p.elements()), set(q.elements()))
                self.assertEqual(counter_result, dict.fromkeys(set_result, 1))
Пример #15
0
 def test_basics(self):
     """Smoke-test the core Counter API: construction, dict behaviour,
     most_common()/elements(), mutation, and update()/__init__() semantics.
     """
     # Construction from an iterable equals the dict and kwargs forms.
     c = Counter('abcaba')
     self.assertEqual(c, Counter({'a': 3, 'b': 2, 'c': 1}))
     self.assertEqual(c, Counter(a=3, b=2, c=1))
     # Counter is a dict subclass and a Mapping.
     self.assertIsInstance(c, dict)
     self.assertIsInstance(c, Mapping)
     self.assertTrue(issubclass(Counter, dict))
     self.assertTrue(issubclass(Counter, Mapping))
     # Standard dict views and lookups; missing keys count as 0.
     self.assertEqual(len(c), 3)
     self.assertEqual(sum(c.values()), 6)
     self.assertEqual(sorted(c.values()), [1, 2, 3])
     self.assertEqual(sorted(c.keys()), ['a', 'b', 'c'])
     self.assertEqual(sorted(c), ['a', 'b', 'c'])
     self.assertEqual(sorted(c.items()), [('a', 3), ('b', 2), ('c', 1)])
     self.assertEqual(c['b'], 2)
     self.assertEqual(c['z'], 0)
     with test_support.check_py3k_warnings():
         self.assertEqual(c.has_key('c'), True)
         self.assertEqual(c.has_key('z'), False)
     self.assertEqual(c.__contains__('c'), True)
     self.assertEqual(c.__contains__('z'), False)
     self.assertEqual(c.get('b', 10), 2)
     self.assertEqual(c.get('z', 10), 10)
     self.assertEqual(c, dict(a=3, b=2, c=1))
     self.assertEqual(repr(c), "Counter({'a': 3, 'b': 2, 'c': 1})")
     # most_common() orders by count; most_common(i) is a prefix of it.
     self.assertEqual(c.most_common(), [('a', 3), ('b', 2), ('c', 1)])
     for i in range(5):
         self.assertEqual(c.most_common(i), [('a', 3), ('b', 2),
                                             ('c', 1)][:i])
     self.assertEqual(''.join(sorted(c.elements())), 'aaabbc')
     # In-place mutation, including zero/negative counts and deletion.
     c['a'] += 1  # increment an existing value
     c['b'] -= 2  # sub existing value to zero
     del c['c']  # remove an entry
     del c['c']  # make sure that del doesn't raise KeyError
     c['d'] -= 2  # sub from a missing value
     c['e'] = -5  # directly assign a missing value
     c['f'] += 4  # add to a missing value
     self.assertEqual(c, dict(a=4, b=0, d=-2, e=-5, f=4))
     # elements() skips zero and negative counts.
     self.assertEqual(''.join(sorted(c.elements())), 'aaaaffff')
     self.assertEqual(c.pop('f'), 4)
     self.assertNotIn('f', c)
     for i in range(3):
         elem, cnt = c.popitem()
         self.assertNotIn(elem, c)
     c.clear()
     self.assertEqual(c, {})
     self.assertEqual(repr(c), 'Counter()')
     # fromkeys is deliberately unsupported; Counter is unhashable.
     self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc')
     self.assertRaises(TypeError, hash, c)
     # update() adds counts; __init__() on an existing counter also adds.
     c.update(dict(a=5, b=3))
     c.update(c=1)
     c.update(Counter('a' * 50 + 'b' * 30))
     c.update()  # test case with no args
     c.__init__('a' * 500 + 'b' * 300)
     c.__init__('cdc')
     c.__init__()
     self.assertEqual(c, dict(a=555, b=333, c=3, d=1))
     # setdefault behaves exactly like dict.setdefault.
     self.assertEqual(c.setdefault('d', 5), 1)
     self.assertEqual(c['d'], 1)
     self.assertEqual(c.setdefault('e', 5), 5)
     self.assertEqual(c['e'], 5)
Пример #16
0
 def test_invariant_for_the_in_operator(self):
     """Every iterated key must also test as contained, even with
     zero or negative counts stored."""
     counts = Counter(a=10, b=-2, c=0)
     for element in counts:
         self.assertTrue(element in counts)
         self.assertIn(element, counts)