Пример #1
2
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        resx = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                         where=imps.date == '2014-01-27',
                         order_by=2,
                         limit=3,
                         nest=True)
        self.assertGreater(len(list(results)), 0)
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertEqual(len(list(results)), min(len(sum_millis), 3))
Пример #2
1
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(select(imps.site_id, imps.ad_id,
                                where=imps.date < '2014-01-13'))
        pix_sites = list(select(pix.site_id, pix.amount,
                                where=pix.date < '2014-01-13'))

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id, h_sum(pix.amount), h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = list(res)
        self.assertEqual(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
Пример #3
0
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id,
                     h_sum(pix.amount),
                     h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = [(ad_id, amount, count)
                   for (ad_id, amount, count), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
Пример #4
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))
        pix_sites = list(
            select(pix.site_id,
                   pix.amount,
                   where=((pix.date < '2014-01-13') & (pix.isActive == True))))

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id,
                         pix.amount,
                         pix.date,
                         where=((pix.date < '2014-01-15') &
                                (pix.isActive == True)),
                         nest=True)

        res = select(imps.ad_id,
                     sub_pix.amount,
                     where=(imps.date < '2014-01-13',
                            sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [tuple(c) for c in res]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Пример #5
0
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id,
                     pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
Пример #6
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix  = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(select(imps.site_id, imps.ad_id,
                                                                    where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(select(pix.site_id, pix.amount,
                                                                    where=((pix.date < '2014-01-13') &
                                                                           (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id, pix.amount, pix.date,
                         where=((pix.date < '2014-01-15') & (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id, sub_pix.amount,
                     where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Пример #7
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(select(imps.site_id, imps.ad_id,
                                where=imps.date < '2014-01-13'))
        pix_sites = list(select(pix.site_id, pix.amount,
                                where=((pix.date < '2014-01-13') &
                                       (pix.isActive == True))))

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id, pix.amount, pix.date,
                         where=((pix.date < '2014-01-15') & (pix.isActive == True)),
                         nest=True)

        res = select(imps.ad_id, sub_pix.amount,
                     where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [tuple(c) for c in res]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Пример #8
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        fields=[
                            '=$token', '%url', '+%site_id', '@cpm_millis',
                            '+#ad_id', '+$date', '+@time'
                        ],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          fields=[
                              '=$token', '+@1isActive', '+%site_id', '@amount',
                              '+#account_id', '+%city', '+%2state', '+#2metro',
                              '$ip', '*keyword', '+$date'
                          ],
                          partition='date',
                          force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, phile='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, phile='fixtures/pixel.json')
Пример #9
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id,
                   pix.amount,
                   where=((pix.date < '2014-01-13') & (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id,
                         pix.amount,
                         pix.date,
                         where=((pix.date < '2014-01-15') &
                                (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id,
                     sub_pix.amount,
                     where=(imps.date < '2014-01-13',
                            sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Пример #10
0
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a) in select(imps.site_id, imps.ad_id,
                                                 where=imps.date < '2014-01-13')]
        pix_sites = [(s, a) for (s, a) in select(pix.site_id, pix.amount,
                                                 where=pix.date < '2014-01-13')]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id, pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = list(res)
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
Пример #11
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        columns=['wide index string token', 'trie url', 'index trie site_id', 'uint cpm_millis',
                                 'index int ad_id', 'index string date', 'index uint time', 'bit click',
                                 'index bit impression', 'bit conversion'],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          columns=['wide index string token', 'index bit isActive', 'index trie site_id',
                                   'uint amount', 'index int account_id', 'index trie city', 'index trie16 state',
                                   'index int16 metro', 'string ip', 'lz4 keyword', 'index string date'],
                          partition='date',
                          force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, phile='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, phile='fixtures/pixel.json')
Пример #12
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        columns=[
                            'wide index string token', 'trie url',
                            'index trie site_id', 'uint cpm_millis',
                            'index int ad_id', 'index string date',
                            'index uint time', 'bit click',
                            'index bit impression', 'bit conversion'
                        ],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          columns=[
                              'wide index string token', 'index bit isActive',
                              'index trie site_id', 'uint amount',
                              'index int account_id', 'index trie city',
                              'index trie16 state', 'index int16 metro',
                              'string ip', 'lz4 keyword', 'index string date'
                          ],
                          partition='date',
                          force=True)
    pixel_hlls = Table.create(PIXELS_HLL,
                              columns=[
                                  'index bit isActive', 'index trie site_id',
                                  'index int account_id', 'index trie city',
                                  'index trie16 state', 'index string date',
                                  'binary hll'
                              ],
                              partition='date',
                              force=True)
    ips = Table.create(IPS,
                       columns=['index trie16 exchange_id', 'index uint32 ip'],
                       force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, File='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, File='fixtures/pixel.json')

    tags = ddfs.list("hustle:%s:" % IPS)
    if len(tags) == 0:
        # insert the files
        insert(ips, File='fixtures/ip.json')

    tags = ddfs.list("hustle:%s:" % PIXELS_HLL)
    if len(tags) == 0:
        # insert the files
        insert_hll(pixel_hlls, file='./fixtures/pixel.json', hll_field='token')
Пример #13
0
    def test_create_syntax(self):
        full_columns = ['wide index uint32 x', 'index string y', 'int16 z', 'lz4 a', 'trie32 b', 'binary c']
        full_fields = ['=@4x', '+$y', '#2z', '*a', '%4b', '&c']
        fields = Table.parse_column_specs(full_columns)
        self.assertListEqual(fields, full_fields)

        default_columns = ['wide index x', 'index int y', 'uint z', 'trie b', 'c']
        default_fields = ['=x', '+#y', '@z', '%b', 'c']
        fields = Table.parse_column_specs(default_columns)
        self.assertListEqual(fields, default_fields)
Пример #14
0
    def test_create_syntax(self):
        full_columns = [
            'wide index uint32 x', 'index string y', 'int16 z', 'lz4 a',
            'trie32 b', 'binary c'
        ]
        full_fields = ['=@4x', '+$y', '#2z', '*a', '%4b', '&c']
        fields = Table.parse_column_specs(full_columns)
        self.assertListEqual(fields, full_fields)

        default_columns = [
            'wide index x', 'index int y', 'uint z', 'trie b', 'c'
        ]
        default_fields = ['=x', '+#y', '@z', '%b', 'c']
        fields = Table.parse_column_specs(default_columns)
        self.assertListEqual(fields, default_fields)
Пример #15
0
 def test_combo_where_on_or_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010)))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 27)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
Пример #16
0
    def test_nested_agg(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id,
                         imps.date,
                         imps.cpm_millis,
                         where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1

        newtab = select(imps.ad_id,
                        imps.date,
                        h_sum(imps.cpm_millis),
                        h_count(),
                        where=imps.date > '2014-01-22',
                        nest=True)
        results = select(*star(newtab), where=newtab)
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Пример #17
0
 def test_combo_where_on_and_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.ad_id == 30010)))
     results = list(res)
     self.assertEqual(len(results), 2)
     self.assertTrue(all(d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30010 for a, d, _ in results))
Пример #18
0
 def test_overall(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True, limit=4,
                  order_by='ad_id', desc=True)
     results = [a for (a, d), _ in result_iterator(res)]
     self.assertEqual(len(results), 4)
     self.assertListEqual(results, [30019, 30018, 30017, 30015])
Пример #19
0
 def test_range_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-27')
     results = list(res)
     self.assertEqual(len(results), 20)
     self.assertTrue(all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results))
     res.purge()
Пример #20
0
 def test_combo_where_on_mixed_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000))))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 2)
     self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
Пример #21
0
 def test_combo_where_on_or_partition_ex1(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id << [30003, 30010])))
     results = list(res)
     self.assertEqual(len(results), 40)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 or a == 30003 for a, d, _ in results))
Пример #22
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    pcappacket= Table.create(mmm,columns=['index int32 GMTtime','index int32 MicroTime','index int32 caplen','index int32 len','index string ethtype','index string protocol','index uint8 ttl','index uint8 sip0','index uint8 sip1','index uint8 sip2','index uint8 sip3','index uint8 dip0','index uint8 dip1','index uint8 dip2','index uint8 dip3','index uint16 sport','index uint16 dport','lz4 data'],partition=None,force=True)
    '''
    imps = Table.create(IMPS,
                        columns=['wide index string token', 'trie url', 'index trie site_id', 'uint cpm_millis',
                                 'index int ad_id', 'index string date', 'index uint time'],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          columns=['wide index string token', 'index uint8 isActive', 'index trie site_id',
                                   'uint amount', 'index int account_id', 'index trie city', 'index trie16 state',
                                   'index int16 metro', 'string ip', 'lz4 keyword', 'index string date'],
                          partition='date',
                          force=True)
    '''
    from ls import *
    ls('/home/xiner/hustle/pcapread/testall.pcap')
    tags = ddfs.list("hustle:%s:" % mmm)
    print len(tags)
    if len(tags) == 0:
        # insert the files
        insert(pcappacket, phile='result.json' )

    '''tags = ddfs.list("hustle:%s:" % PIXELS)
Пример #23
0
 def test_combo_where_on_mixed_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000))))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 2)
     self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
Пример #24
0
    def test_cardinality_with_order_by(self):
        hll = Table.from_tag(PIXELS_HLL)
        tokens_by_date = defaultdict(set)
        with open("./fixtures/pixel.json") as f:
            for line in f:
                record = ujson.loads(line)
                tokens_by_date[record["date"]].add(record["token"])
        result = [(date, len(tokens))
                  for date, tokens in tokens_by_date.items()]

        # Test order by date
        expects = sorted(result, key=itemgetter(0), reverse=True)
        res = select(hll.date,
                     h_hll(hll.hll),
                     where=hll,
                     order_by=0,
                     desc=True)
        estimates = list(res)
        for i, (date, expected_cardinality) in enumerate(expects):
            self.assertEqual(estimates[i][0], date)
            self.checkEstimate(estimates[i][1], expected_cardinality)
        res.purge()

        # Test order by hll
        res = select(hll.date,
                     h_hll(hll.hll),
                     where=hll,
                     order_by=1,
                     desc=True)
        l = list(res)
        for i in range(len(l) - 1):
            self.assertTrue(l[i][1] >= l[i + 1][1])
        res.purge()
Пример #25
0
 def test_combo_where_no_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(imps.time >= 180000))
     results = list(res)
     print results
     self.assertEqual(len(results), 5)
     res.purge()
Пример #26
0
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
        resx = [c for c, _ in result_iterator(res)]

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                     where=imps.date == '2014-01-27',
                     order_by=2,
                     limit=3)
        results = [c for c, _ in result_iterator(res)]
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertTrue(len(results) == min(len(sum_millis), 3))
Пример #27
0
 def test_single_int_order(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', order_by=imps.cpm_millis)
     lowest = 0
     for (a, d, c), _ in result_iterator(res):
         self.assertLessEqual(lowest, c)
         lowest = c
Пример #28
0
 def test_column_fn(self):
     ips = Table.from_tag(IPS)
     res = select(ips.exchange_id, ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx")
     results = list(res)
     self.assertEqual(len(results), 29)
     res.purge()
Пример #29
0
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        resx = select(imps.ad_id,
                      imps.cpm_millis,
                      where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id,
                         h_sum(imps.cpm_millis),
                         h_count(),
                         where=imps.date == '2014-01-27',
                         order_by=2,
                         limit=3,
                         nest=True)
        self.assertGreater(len(list(results)), 0)
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertEqual(len(list(results)), min(len(sum_millis), 3))

        resx.purge()
Пример #30
0
 def test_combo_where_on_or_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010)))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 27)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
Пример #31
0
 def test_combo_where_on_or_partition_ex2(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010])))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 1)
     self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
Пример #32
0
    def test_column_fn_with_agg(self):
        ips = Table.from_tag(IPS)
        res = select(ips.exchange_id,
                     h_max(ip_ntoa(ips.ip)),
                     where=ips,
                     order_by=(ips.exchange_id, ))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'],
                             exchanges)
        self.assertListEqual(['192.168.1.1'] * 4, ipss)

        res = select(ips.exchange_id,
                     h_min(ip_ntoa(ips.ip)),
                     where=ips,
                     order_by=(ips.exchange_id, ))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'],
                             exchanges)
        self.assertListEqual(['127.0.0.1'] * 4, ipss)
Пример #33
0
    def test_bit_values(self):
        pix = Table.from_tag(PIXELS)
        res = select(pix.isActive, where=pix.isActive == 1)
        actives = 0
        for (act, ) in res:
            actives += act

        self.assertEqual(actives, 234)
Пример #34
0
 def test_column_fn(self):
     ips = Table.from_tag(IPS)
     res = select(ips.exchange_id,
                  ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx")
     results = list(res)
     self.assertEqual(len(results), 29)
     res.purge()
Пример #35
0
 def test_distinct(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  where=imps.date == '2014-01-27',
                  distinct=True)
     results = list(res)
     self.assertEqual(len(results), 8)
Пример #36
0
 def test_equality_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 10)
     found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506)
     self.assertIsNotNone(found)
     self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
Пример #37
0
 def test_equality_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 10)
     found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506)
     self.assertIsNotNone(found)
     self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
Пример #38
0
    def test_aggregate(self):
        imps = Table.from_tag(IMPS)
        res = select(h_sum(imps.click), h_sum(imps.conversion), h_sum(imps.impression), where=imps)

        (clicks, conversions, impressions) = list(res)[0]

        self.assertEqual(clicks, 21)
        self.assertEqual(conversions, 5)
        self.assertEqual(impressions, 174)
Пример #39
0
 def test_low_limit(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=imps.date == '2014-01-27',
                  limit=4)
     results = list(res)
     self.assertEqual(len(results), 4)
Пример #40
0
 def test_combo_where_no_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=(imps.time >= 180000))
     results = list(res)
     print results
     self.assertEqual(len(results), 5)
Пример #41
0
 def test_column_fn_with_distinct(self):
     ips = Table.from_tag(IPS)
     res = select(ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),),
                  distinct=True)
     results = list(res)
     res.purge()
     ipss = [ip[0] for ip in results]
     self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'],
                          ipss)
Пример #42
0
 def get_result_schema(self, project):
     from hustle import Table
     fields = []
     for col in project:
         col = col.column
         if col.name not in fields:
             fields.append(col.schema_string())
     # print "GEEWHIZ: %s %s %s" % (indicies, fields, blobs)
     name = '-'.join([w._name for w in self.wheres])[:64]
     return Table(name="sub-%s" % name, fields=fields)
Пример #43
0
 def test_range_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=imps.date > '2014-01-27')
     results = list(res)
     self.assertEqual(len(results), 20)
     self.assertTrue(
         all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results))
Пример #44
0
 def test_cardinality_all(self):
     hll = Table.from_tag(PIXELS_HLL)
     res = select(h_hll(hll.hll), where=hll)
     estimate = next(iter(res))[0]
     tokens = set([])
     with open("./fixtures/pixel.json") as f:
         for line in f:
             record = ujson.loads(line)
             tokens.add(record["token"])
     self.checkEstimate(estimate, len(tokens))
     res.purge()
Пример #45
0
 def test_combo_where_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=((imps.date >= '2014-01-20') &
                         (imps.ad_id == 30010)))
     results = list(res)
     self.assertEqual(len(results), 6)
     self.assertTrue(
         all(d >= '2014-01-20' and a == 30010 for a, d, _ in results))
Пример #46
0
 def test_combo_where_on_or_partition_ex2(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) &
                         (imps.ad_id << [30003, 30010])))
     results = list(res)
     self.assertEqual(len(results), 1)
     self.assertTrue(
         all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
Пример #47
0
 def test_cardinality_all(self):
     hll = Table.from_tag(PIXELS_HLL)
     res = select(h_hll(hll.hll), where=hll)
     estimate = next(iter(res))[0]
     tokens = set([])
     with open("./fixtures/pixel.json") as f:
         for line in f:
             record = ujson.loads(line)
             tokens.add(record["token"])
     self.checkEstimate(estimate, len(tokens))
     res.purge()
Пример #48
0
 def test_column_fn_with_distinct(self):
     ips = Table.from_tag(IPS)
     res = select(ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx",
                  order_by=(ip_ntoa(ips.ip), ),
                  distinct=True)
     results = list(res)
     res.purge()
     ipss = [ip[0] for ip in results]
     self.assertListEqual(
         ['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], ipss)
Пример #49
0
def ensure_tables():
    overrides["server"] = "disco://localhost"
    overrides["dump"] = False
    overrides["nest"] = False
    settings = Settings()
    ddfs = settings["ddfs"]

    imps = Table.create(
        IMPS,
        fields=["=$token", "%url", "+%site_id", "@cpm_millis", "+#ad_id", "+$date", "+@time"],
        partition="date",
        force=True,
    )
    pixels = Table.create(
        PIXELS,
        fields=[
            "=$token",
            "+@1isActive",
            "+%site_id",
            "@amount",
            "+#account_id",
            "+%city",
            "+%2state",
            "+#2metro",
            "$ip",
            "*keyword",
            "+$date",
        ],
        partition="date",
        force=True,
    )

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, phile="fixtures/imps.json", preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, phile="fixtures/pixel.json")
Пример #50
0
    def test_project(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.click, imps.conversion, imps.impression, where=imps)
        clicks = conversions = impressions = 0
        for (click, conv, imp) in res:
            clicks += click
            conversions += conv
            impressions += imp

        self.assertEqual(clicks, 21)
        self.assertEqual(conversions, 5)
        self.assertEqual(impressions, 174)
Пример #51
0
    def test_overflow(self):
        from itertools import izip

        imps = Table.from_tag(IMPS)
        fly_results = select(imps.date, h_sum(imps.impression), where=imps, order_by=imps.date)

        nest_tab = select(imps.date, h_sum(imps.impression), where=imps, nest=True)
        nest_results = select(*star(nest_tab), where=nest_tab, order_by=0)

        for ((fdate, fimps), (ndate, nimps)) in izip(fly_results, nest_results):
            self.assertEqual(fdate, ndate)
            self.assertEqual(fimps, nimps)
Пример #52
0
 def test_delete_partial(self):
     delete(self.table.date >= '2014-01-13')
     self.assertEqual(['hustle:__test_drop_imps:2014-01-10',
                       'hustle:__test_drop_imps:2014-01-11',
                       'hustle:__test_drop_imps:2014-01-12'],
                      get_partitions(self.table))
     tags = self.ddfs.list(Table.base_tag(self.table._name))
     self.assertEqual(len(tags), 4)
     self.assertIn("hustle:__test_drop_imps", tags)
     drop(self.table)
     with self.assertRaises(ValueError):
         delete(self.table.site_id == 'foobar')
         delete(self.tale.url)
Пример #53
0
 def test_combo_order(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=imps.date > '2014-01-21',
                  order_by=(imps.date, imps.cpm_millis))
     lowest_cpm = 0
     lowest_date = '2000-01-01'
     for (a, d, c), _ in result_iterator(res):
         if lowest_date == d:
             self.assertLessEqual(lowest_cpm, c)
             lowest_cpm = c
         else:
             self.assertLessEqual(lowest_date, d)
             lowest_date = d
             lowest_cpm = c
Пример #54
0
 def test_combo_descending(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=imps.date > '2014-01-21',
                  order_by=(imps.date, imps.cpm_millis),
                  desc=True)
     highest_cpm = 1000000000
     highest_date = '2222-01-01'
     for (a, d, c), _ in result_iterator(res):
         if highest_date == d:
             self.assertGreaterEqual(highest_cpm, c)
             highest_cpm = c
         else:
             self.assertGreaterEqual(highest_date, d)
             highest_date = d
             highest_cpm = c
Пример #55
0
    def test_simple_aggregation(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in results:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')
        self.assertGreater(len(list(results)), 0)
        for ad_id, millis, count in results:
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Пример #56
-1
    def test_multiple_group_bys(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id,
                         imps.date,
                         imps.cpm_millis,
                         where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1
        results.purge()

        results = select(imps.ad_id,
                         imps.date,
                         h_sum(imps.cpm_millis),
                         h_count(),
                         where=imps.date > '2014-01-22')
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        results.purge()