Exemplo n.º 1
2
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        resx = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                         where=imps.date == '2014-01-27',
                         order_by=2,
                         limit=3,
                         nest=True)
        self.assertGreater(len(list(results)), 0)
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertEqual(len(list(results)), min(len(sum_millis), 3))
Exemplo n.º 2
1
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(select(imps.site_id, imps.ad_id,
                                where=imps.date < '2014-01-13'))
        pix_sites = list(select(pix.site_id, pix.amount,
                                where=pix.date < '2014-01-13'))

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id, h_sum(pix.amount), h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = list(res)
        self.assertEqual(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
Exemplo n.º 3
1
    def test_nested_agg(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id,
                         imps.date,
                         imps.cpm_millis,
                         where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1

        newtab = select(imps.ad_id,
                        imps.date,
                        h_sum(imps.cpm_millis),
                        h_count(),
                        where=imps.date > '2014-01-22',
                        nest=True)
        results = select(*star(newtab), where=newtab)
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Exemplo n.º 4
1
    def test_overflow(self):
        from itertools import izip

        imps = Table.from_tag(IMPS)
        fly_results = select(imps.date, h_sum(imps.impression), where=imps, order_by=imps.date)

        nest_tab = select(imps.date, h_sum(imps.impression), where=imps, nest=True)
        nest_results = select(*star(nest_tab), where=nest_tab, order_by=0)

        for ((fdate, fimps), (ndate, nimps)) in izip(fly_results, nest_results):
            self.assertEqual(fdate, ndate)
            self.assertEqual(fimps, nimps)
Exemplo n.º 5
0
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id,
                     h_sum(pix.amount),
                     h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = [(ad_id, amount, count)
                   for (ad_id, amount, count), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
Exemplo n.º 6
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))
        pix_sites = list(
            select(pix.site_id,
                   pix.amount,
                   where=((pix.date < '2014-01-13') & (pix.isActive == True))))

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id,
                         pix.amount,
                         pix.date,
                         where=((pix.date < '2014-01-15') &
                                (pix.isActive == True)),
                         nest=True)

        res = select(imps.ad_id,
                     sub_pix.amount,
                     where=(imps.date < '2014-01-13',
                            sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [tuple(c) for c in res]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Exemplo n.º 7
0
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a) in select(imps.site_id, imps.ad_id,
                                                 where=imps.date < '2014-01-13')]
        pix_sites = [(s, a) for (s, a) in select(pix.site_id, pix.amount,
                                                 where=pix.date < '2014-01-13')]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id, pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = list(res)
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
Exemplo n.º 8
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id,
                   pix.amount,
                   where=((pix.date < '2014-01-13') & (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id,
                         pix.amount,
                         pix.date,
                         where=((pix.date < '2014-01-15') &
                                (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id,
                     sub_pix.amount,
                     where=(imps.date < '2014-01-13',
                            sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Exemplo n.º 9
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(select(imps.site_id, imps.ad_id,
                                where=imps.date < '2014-01-13'))
        pix_sites = list(select(pix.site_id, pix.amount,
                                where=((pix.date < '2014-01-13') &
                                       (pix.isActive == True))))

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id, pix.amount, pix.date,
                         where=((pix.date < '2014-01-15') & (pix.isActive == True)),
                         nest=True)

        res = select(imps.ad_id, sub_pix.amount,
                     where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [tuple(c) for c in res]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Exemplo n.º 10
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        columns=['wide index string token', 'trie url', 'index trie site_id', 'uint cpm_millis',
                                 'index int ad_id', 'index string date', 'index uint time', 'bit click',
                                 'index bit impression', 'bit conversion'],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          columns=['wide index string token', 'index bit isActive', 'index trie site_id',
                                   'uint amount', 'index int account_id', 'index trie city', 'index trie16 state',
                                   'index int16 metro', 'string ip', 'lz4 keyword', 'index string date'],
                          partition='date',
                          force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, phile='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, phile='fixtures/pixel.json')
Exemplo n.º 11
0
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id,
                     pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
Exemplo n.º 12
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix  = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(select(imps.site_id, imps.ad_id,
                                                                    where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(select(pix.site_id, pix.amount,
                                                                    where=((pix.date < '2014-01-13') &
                                                                           (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id, pix.amount, pix.date,
                         where=((pix.date < '2014-01-15') & (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id, sub_pix.amount,
                     where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Exemplo n.º 13
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        fields=[
                            '=$token', '%url', '+%site_id', '@cpm_millis',
                            '+#ad_id', '+$date', '+@time'
                        ],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          fields=[
                              '=$token', '+@1isActive', '+%site_id', '@amount',
                              '+#account_id', '+%city', '+%2state', '+#2metro',
                              '$ip', '*keyword', '+$date'
                          ],
                          partition='date',
                          force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, phile='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, phile='fixtures/pixel.json')
Exemplo n.º 14
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        columns=[
                            'wide index string token', 'trie url',
                            'index trie site_id', 'uint cpm_millis',
                            'index int ad_id', 'index string date',
                            'index uint time', 'bit click',
                            'index bit impression', 'bit conversion'
                        ],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          columns=[
                              'wide index string token', 'index bit isActive',
                              'index trie site_id', 'uint amount',
                              'index int account_id', 'index trie city',
                              'index trie16 state', 'index int16 metro',
                              'string ip', 'lz4 keyword', 'index string date'
                          ],
                          partition='date',
                          force=True)
    pixel_hlls = Table.create(PIXELS_HLL,
                              columns=[
                                  'index bit isActive', 'index trie site_id',
                                  'index int account_id', 'index trie city',
                                  'index trie16 state', 'index string date',
                                  'binary hll'
                              ],
                              partition='date',
                              force=True)
    ips = Table.create(IPS,
                       columns=['index trie16 exchange_id', 'index uint32 ip'],
                       force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, File='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, File='fixtures/pixel.json')

    tags = ddfs.list("hustle:%s:" % IPS)
    if len(tags) == 0:
        # insert the files
        insert(ips, File='fixtures/ip.json')

    tags = ddfs.list("hustle:%s:" % PIXELS_HLL)
    if len(tags) == 0:
        # insert the files
        insert_hll(pixel_hlls, file='./fixtures/pixel.json', hll_field='token')
Exemplo n.º 15
0
    def test_create_syntax(self):
        full_columns = ['wide index uint32 x', 'index string y', 'int16 z', 'lz4 a', 'trie32 b', 'binary c']
        full_fields = ['=@4x', '+$y', '#2z', '*a', '%4b', '&c']
        fields = Table.parse_column_specs(full_columns)
        self.assertListEqual(fields, full_fields)

        default_columns = ['wide index x', 'index int y', 'uint z', 'trie b', 'c']
        default_fields = ['=x', '+#y', '@z', '%b', 'c']
        fields = Table.parse_column_specs(default_columns)
        self.assertListEqual(fields, default_fields)
Exemplo n.º 16
0
    def test_create_syntax(self):
        full_columns = [
            'wide index uint32 x', 'index string y', 'int16 z', 'lz4 a',
            'trie32 b', 'binary c'
        ]
        full_fields = ['=@4x', '+$y', '#2z', '*a', '%4b', '&c']
        fields = Table.parse_column_specs(full_columns)
        self.assertListEqual(fields, full_fields)

        default_columns = [
            'wide index x', 'index int y', 'uint z', 'trie b', 'c'
        ]
        default_fields = ['=x', '+#y', '@z', '%b', 'c']
        fields = Table.parse_column_specs(default_columns)
        self.assertListEqual(fields, default_fields)
Exemplo n.º 17
0
 def test_combo_where_on_and_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.ad_id == 30010)))
     results = list(res)
     self.assertEqual(len(results), 2)
     self.assertTrue(all(d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30010 for a, d, _ in results))
Exemplo n.º 18
0
 def test_combo_where_on_or_partition_ex1(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id << [30003, 30010])))
     results = list(res)
     self.assertEqual(len(results), 40)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 or a == 30003 for a, d, _ in results))
Exemplo n.º 19
0
 def test_combo_where_on_mixed_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000))))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 2)
     self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
Exemplo n.º 20
0
 def test_combo_where_on_or_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010)))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 27)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
Exemplo n.º 21
0
 def test_overall(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True, limit=4,
                  order_by='ad_id', desc=True)
     results = [a for (a, d), _ in result_iterator(res)]
     self.assertEqual(len(results), 4)
     self.assertListEqual(results, [30019, 30018, 30017, 30015])
Exemplo n.º 22
0
 def test_single_int_order(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', order_by=imps.cpm_millis)
     lowest = 0
     for (a, d, c), _ in result_iterator(res):
         self.assertLessEqual(lowest, c)
         lowest = c
Exemplo n.º 23
0
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    pcappacket= Table.create(mmm,columns=['index int32 GMTtime','index int32 MicroTime','index int32 caplen','index int32 len','index string ethtype','index string protocol','index uint8 ttl','index uint8 sip0','index uint8 sip1','index uint8 sip2','index uint8 sip3','index uint8 dip0','index uint8 dip1','index uint8 dip2','index uint8 dip3','index uint16 sport','index uint16 dport','lz4 data'],partition=None,force=True)
    '''
    imps = Table.create(IMPS,
                        columns=['wide index string token', 'trie url', 'index trie site_id', 'uint cpm_millis',
                                 'index int ad_id', 'index string date', 'index uint time'],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          columns=['wide index string token', 'index uint8 isActive', 'index trie site_id',
                                   'uint amount', 'index int account_id', 'index trie city', 'index trie16 state',
                                   'index int16 metro', 'string ip', 'lz4 keyword', 'index string date'],
                          partition='date',
                          force=True)
    '''
    from ls import *
    ls('/home/xiner/hustle/pcapread/testall.pcap')
    tags = ddfs.list("hustle:%s:" % mmm)
    print len(tags)
    if len(tags) == 0:
        # insert the files
        insert(pcappacket, phile='result.json' )

    '''tags = ddfs.list("hustle:%s:" % PIXELS)
Exemplo n.º 24
0
 def test_combo_where_no_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(imps.time >= 180000))
     results = list(res)
     print results
     self.assertEqual(len(results), 5)
     res.purge()
Exemplo n.º 25
0
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
        resx = [c for c, _ in result_iterator(res)]

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                     where=imps.date == '2014-01-27',
                     order_by=2,
                     limit=3)
        results = [c for c, _ in result_iterator(res)]
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertTrue(len(results) == min(len(sum_millis), 3))
Exemplo n.º 26
0
 def test_combo_where_on_or_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010)))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 27)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
Exemplo n.º 27
0
 def test_column_fn(self):
     ips = Table.from_tag(IPS)
     res = select(ips.exchange_id, ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx")
     results = list(res)
     self.assertEqual(len(results), 29)
     res.purge()
Exemplo n.º 28
0
 def test_combo_where_on_mixed_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000))))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 2)
     self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
Exemplo n.º 29
0
 def test_combo_where_on_or_partition_ex2(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010])))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 1)
     self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
Exemplo n.º 30
0
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        resx = select(imps.ad_id,
                      imps.cpm_millis,
                      where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id,
                         h_sum(imps.cpm_millis),
                         h_count(),
                         where=imps.date == '2014-01-27',
                         order_by=2,
                         limit=3,
                         nest=True)
        self.assertGreater(len(list(results)), 0)
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertEqual(len(list(results)), min(len(sum_millis), 3))

        resx.purge()
Exemplo n.º 31
0
 def test_range_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-27')
     results = list(res)
     self.assertEqual(len(results), 20)
     self.assertTrue(all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results))
     res.purge()
Exemplo n.º 32
0
    def test_column_fn_with_agg(self):
        ips = Table.from_tag(IPS)
        res = select(ips.exchange_id,
                     h_max(ip_ntoa(ips.ip)),
                     where=ips,
                     order_by=(ips.exchange_id, ))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'],
                             exchanges)
        self.assertListEqual(['192.168.1.1'] * 4, ipss)

        res = select(ips.exchange_id,
                     h_min(ip_ntoa(ips.ip)),
                     where=ips,
                     order_by=(ips.exchange_id, ))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'],
                             exchanges)
        self.assertListEqual(['127.0.0.1'] * 4, ipss)
Exemplo n.º 33
0
    def test_cardinality_with_order_by(self):
        hll = Table.from_tag(PIXELS_HLL)
        tokens_by_date = defaultdict(set)
        with open("./fixtures/pixel.json") as f:
            for line in f:
                record = ujson.loads(line)
                tokens_by_date[record["date"]].add(record["token"])
        result = [(date, len(tokens))
                  for date, tokens in tokens_by_date.items()]

        # Test order by date
        expects = sorted(result, key=itemgetter(0), reverse=True)
        res = select(hll.date,
                     h_hll(hll.hll),
                     where=hll,
                     order_by=0,
                     desc=True)
        estimates = list(res)
        for i, (date, expected_cardinality) in enumerate(expects):
            self.assertEqual(estimates[i][0], date)
            self.checkEstimate(estimates[i][1], expected_cardinality)
        res.purge()

        # Test order by hll
        res = select(hll.date,
                     h_hll(hll.hll),
                     where=hll,
                     order_by=1,
                     desc=True)
        l = list(res)
        for i in range(len(l) - 1):
            self.assertTrue(l[i][1] >= l[i + 1][1])
        res.purge()
Exemplo n.º 34
0
 def test_equality_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 10)
     found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506)
     self.assertIsNotNone(found)
     self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
Exemplo n.º 35
0
 def test_equality_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 10)
     found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506)
     self.assertIsNotNone(found)
     self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
Exemplo n.º 36
0
    def test_bit_values(self):
        pix = Table.from_tag(PIXELS)
        res = select(pix.isActive, where=pix.isActive == 1)
        actives = 0
        for (act, ) in res:
            actives += act

        self.assertEqual(actives, 234)
Exemplo n.º 37
0
 def test_distinct(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  where=imps.date == '2014-01-27',
                  distinct=True)
     results = list(res)
     self.assertEqual(len(results), 8)
Exemplo n.º 38
0
 def test_column_fn(self):
     ips = Table.from_tag(IPS)
     res = select(ips.exchange_id,
                  ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx")
     results = list(res)
     self.assertEqual(len(results), 29)
     res.purge()
Exemplo n.º 39
0
    def test_aggregate(self):
        imps = Table.from_tag(IMPS)
        res = select(h_sum(imps.click), h_sum(imps.conversion), h_sum(imps.impression), where=imps)

        (clicks, conversions, impressions) = list(res)[0]

        self.assertEqual(clicks, 21)
        self.assertEqual(conversions, 5)
        self.assertEqual(impressions, 174)
Exemplo n.º 40
0
 def test_low_limit(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=imps.date == '2014-01-27',
                  limit=4)
     results = list(res)
     self.assertEqual(len(results), 4)
Exemplo n.º 41
0
 def test_combo_where_no_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=(imps.time >= 180000))
     results = list(res)
     print results
     self.assertEqual(len(results), 5)
Exemplo n.º 42
0
 def get_result_schema(self, project):
     from hustle import Table
     fields = []
     for col in project:
         col = col.column
         if col.name not in fields:
             fields.append(col.schema_string())
     # print "GEEWHIZ: %s %s %s" % (indicies, fields, blobs)
     name = '-'.join([w._name for w in self.wheres])[:64]
     return Table(name="sub-%s" % name, fields=fields)
Exemplo n.º 43
0
 def test_column_fn_with_distinct(self):
     ips = Table.from_tag(IPS)
     res = select(ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),),
                  distinct=True)
     results = list(res)
     res.purge()
     ipss = [ip[0] for ip in results]
     self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'],
                          ipss)
Exemplo n.º 44
0
 def test_range_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=imps.date > '2014-01-27')
     results = list(res)
     self.assertEqual(len(results), 20)
     self.assertTrue(
         all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results))
Exemplo n.º 45
0
 def test_cardinality_all(self):
     hll = Table.from_tag(PIXELS_HLL)
     res = select(h_hll(hll.hll), where=hll)
     estimate = next(iter(res))[0]
     tokens = set([])
     with open("./fixtures/pixel.json") as f:
         for line in f:
             record = ujson.loads(line)
             tokens.add(record["token"])
     self.checkEstimate(estimate, len(tokens))
     res.purge()
Exemplo n.º 46
0
 def test_combo_where_on_or_partition_ex2(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) &
                         (imps.ad_id << [30003, 30010])))
     results = list(res)
     self.assertEqual(len(results), 1)
     self.assertTrue(
         all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
Exemplo n.º 47
0
 def test_column_fn_with_distinct(self):
     ips = Table.from_tag(IPS)
     res = select(ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx",
                  order_by=(ip_ntoa(ips.ip), ),
                  distinct=True)
     results = list(res)
     res.purge()
     ipss = [ip[0] for ip in results]
     self.assertListEqual(
         ['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], ipss)
Exemplo n.º 48
0
 def test_combo_where_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=((imps.date >= '2014-01-20') &
                         (imps.ad_id == 30010)))
     results = list(res)
     self.assertEqual(len(results), 6)
     self.assertTrue(
         all(d >= '2014-01-20' and a == 30010 for a, d, _ in results))
Exemplo n.º 49
0
 def test_cardinality_all(self):
     hll = Table.from_tag(PIXELS_HLL)
     res = select(h_hll(hll.hll), where=hll)
     estimate = next(iter(res))[0]
     tokens = set([])
     with open("./fixtures/pixel.json") as f:
         for line in f:
             record = ujson.loads(line)
             tokens.add(record["token"])
     self.checkEstimate(estimate, len(tokens))
     res.purge()
Exemplo n.º 50
0
def ensure_tables():
    overrides["server"] = "disco://localhost"
    overrides["dump"] = False
    overrides["nest"] = False
    settings = Settings()
    ddfs = settings["ddfs"]

    imps = Table.create(
        IMPS,
        fields=["=$token", "%url", "+%site_id", "@cpm_millis", "+#ad_id", "+$date", "+@time"],
        partition="date",
        force=True,
    )
    pixels = Table.create(
        PIXELS,
        fields=[
            "=$token",
            "+@1isActive",
            "+%site_id",
            "@amount",
            "+#account_id",
            "+%city",
            "+%2state",
            "+#2metro",
            "$ip",
            "*keyword",
            "+$date",
        ],
        partition="date",
        force=True,
    )

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, phile="fixtures/imps.json", preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, phile="fixtures/pixel.json")
Exemplo n.º 51
0
    def test_project(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.click, imps.conversion, imps.impression, where=imps)
        clicks = conversions = impressions = 0
        for (click, conv, imp) in res:
            clicks += click
            conversions += conv
            impressions += imp

        self.assertEqual(clicks, 21)
        self.assertEqual(conversions, 5)
        self.assertEqual(impressions, 174)
Exemplo n.º 52
0
 def test_delete_partial(self):
     delete(self.table.date >= '2014-01-13')
     self.assertEqual(['hustle:__test_drop_imps:2014-01-10',
                       'hustle:__test_drop_imps:2014-01-11',
                       'hustle:__test_drop_imps:2014-01-12'],
                      get_partitions(self.table))
     tags = self.ddfs.list(Table.base_tag(self.table._name))
     self.assertEqual(len(tags), 4)
     self.assertIn("hustle:__test_drop_imps", tags)
     drop(self.table)
     with self.assertRaises(ValueError):
         delete(self.table.site_id == 'foobar')
         delete(self.tale.url)
Exemplo n.º 53
0
 def test_combo_order(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=imps.date > '2014-01-21',
                  order_by=(imps.date, imps.cpm_millis))
     lowest_cpm = 0
     lowest_date = '2000-01-01'
     for (a, d, c), _ in result_iterator(res):
         if lowest_date == d:
             self.assertLessEqual(lowest_cpm, c)
             lowest_cpm = c
         else:
             self.assertLessEqual(lowest_date, d)
             lowest_date = d
             lowest_cpm = c
Exemplo n.º 54
0
 def test_combo_descending(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=imps.date > '2014-01-21',
                  order_by=(imps.date, imps.cpm_millis),
                  desc=True)
     highest_cpm = 1000000000
     highest_date = '2222-01-01'
     for (a, d, c), _ in result_iterator(res):
         if highest_date == d:
             self.assertGreaterEqual(highest_cpm, c)
             highest_cpm = c
         else:
             self.assertGreaterEqual(highest_date, d)
             highest_date = d
             highest_cpm = c
Exemplo n.º 55
0
    def test_simple_aggregation(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in results:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')
        self.assertGreater(len(list(results)), 0)
        for ad_id, millis, count in results:
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Exemplo n.º 56
-1
    def test_multiple_group_bys(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id,
                         imps.date,
                         imps.cpm_millis,
                         where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1
        results.purge()

        results = select(imps.ad_id,
                         imps.date,
                         h_sum(imps.cpm_millis),
                         h_count(),
                         where=imps.date > '2014-01-22')
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        results.purge()