def test_result_iterator(self):
    """The rule selects the sorted iterator unless sorting is disabled."""
    cases = [
        (InfernoRule(), sorted_iterator),            # sort defaults to on
        (InfernoRule(sort=True), sorted_iterator),   # explicit sort
        (InfernoRule(sort=False), result_iterator),  # sorting disabled
    ]
    for rule, expected in cases:
        eq_(rule.result_iterator, expected)
def setUp(self):
    # One rule and one settings object with deliberately conflicting
    # day/tag values, so tests can check which side wins.
    self.rule = InfernoRule(
        day_range=3,
        day_offset=1,
        day_start=date(2012, 12, 02),
        source_tags=['tag1', 'tag2'],
        result_tag='result_tag_rule')
    self.settings = InfernoSettings(
        day_range=4,
        day_offset=2,
        day_start=date(2011, 12, 02),
        source_tags=['tag3', 'tag4'],
        result_tag='result_tag_settings')
    # expected results
    self.result_tag_from_rule = 'result_tag_rule'
    self.result_tag_from_settings = 'result_tag_settings'
    # rule tags: each source tag expanded over the 3-day range ending
    # at day_start minus day_offset (2012-12-01 back to 2012-11-29)
    self.tags_from_rule = [
        'tag1:2012-12-01', 'tag1:2012-11-30', 'tag1:2012-11-29',
        'tag2:2012-12-01', 'tag2:2012-11-30', 'tag2:2012-11-29']
    # settings tags are expected verbatim (no day expansion here)
    self.tags_from_settings = ['tag3', 'tag4']
def test_source_tags(self):
    """source_tags is always normalized to a list."""
    for given, expected in [
        (['tag1', 'tag2'], ['tag1', 'tag2']),  # list passes through
        ([], []),                              # empty list preserved
        ('tag1', ['tag1']),                    # lone string gets wrapped
        (None, []),                            # None becomes empty list
    ]:
        eq_(InfernoRule(source_tags=given).source_tags, expected)
def test_map_serialization(self):
    """Key parts are str-cast and JSON serialized; value parts are not.

    Note the difference between the serialized key date and the raw
    date object kept in the value.
    """
    rule = InfernoRule(key_parts=['date'], value_parts=['date'])
    expected_key = '["_default","2012-12-01"]'
    expected_value = [datetime.date(2012, 12, 1)]
    self._assert_map(self.data, rule, [(expected_key, expected_value)])
def test_tags_from_settings_and_rule_mix(self):
    """Settings override the rule's day values.

    Even though the rule asks for day_range=2, the settings day_start
    wins and only a single day is expanded.
    """
    rule = InfernoRule(source_tags=['tag5'], day_range=2)
    settings = InfernoSettings(day_start=date(2011, 12, 1))
    eq_(JobOptions(rule, settings).tags, ['tag5:2011-12-01'])
def test_parts_preprocess(self):
    """A parts_preprocess callable is stored on params and stays usable."""
    def add_bar(parts, params):
        parts['bar'] = 1
        yield parts

    rule = InfernoRule(parts_preprocess=[add_bar])
    eq_(rule.params.parts_preprocess, [add_bar])
    produced = list(rule.params.parts_preprocess[0]({'hello': 'world'}, None))
    eq_(produced, [{'bar': 1, 'hello': 'world'}])
def setUp(self):
    # A single input record plus a rule whose key/value parts pull
    # fields out of it; shared by the mapper serialization tests.
    self.data = {
        'city': 'toronto',
        'country': 'canada',
        'population': 100,
        'size': 1000,
        'date': datetime.date(2012, 12, 01)
    }
    self.rule = InfernoRule(
        key_parts=['country', 'city'],
        value_parts=['population', 'size'])
def setUp(self):
    # Build an archiving job wired to fake Disco/DDFS clients so the
    # tests never touch a real cluster.
    settings = InfernoSettings(day_range=2, day_start=date(2011, 11, 12))
    rule = InfernoRule(
        archive=True,
        max_blobs=self.MAX_BLOBS,
        name='some_rule_name',
        archive_tag_prefix='archived',
        source_tags=['incoming:data:chunk'])
    self.job = InfernoJob(rule, settings)
    self.job.disco = Disco()
    self.job.ddfs = DDFS()
def test_keyset_parts_preprocess(self):
    """parts_preprocess attached to a Keyset is kept per-keyset and callable."""
    def add_bar(parts, params):
        parts['bar'] = 1
        yield parts

    rule = InfernoRule(keysets={
        'keyset1': Keyset(parts_preprocess=[add_bar]),
    })
    funcs = rule.params.keysets['keyset1']['parts_preprocess']
    eq_(funcs, [add_bar])
    eq_(list(funcs[0]({'hello': 'world'}, None)),
        [{'bar': 1, 'hello': 'world'}])
def test_field_transforms(self):
    """Field transforms are applied to key parts before serialization."""
    def shout(val):
        return val.upper()

    rule = InfernoRule(
        key_parts=['country', 'city'],
        value_parts=['population', 'size'],
        field_transforms={'city': shout, 'country': shout})
    expected = [('["_default","CANADA","TORONTO"]', [100, 1000])]
    self._assert_map(self.data, rule, expected)
def test_parts_preprocess_that_yields_multiple_parts(self):
    """One input part may fan out into several via a preprocess generator."""
    def lookup_language(parts, params):
        for language in ['french', 'english']:
            fanned_out = parts.copy()
            fanned_out['language'] = language
            yield fanned_out

    rule = InfernoRule(
        key_parts=['country'],
        value_parts=['language'],
        parts_preprocess=[lookup_language])
    expected = [
        ('["_default","canada"]', ['french']),
        ('["_default","canada"]', ['english']),
    ]
    self._assert_map(self.data, rule, expected)
def test_field_transforms_happen_after_parts_preprocess(self):
    """Transforms also apply to fields created during preprocessing."""
    def lookup_language(parts, params):
        for language in ['french', 'english']:
            fanned_out = parts.copy()
            fanned_out['language'] = language
            yield fanned_out

    def shout(val):
        return val.upper()

    rule = InfernoRule(
        key_parts=['country'],
        value_parts=['language'],
        parts_preprocess=[lookup_language],
        field_transforms={'language': shout})
    expected = [
        ('["_default","canada"]', ['FRENCH']),
        ('["_default","canada"]', ['ENGLISH']),
    ]
    self._assert_map(self.data, rule, expected)
def test_tags_from_settings(self):
    """With an empty rule, the tags come straight from the settings."""
    eq_(JobOptions(InfernoRule(), self.settings).tags,
        self.tags_from_settings)
def test_kwargs(self):
    """Unknown keyword arguments land on rule.params untouched."""
    eq_(InfernoRule(some_extra_param='some_extra_value').params.some_extra_param,
        'some_extra_value')
def test_str(self):
    """str() shows the class and the rule name."""
    eq_(str(InfernoRule(name='some_rule_name')),
        '<InfernoRule: some_rule_name>')
def test_keysets(self):
    """Rule-level key/value parts build the implicit '_default' keyset;
    an explicit keysets= mapping is expanded the same way.

    Fixes: removed commented-out dead code (the old "no key sets" case)
    and factored the duplicated expected-keyset dict literals into a
    helper so the expansion shape is stated once.
    """
    def expected_keyset(id_part, count_part, table, mappings):
        # The dict InfernoRule expands every Keyset definition into.
        return {
            'column_mappings': mappings,
            'table': table,
            'value_parts': [count_part],
            'key_parts': ['_keyset', id_part],
            'parts_preprocess': [],
            'parts_postprocess': [],
        }

    # one key set (implicit '_default')
    rule = InfernoRule(
        key_parts=['id'],
        value_parts=['count'],
        table='some_table',
        column_mappings={'id': 'some_id'})
    eq_(rule.params.keysets, {
        '_default': expected_keyset('id', 'count', 'some_table',
                                    {'id': 'some_id'}),
    })

    # many key sets
    rule = InfernoRule(keysets={
        'keyset1': Keyset(key_parts=['id1'],
                          value_parts=['count1'],
                          column_mappings={'id1': 'some_id1'},
                          table='some_table1'),
        'keyset2': Keyset(key_parts=['id2'],
                          value_parts=['count2'],
                          column_mappings={'id2': 'some_id2'},
                          table='some_table2'),
    })
    eq_(rule.params.keysets, {
        'keyset1': expected_keyset('id1', 'count1', 'some_table1',
                                   {'id1': 'some_id1'}),
        'keyset2': expected_keyset('id2', 'count2', 'some_table2',
                                   {'id2': 'some_id2'}),
    })
            yield cparts
    except:
        # NOTE(review): bare except silently swallows every error from
        # tile parsing -- consider narrowing the exception type.
        print "Error parsing tiles: %s" % str(tiles)


# Drop a part unless every given column matches its expected value.
def filter_all(parts, params, **kwargs):
    for col, val in kwargs.items():
        if col and parts[col] != val:
            return
    yield parts


# Keep a result row only when its first value exceeds the threshold.
def filter_clicks(keys, vals, params, threshold=1):
    if vals[0] > threshold:
        yield keys, vals


RULES = [
    InfernoRule(
        name='ip_click_counter',
        source_tags=['incoming:impression'],
        map_input_stream=chunk_json_stream,
        # preprocess chain: clean, parse tiles, keep tile_id 504, count
        parts_preprocess=[clean_data, parse_tiles,
                          partial(filter_all, tile_id=504), count],
        partitions=32,
        sort_buffer_size='25%',
        combiner_function=combiner,
        key_parts=['ip'],
        value_parts=['count'],
        # post-filter: require more than 5 clicks per ip
        parts_postprocess=[partial(filter_clicks, threshold=5)],
    ),
]
def setUp(self):
    # Default settings, a temp pid directory, and a job with an empty
    # settings dict plus bare Params; tests exercise pid-file handling.
    self.settings = InfernoSettings()
    self._make_temp_pid_dir()
    self.job = InfernoJob(InfernoRule(name='some_rule_name'), {}, Params())
    self.pid_dir = pid.pid_dir(self.settings)
def test_empty_rule_and_empty_settings(self):
    """No tags and no result tag when both rule and settings are empty."""
    options = JobOptions(InfernoRule(), InfernoSettings())
    eq_(options.tags, [])
    eq_(options.result_tag, None)
InfernoRule(
    name='enhanced_stats',
    source_tags=['incoming:impression'],
    day_range=1,
    map_input_stream=chunk_json_stream,
    map_init_function=impression_stats_init,
    # preprocess pipeline: clean, parse the dimensions, then keep only
    # "enhanced" impressions
    parts_preprocess=[
        clean_data, parse_date, parse_locale, parse_ip, parse_ua,
        parse_tiles, filter_enhanced
    ],
    geoip_file=GEOIP,
    partitions=32,
    sort_buffer_size='25%',
    # locales accepted by this rule
    locale_whitelist={
        'ach', 'af', 'an', 'ar', 'as', 'ast', 'az', 'be', 'bg',
        'bn-bd', 'bn-in', 'br', 'bs', 'ca', 'cs', 'csb', 'cy', 'da',
        'de', 'el', 'en-gb', 'en-us', 'en-za', 'eo', 'es-ar', 'es-cl',
        'es-es', 'es-mx', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy-nl',
        'ga-ie', 'gd', 'gl', 'gu-in', 'he', 'hi-in', 'hr', 'hu', 'hsb',
        'hy-am', 'id', 'is', 'it', 'ja', 'ja-jp-mac', 'ka', 'kk', 'km',
        'kn', 'ko', 'ku', 'lij', 'lt', 'lv', 'mai', 'mk', 'ml', 'mr',
        'ms', 'my', 'nb-no', 'nl', 'nn-no', 'oc', 'or', 'pa-in', 'pl',
        'pt-br', 'pt-pt', 'rm', 'ro', 'ru', 'si', 'sk', 'sl', 'son',
        'sq', 'sr', 'sv-se', 'sw', 'ta', 'te', 'th', 'tr', 'uk', 'ur',
        'vi', 'xh', 'zh-cn', 'zh-tw', 'zu'
    },
    combiner_function=combiner,
    keysets={
        'impression_stats': Keyset(
            key_parts=['date', 'locale', 'tile_id', 'country_code'],
            value_parts=[
                'impressions', 'clicks', 'pinned', 'blocked',
                'sponsored', 'sponsored_link'
            ],
        ),
    },
),
def test_explict_tags(self):
    """Explicit source tags pass through untouched (no day expansion)."""
    # method name typo ("explict") kept: renaming would change the
    # test's public identifier
    options = JobOptions(
        InfernoRule(source_tags=['tag:foo', 'tag:bar']),
        InfernoSettings())
    eq_(options.tags, ['tag:foo', 'tag:bar'])
def test_result_tag_from_settings(self):
    """The settings result_tag wins when the rule supplies none."""
    options = JobOptions(InfernoRule(), self.settings)
    eq_(options.result_tag, self.result_tag_from_settings)
from inferno.lib.rule import InfernoRule
from inferno.lib.rule import Keyset


RULES = [
    InfernoRule(
        name='manual_rule_4',
        # two independent keysets, each with its own key/value parts
        keysets={
            'keyset_1': Keyset(
                key_parts=['key_1'],
                value_parts=['value_1'],
            ),
            'keyset_2': Keyset(
                key_parts=['key_2'],
                value_parts=['value_2'],
            ),
        },
    ),
]
def test_explict_tags_despite_day_range_on_the_rule(self):
    """day_range=0 in settings disables expansion, so the rule's
    explicit tags survive even though the rule asks for a 2-day range."""
    rule = InfernoRule(source_tags=['tag:foo', 'tag:bar'], day_range=2)
    actual = JobOptions(rule, InfernoSettings(day_range=0)).tags
    eq_(actual, ['tag:foo', 'tag:bar'])
    yield parts


# an example keyset parts_preprocess that works only for a specific keyset
def count_again(parts, params):
    parts['count'] = parts['count'] + 1
    yield parts


RULES = [
    InfernoRule(
        name='last_names_json',
        source_tags=['example:chunk:users'],
        map_input_stream=chunk_json_stream,
        parts_preprocess=[count],
        partitions=2,
        keysets={
            # keyset-local preprocess runs in addition to the
            # rule-level [count] above
            'last_name_keyset': Keyset(
                key_parts=['last'],
                value_parts=['count'],
                parts_preprocess=[count_again])
        }),
    InfernoRule(
        name='last_names_csv',
        source_tags=['example:chunk:users'],
        map_input_stream=chunk_csv_stream,
        csv_fields=('first', 'last'),
        csv_dialect='excel',
        parts_preprocess=[count],
        partitions=2,
        key_parts=['last'],
        value_parts=['count'],
InfernoRule(
    name='impression_stats',
    source_tags=['incoming:impression'],
    # blob thresholds gate when the job runs; inputs archived afterwards
    min_blobs=IMPRESSION_MIN_BLOBS,
    max_blobs=IMPRESSION_MAX_BLOBS,
    archive=True,
    rule_cleanup=report_rule_stats,
    map_input_stream=chunk_json_stream,
    map_init_function=impression_stats_init,
    parts_preprocess=[clean_data, parse_date, parse_locale,
                      check_locale_whitelist, parse_ip, parse_ua,
                      parse_tiles],
    geoip_file=GEOIP,
    partitions=32,
    sort_buffer_size='25%',
    locale_whitelist=LOCALE_WHITELIST,
    # results are loaded into Redshift via the given s3 bucket
    result_processor=partial(insert_redshift,
                             host=RS_HOST,
                             port=RS_PORT,
                             database=RS_DB,
                             user=RS_USER,
                             password=RS_PASSWORD,
                             bucket_name=RS_BUCKET),
    combiner_function=combiner,
    # three output tables, each fed by its own keyset
    keysets={
        'impression_stats': Keyset(
            key_parts=['date', 'position', 'locale', 'tile_id',
                       'country_code', 'os', 'browser', 'version',
                       'device', 'year', 'month', 'week', 'enhanced',
                       'blacklisted'],
            value_parts=['impressions', 'clicks', 'pinned', 'blocked',
                         'sponsored', 'sponsored_link'],
            table='impression_stats_daily'),
        'site_stats': Keyset(
            key_parts=['date', 'locale', 'country_code', 'os',
                       'browser', 'version', 'device', 'year', 'month',
                       'week', 'url'],
            value_parts=['impressions', 'clicks', 'pinned', 'blocked',
                         'sponsored', 'sponsored_link'],
            table='site_stats_daily',
        ),
        'newtab_stats': Keyset(
            key_parts=['date', 'locale', 'country_code', 'os',
                       'browser', 'version', 'device', 'year', 'month',
                       'week'],
            value_parts=['newtabs'],
            table='newtab_stats_daily')
    }
),
        print "pushed remote: %s" % url
    except Exception as e:
        print "failed: %s %s" % (e, url)
        # emit a keyed error record so failures show up in job results
        yield unicode('["_default", "%s", "%s", "%s"]'
                      % (e, gethostname(), url)).encode('ascii', 'ignore'), [1]


RULES = [
    # this rule loads data into a cluster from s3
    InfernoRule(
        name='bulk_load',
        source_urls=partial(
            get_keys_for_pattern,
            bucket='tiles-incoming-prod-us-west-2',
            pattern=r'.+-([^-]*)-(2015\.01\.(05|06|07|08|09|10|11|12|13))',
            tag_expr=["processed:", 1, ":2015-01-", 3]),
        map_input_stream=(disco.schemes.scheme_raw.input_stream, ),
        map_init_function=init,
        map_function=s3_import_map,
    ),
    # this rule copies tags from one Disco cluster to another
    InfernoRule(
        name='copy_tags',
        source_tags=[],
        target_disco_master='disco://localhost',
        target_tag='',
        chunk=False,
        map_input_stream=(task_input_stream, filename_input_stream),
        map_function=copy_tags_map,
    ),
from inferno.lib.rule import chunk_json_stream
from inferno.lib.rule import InfernoRule
from infernyx.rules import combiner

# do not schedule this rule automatically; run it by hand
AUTO_RUN = False


def count(parts, params):
    """Tag every part with a unit count so the reducer can sum them."""
    parts['count'] = 1
    yield parts


RULES = [
    InfernoRule(
        name='busiest_ips',
        source_tags=['processed:impression'],
        day_range=1,
        map_input_stream=chunk_json_stream,
        parts_preprocess=[count],
        partitions=32,
        sort_buffer_size='25%',
        combiner_function=combiner,
        key_parts=['ip'],
        value_parts=['count'],
    ),
]
def test_field_transforms(self):
    """The field_transforms mapping is stored on params as given."""
    def shout(val):
        return val.upper()

    rule = InfernoRule(field_transforms={'hello': shout})
    eq_(rule.params.field_transforms, {'hello': shout})
def filter_site(parts, params):
    """Pass a tuple through only when either of its urls matches
    params.filter_for_site."""
    wanted = params.filter_for_site
    if wanted in (parts['url_a'], parts['url_b']):
        yield parts


RULES = [
    InfernoRule(
        name='analyze_tuples',
        # from the command line - override the input tags with the "-t" option
        source_tags=['incoming:site_tuples'],
        map_input_stream=chain_stream + (
            partial(kv_reader,
                    keyset='tuples',
                    keys=('keyset', 'date', 'locale', 'country_code',
                          'url_a', 'url_b'),
                    values=('count', )), ),
        key_parts=['date'],
        value_parts=['count'],
        parts_preprocess=[filter_site],
        # override this on the command line with:
        # -P 'filter_for_site: override.org'
        filter_for_site='booking.com',
        partitions=32,
        sort_buffer_size='35%',
    ),
]