def test_mr_reduce(self):
    """Run mr_reduce over two tab-separated lines and check its output.

    The reducer emits ``[key, n]`` where ``n`` is the length of the first
    value it is handed for that key; the test expects those pairs to come
    out tab-separated, one per line.
    """
    input_lines = ["foo\tbar\tbar1", "baz\tbad\tbad1\tbad2"]
    source = StringIO("\n".join(input_lines))
    sink = StringIO()

    def process(key, vals):
        # Only the first value seen for the key is measured.
        first_val = list(vals)[0]
        return [[key, len(first_val)]]

    mr_reduce(process, fd=source, out=sink)

    expected = "foo\t2\nbaz\t3\n"
    self.assertEqual(sink.getvalue(), expected)
def join_things(fields, deleted=False, spam=True):
    """A reducer that joins thing table dumps and data table dumps.

    For each thing id, the 'thing' row and the 'data' rows are combined into
    one output tuple: ``(thing_id, thing_type, ups, downs, deleted, spam,
    timestamp)`` followed by the value of every name in ``fields``, in the
    order given.

    :param fields: data keys that every emitted thing must have a value for;
        things missing any of them are counted as skipped.
    :param deleted: when false, only things whose ``deleted`` column is
        ``'f'`` are emitted.
    :param spam: when false, only things whose ``spam`` column is ``'f'``
        are emitted.

    After ``mr_reduce`` returns, a one-line processed/skipped summary is
    written to ``sys.stderr``.
    """

    # Because of how Python handles scope, if we want to modify these outside
    # the closure function below, they need to be inside a mutable object.
    # http://stackoverflow.com/a/23558809/120999
    counters = {
        'processed': 0,
        'skipped': 0,
    }

    def process(thing_id, vals):
        data = {}
        thing = None

        for val in vals:
            if val[0] == 'thing':
                thing = format_dataspec(val,
                                        ['data_type',  # e.g. 'thing'
                                         'thing_type',  # e.g. 'link'
                                         'ups',
                                         'downs',
                                         'deleted',
                                         'spam',
                                         'timestamp'])
            elif val[0] == 'data':
                val = format_dataspec(val,
                                      ['data_type',  # e.g. 'data'
                                       'thing_type',  # e.g. 'link'
                                       'key',  # e.g. 'sr_id'
                                       'value'])
                # Only the requested data keys are kept.
                if val.key in fields:
                    data[val.key] = val.value

        if (
            # silently ignore if we didn't see the 'thing' row
            thing is not None

            # remove spam and deleted as appropriate
            and (deleted or thing.deleted == 'f')
            and (spam or thing.spam == 'f')

            # and silently ignore items that don't have all of the
            # data that we need
            and all(field in data for field in fields)):

            counters['processed'] += 1
            yield ((thing_id, thing.thing_type, thing.ups, thing.downs,
                    thing.deleted, thing.spam, thing.timestamp)
                   + tuple(data[field] for field in fields))
        else:
            counters['skipped'] += 1

    mr_reduce(process)

    # Write to stderr to avoid getting this caught up in the pipe of
    # compute_time_listings. A plain write() (rather than the Python 2-only
    # ``print >>`` statement) emits the same line on Python 2 and 3.
    sys.stderr.write('%s items processed, %s skipped\n' % (
        counters['processed'], counters['skipped']))
def test_mr_reduce(self):
    """Check mr_reduce's output for a two-line, tab-separated input.

    ``process`` returns a single ``[key, n]`` row per key, with ``n`` being
    the length of the first value received for that key.
    """
    raw_input = "\n".join([
        "foo\tbar\tbar1",
        "baz\tbad\tbad1\tbad2",
    ])
    in_stream = StringIO(raw_input)
    out_stream = StringIO()

    def process(key, vals):
        materialized = list(vals)
        return [[key, len(materialized[0])]]

    mr_reduce(process, fd=in_stream, out=out_stream)

    written = out_stream.getvalue()
    self.assertEqual(written, "foo\t2\nbaz\t3\n")
def join_things(fields, deleted=False, spam=True):
    """Reduce thing-table and data-table dump rows into one row per thing.

    Each emitted tuple is ``(thing_id, thing_type, ups, downs, deleted,
    spam, timestamp)`` followed by the value of every name in ``fields``.
    Things are dropped without notice when no 'thing' row was seen, when
    they are filtered out by ``deleted`` / ``spam``, or when any of
    ``fields`` has no value.
    """

    def process(thing_id, vals):
        thing_row = None
        collected = {}

        for row in vals:
            row_kind = row[0]
            if row_kind == 'thing':
                thing_row = format_dataspec(
                    row,
                    ['data_type',   # e.g. 'thing'
                     'thing_type',  # e.g. 'link'
                     'ups', 'downs', 'deleted', 'spam', 'timestamp'])
            elif row_kind == 'data':
                datum = format_dataspec(
                    row,
                    ['data_type',   # e.g. 'data'
                     'thing_type',  # e.g. 'link'
                     'key',         # e.g. 'sr_id'
                     'value'])
                if datum.key in fields:
                    collected[datum.key] = datum.value

        # Without the 'thing' row there is nothing to join against.
        if thing_row is None:
            return

        # Drop deleted / spam items unless the caller allows them.
        if not (deleted or thing_row.deleted == 'f'):
            return
        if not (spam or thing_row.spam == 'f'):
            return

        # Drop items that lack any of the requested data fields.
        for name in fields:
            if name not in collected:
                return

        head = (thing_id, thing_row.thing_type, thing_row.ups,
                thing_row.downs, thing_row.deleted, thing_row.spam,
                thing_row.timestamp)
        tail = tuple(collected[name] for name in fields)
        yield head + tail

    mr_reduce(process)
def join_things(fields, deleted=False, spam=True):
    """Join each thing's 'thing' row with its 'data' rows.

    The joined output row is ``(thing_id, thing_type, ups, downs, deleted,
    spam, timestamp)`` plus one value per entry of ``fields``. A thing is
    silently left out if its 'thing' row never appeared, if it is rejected
    by the ``deleted`` / ``spam`` switches, or if any requested field is
    missing.
    """

    def process(thing_id, vals):
        found = {}
        record = None

        for entry in vals:
            tag = entry[0]
            if tag == "thing":
                record = format_dataspec(
                    entry,
                    [
                        "data_type",  # e.g. 'thing'
                        "thing_type",  # e.g. 'link'
                        "ups",
                        "downs",
                        "deleted",
                        "spam",
                        "timestamp",
                    ],
                )
            elif tag == "data":
                parsed = format_dataspec(
                    entry,
                    [
                        "data_type",  # e.g. 'data'
                        "thing_type",  # e.g. 'link'
                        "key",  # e.g. 'sr_id'
                        "value",
                    ],
                )
                if parsed.key in fields:
                    found[parsed.key] = parsed.value

        # Evaluate the filters one at a time, stopping at the first failure.
        keep = record is not None
        if keep and not deleted:
            keep = record.deleted == "f"
        if keep and not spam:
            keep = record.spam == "f"
        if keep:
            keep = all(name in found for name in fields)

        if keep:
            prefix = (
                thing_id,
                record.thing_type,
                record.ups,
                record.downs,
                record.deleted,
                record.spam,
                record.timestamp,
            )
            yield prefix + tuple(found[name] for name in fields)

    mr_reduce(process)
def join_things(fields, deleted=False, spam=True):
    """Reducer combining thing-table dump rows with data-table dump rows.

    Emits ``(thing_id, thing_type, ups, downs, deleted, spam, timestamp)``
    extended with the values of ``fields`` for every thing that passes the
    filters; everything else is silently ignored.
    """

    def passes_filters(thing_row, values):
        # Ignore ids for which no 'thing' row was seen.
        if thing_row is None:
            return False
        # Remove deleted and spam items as requested.
        if not (deleted or thing_row.deleted == 'f'):
            return False
        if not (spam or thing_row.spam == 'f'):
            return False
        # Ignore items that don't have all of the data we need.
        return all(name in values for name in fields)

    def process(thing_id, vals):
        values = {}
        thing_row = None

        for item in vals:
            item_type = item[0]
            if item_type == 'thing':
                thing_row = format_dataspec(item, [
                    'data_type',   # e.g. 'thing'
                    'thing_type',  # e.g. 'link'
                    'ups',
                    'downs',
                    'deleted',
                    'spam',
                    'timestamp',
                ])
            elif item_type == 'data':
                data_row = format_dataspec(item, [
                    'data_type',   # e.g. 'data'
                    'thing_type',  # e.g. 'link'
                    'key',         # e.g. 'sr_id'
                    'value',
                ])
                if data_row.key in fields:
                    values[data_row.key] = data_row.value

        if passes_filters(thing_row, values):
            thing_part = (thing_id, thing_row.thing_type, thing_row.ups,
                          thing_row.downs, thing_row.deleted,
                          thing_row.spam, thing_row.timestamp)
            data_part = tuple(values[name] for name in fields)
            yield thing_part + data_part

    mr_reduce(process)
def join_things(
    fields,
    deleted=False,
    spam=True,
    fd=STDIN,
    out=STDOUT,
    err=STDERR,
    defaults=None,
):
    """A reducer that joins thing table dumps and data table dumps

    :param list fields: data fields every resulting thing must contain.
        Things missing any of these fields (after taking the dump and
        :param:`defaults` into account) are dropped and counted as skipped.
    :param bool deleted: Allow deleted items.
    :param bool spam: Allow spam items.
    :param file fd: Input stream.
    :param file out: Output stream.
    :param file err: Error stream; receives the processed/skipped summary.
    :param defaults: mapping of fieldnames to default values, used when the
        input stream does not provide them.
    :type defaults: dict or None
    """

    # The nested generator cannot rebind names of this scope, so the running
    # totals are kept inside a mutable dict instead.
    # http://stackoverflow.com/a/23558809/120999
    tally = {'processed': 0, 'skipped': 0}

    def process(thing_id, vals):
        # Start from the defaults; values from the stream overwrite them.
        collected = dict(defaults) if defaults else {}
        thing_row = None

        for row in vals:
            row_kind = row[0]
            if row_kind == 'thing':
                thing_row = format_dataspec(
                    row,
                    ['data_type',   # e.g. 'thing'
                     'thing_type',  # e.g. 'link'
                     'ups', 'downs', 'deleted', 'spam', 'timestamp'])
            elif row_kind == 'data':
                datum = format_dataspec(
                    row,
                    ['data_type',   # e.g. 'data'
                     'thing_type',  # e.g. 'link'
                     'key',         # e.g. 'sr_id'
                     'value'])
                if datum.key in fields:
                    collected[datum.key] = datum.value

        # A thing is kept only if its 'thing' row was seen, it survives the
        # deleted/spam filters, and every requested field has a value.
        keep = thing_row is not None
        if keep and not deleted:
            keep = thing_row.deleted == 'f'
        if keep and not spam:
            keep = thing_row.spam == 'f'
        if keep:
            keep = all(name in collected for name in fields)

        if not keep:
            tally['skipped'] += 1
            return

        tally['processed'] += 1
        head = (thing_id, thing_row.thing_type, thing_row.ups,
                thing_row.downs, thing_row.deleted, thing_row.spam,
                thing_row.timestamp)
        yield head + tuple(collected[name] for name in fields)

    mr_reduce(process, fd=fd, out=out)

    # The summary goes to the error stream to avoid getting it caught up in
    # the pipe of compute_time_listings.
    summary = '%s items processed, %s skipped\n' % (
        tally['processed'], tally['skipped'])
    err.write(summary)
def join_things(
    fields,
    deleted=False,
    spam=True,
    fd=STDIN,
    out=STDOUT,
    err=STDERR,
    defaults=None,
):
    """A reducer that joins thing table dumps and data table dumps

    :param list fields: names of the data fields each output row must carry.
        A thing lacking any of them (neither present in the dump nor
        supplied through :param:`defaults`) is dropped and counted as
        skipped.
    :param bool deleted: Allow deleted items.
    :param bool spam: Allow spam items.
    :param file fd: Input stream.
    :param file out: Output stream.
    :param file err: Error stream; the final summary line is written here.
    :param defaults: mapping of fieldnames to default values if not provided
        in the input stream.
    :type defaults: dict or None
    """

    # Counts live in a dict because the closures below can mutate it but
    # could not rebind plain local names of this function.
    # http://stackoverflow.com/a/23558809/120999
    stats = {
        'processed': 0,
        'skipped': 0,
    }

    def is_wanted(thing_row, values):
        # Ignore ids for which no 'thing' row was seen.
        if thing_row is None:
            return False
        # Remove deleted and spam items as requested.
        if not (deleted or thing_row.deleted == 'f'):
            return False
        if not (spam or thing_row.spam == 'f'):
            return False
        # Ignore items that don't have all of the data we need.
        return all(name in values for name in fields)

    def process(thing_id, vals):
        values = {}
        if defaults:
            values.update(defaults)
        thing_row = None

        for item in vals:
            item_type = item[0]
            if item_type == 'thing':
                thing_row = format_dataspec(item, [
                    'data_type',   # e.g. 'thing'
                    'thing_type',  # e.g. 'link'
                    'ups',
                    'downs',
                    'deleted',
                    'spam',
                    'timestamp',
                ])
            elif item_type == 'data':
                data_row = format_dataspec(item, [
                    'data_type',   # e.g. 'data'
                    'thing_type',  # e.g. 'link'
                    'key',         # e.g. 'sr_id'
                    'value',
                ])
                if data_row.key in fields:
                    values[data_row.key] = data_row.value

        if is_wanted(thing_row, values):
            stats['processed'] += 1
            thing_part = (thing_id, thing_row.thing_type, thing_row.ups,
                          thing_row.downs, thing_row.deleted,
                          thing_row.spam, thing_row.timestamp)
            data_part = tuple(values[name] for name in fields)
            yield thing_part + data_part
        else:
            stats['skipped'] += 1

    mr_reduce(process, fd=fd, out=out)

    # Report on the error stream to avoid getting this caught up in the pipe
    # of compute_time_listings.
    report = '%s items processed, %s skipped\n' % (stats['processed'],
                                                   stats['skipped'])
    err.write(report)