def __init__(self, **kwargs):
    # Prepend a repr-based input protocol and a packaged copy of our
    # Python sources to any caller-supplied arguments
    kwargs['args'] = ['--input-protocol', 'repr',
                      '--python-archive', Package.create()] + \
                     kwargs.get('args', [])
    super(Propagate, self).__init__(**kwargs)
    self.network = getattr(Network.Network, self.options.network)
    # Package the relevant Python scripts exactly once per process
    if any(self.args) and not Propagate._initialized:
        Propagate._initialized = True
        self.options.python_archives.append(Package.create())
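# Package.create() is referenced above but not defined in this section. A
# minimal sketch of what such a helper might look like, assuming it bundles
# the job's Python sources into an archive that mrjob ships to each task
# node; the archive name and module list below are illustrative guesses.
import os
import tarfile


class Package(object):
    ARCHIVE = 'package.tar.gz'                 # hypothetical archive name
    SOURCES = ['Propagate.py', 'Network.py']   # hypothetical module list

    @classmethod
    def create(cls):
        """Build the source archive once and return its path."""
        if not os.path.exists(cls.ARCHIVE):
            with tarfile.open(cls.ARCHIVE, 'w:gz') as archive:
                for source in cls.SOURCES:
                    archive.add(source)
        return cls.ARCHIVE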
def __init__(self, **kwargs):
    # Note that EMR is required for Schimmy propagation
    kwargs['args'] = \
        ['--input-protocol', 'repr',
         '-r', 'emr',
         '--hadoop-version', '0.20',
         '--hadoop-arg', '-partitioner',
         '--hadoop-arg',
         'org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
         '--jobconf', 'mapred.text.key.partitioner.options=-k1,1',
         '--jobconf', 'map.output.key.field.separator=,'] + \
        kwargs.get('args', [])
    super(Propagate, self).__init__(**kwargs)
    self.network = getattr(Network.Network, self.options.network)
    # In Schimmy, the #partitions is always equal to #reducers
    self.options.jobconf['mapred.reduce.tasks'] = self.options.partitions
    # Initialize exactly once by creating our initial partition files
    # and packaging the relevant Python scripts
    if any(self.args) and not Propagate._initialized:
        Propagate._initialized = True
        self.options.upload_archives.append('%s#partitions' %
            Partitions.create(self.args[0], self.options.partitions,
                              self.partition, True))
        self.options.python_archives.append(Package.create())
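# The Schimmy merge only works if the partition files built locally agree
# with the partitioner Hadoop applies to map output. The jobconf above
# selects KeyFieldBasedPartitioner on the first comma-separated field,
# which hashes the field's bytes Java-style. A minimal sketch of what
# self.partition must compute (its real signature and body are not shown
# in this section); assumes ASCII keys.
def partition(key, num_partitions):
    """Mirror KeyFieldBasedPartitioner: Java String.hashCode mod #reducers."""
    code = 0
    for char in key:
        # 32-bit overflow semantics of Java's hash = 31 * hash + byte
        code = (31 * code + ord(char)) & 0xffffffff
    # Hadoop masks off the sign bit before the modulo:
    # (hash & Integer.MAX_VALUE) % numReduceTasks
    return (code & 0x7fffffff) % num_partitions

# e.g. partition('node42', 16) picks the same reducer index that Hadoop
# routes key 'node42' to, so each reducer can merge its input against the
# matching file in the uploaded 'partitions' archive.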