def create(filename, partitions, partitioner, fail_silently=False, tar_filename=None):
    """Partition the nodes in an input file into a gzipped tar archive.

    filename: input filename containing serialized nodes, one per line
    partitions: the number of partitions to create
    partitioner: a function from key (node address) to partition number
    fail_silently: when set, will swallow any IOError encountered when
        opening the input file and return None instead
    tar_filename: the filename used to create the partition archive; a
        secure temporary name is generated when omitted

    Returns the archive filename, or None when fail_silently suppressed
    an open failure.
    """
    # One bucket of nodes per partition.  (A list comprehension, unlike
    # map(), yields an indexable list on both Python 2 and 3.)
    buckets = [[] for _ in range(partitions)]

    # Open the file, swallow if flag set
    try:
        stream = open(filename)
    except IOError:
        if not fail_silently:
            raise
        return None

    # For each line in the file, save it to its proper partition
    with stream:
        for line in stream:
            node = TabSeparatedNodeSerializer.deserialize(line)
            buckets[partitioner(node.address)].append(node)

    # Scratch directory for the per-partition files; defined before the
    # path helpers so they never rely on late binding of `directory`.
    directory = tempfile.mkdtemp()
    make_filename = lambda partition: 'part-%05d' % partition
    make_path = lambda partition: '%s/%s' % (directory, make_filename(partition))

    # For each partition, write its dataset sorted by node address
    for partition, partition_nodes in enumerate(buckets):
        with open(make_path(partition), "w") as out:
            for node in sorted(partition_nodes, key=lambda n: n.address):
                out.write(TabSeparatedNodeSerializer.serialize(node) + '\n')

    # Compress the resulting partitions.  mkstemp() (unlike the
    # deprecated, race-prone mktemp()) reserves the name atomically.
    if tar_filename is None:
        fd, tar_filename = tempfile.mkstemp(prefix='partition', suffix='.tar.gz')
        os.close(fd)
    tar = tarfile.open(tar_filename, "w:gz")
    try:
        for partition in range(partitions):
            tar.add(make_path(partition), make_filename(partition))
    finally:
        tar.close()

    # Remove the scratch files and the scratch directory itself, which
    # the original left behind.
    for partition in range(partitions):
        os.remove(make_path(partition))
    os.rmdir(directory)
    return tar_filename
def partition_filename(self):
    """Gets the partition filename associated with this reducer"""
    # Recompute only when the instance has never cached a value (or
    # cached None); __dict__.get keeps the original instance-only check.
    cached = self.__dict__.get("_partition_filename")
    if cached is None:
        cached = PartitionUtilities.get_partition_filename(
            self.current_partition,
            self.options.partitions,
            lambda key: self.partition(key),
            lambda line: TabSeparatedNodeSerializer.deserialize(line).address)
        self._partition_filename = cached
    return cached
def partition_filename(self):
    """Gets the partition filename associated with this reducer"""
    # Lazily populate the per-instance cache; a missing key and a stored
    # None are treated the same, exactly as the original two-part test.
    if self.__dict__.get("_partition_filename") is None:
        self._partition_filename = PartitionUtilities.get_partition_filename(
            self.current_partition,
            self.options.partitions,
            lambda key: self.partition(key),
            lambda line: TabSeparatedNodeSerializer.deserialize(line).address)
    return self._partition_filename
def execute(filename, network, nodes_to_infect, hit_list_size):
    """Create a new network with the given filename.

    filename: output filename
    network: the network address space under consideration
    nodes_to_infect: the number of nodes to mark initially-infected
    hit_list_size: the initial hit-list size for infected nodes
    """
    scratch = None  # renamed from `file` to avoid shadowing the builtin
    try:
        # Create our list of vulnerable nodes
        with tempfile.NamedTemporaryFile('w', delete=False) as scratch:
            for host in CreateVulnerableHosts.execute(network, nodes_to_infect):
                scratch.write(TabSeparatedNodeSerializer.serialize(host) + '\n')

        # Then run a map/reduce job that marks some nodes as infected
        with CreateHitLists(args=['--size', str(hit_list_size), scratch.name])\
                .make_runner() as runner:
            runner.run()
            with open(filename, 'w') as output:
                # An explicit loop (not map()) guarantees the writes run
                # under Python 3, where map() is lazy and would write nothing.
                for line in runner.stream_output():
                    output.write(line)
    finally:
        # Guard against failures before the temp file was ever created,
        # which would otherwise raise NameError here.
        if scratch is not None:
            os.remove(scratch.name)
def execute(filename, network, nodes_to_infect, hit_list_size):
    """Create a new network with the given filename.

    filename: output filename
    network: the network address space under consideration
    nodes_to_infect: the number of nodes to mark initially-infected
    hit_list_size: the initial hit-list size for infected nodes
    """
    scratch = None  # renamed from `file` to avoid shadowing the builtin
    try:
        # Create our list of vulnerable nodes
        with tempfile.NamedTemporaryFile('w', delete=False) as scratch:
            for host in CreateVulnerableHosts.execute(
                    network, nodes_to_infect):
                scratch.write(
                    TabSeparatedNodeSerializer.serialize(host) + '\n')

        # Then run a map/reduce job that marks some nodes as infected
        with CreateHitLists(args=['--size', str(hit_list_size), scratch.name])\
                .make_runner() as runner:
            runner.run()
            with open(filename, 'w') as output:
                # writelines() forces the iteration; the original
                # map(output.write, ...) is lazy under Python 3 and
                # would silently write nothing.
                output.writelines(runner.stream_output())
    finally:
        # Guard against failures before the temp file was ever created,
        # which would otherwise raise NameError here.
        if scratch is not None:
            os.remove(scratch.name)