def create(filename, partitions, partitioner, fail_silently=False, tar_filename=None):
    """
    Create a new partition archive from a file of serialized nodes.

    Every line of the input is deserialized with TabSeparatedNodeSerializer
    and routed to the partition chosen by ``partitioner(node.address)``.
    Each partition is written, sorted by node address, to a scratch file
    named ``part-NNNNN``; the scratch files are bundled into a gzip
    compressed tar archive and the scratch directory is removed again.

    filename: input filename
    partitions: the number of partitions to create
    partitioner: a function from key to partition number
    fail_silently: when set, will swallow the IOError raised when the input
        file cannot be opened and return None instead
    tar_filename: The filename used to create the partition archive; when
        omitted a fresh temporary file is created and used

    Returns the filename of the archive, or None when the input could not be
    opened and fail_silently is set.
    """
    def make_filename(partition):
        return 'part-%05d' % partition

    def make_path(partition):
        return os.path.join(directory, make_filename(partition))

    nodes = [[] for _ in range(partitions)]

    # Open the file, swallow if flag set
    try:
        stream = open(filename)
    except IOError:
        if not fail_silently:
            raise
        return None

    # For each line in the file, save it to its proper partition
    try:
        for line in stream:
            node = TabSeparatedNodeSerializer.deserialize(line)
            nodes[partitioner(node.address)].append(node)
    finally:
        stream.close()

    # Scratch directory holding one file per partition
    directory = tempfile.mkdtemp()
    try:
        # For each partition, write its dataset
        for partition, partition_nodes in enumerate(nodes):
            with open(make_path(partition), "w") as output:
                for node in sorted(partition_nodes, key=lambda n: n.address):
                    output.write(
                        TabSeparatedNodeSerializer.serialize(node) + '\n')

        # mkstemp creates the file atomically, avoiding the name race that
        # tempfile.mktemp is subject to; tarfile reopens it by name below.
        if not tar_filename:
            handle, tar_filename = tempfile.mkstemp(
                prefix='partition', suffix='.tar.gz')
            os.close(handle)

        # Compress the resulting partitions
        tar = tarfile.open(tar_filename, "w:gz")
        try:
            # Explicit loop: map() is lazy on Python 3 and would add nothing
            for partition in range(partitions):
                tar.add(make_path(partition), make_filename(partition))
        finally:
            tar.close()
    finally:
        # Remove the scratch files and their directory, even on failure
        for leftover in os.listdir(directory):
            os.remove(os.path.join(directory, leftover))
        os.rmdir(directory)

    return tar_filename
def partition_filename(self):
    """
    Return the partition filename associated with this reducer.

    The filename is looked up through
    PartitionUtilities.get_partition_filename on first use and cached on
    the instance as ``_partition_filename``; later calls return the cached
    value. A cached value of None is treated as "not yet resolved".
    """
    if self.__dict__.get("_partition_filename") is None:
        def partition_of(key):
            # Resolve self.partition at call time, as the lookup invokes it
            return self.partition(key)

        def address_of(line):
            # Extract the node address from a serialized line
            return TabSeparatedNodeSerializer.deserialize(line).address

        self._partition_filename = PartitionUtilities.get_partition_filename(
            self.current_partition,
            self.options.partitions,
            partition_of,
            address_of)
    return self._partition_filename
def partition_filename(self):
    """
    Return the partition filename associated with this reducer.

    The filename is looked up through
    PartitionUtilities.get_partition_filename on first use and cached on
    the instance as ``_partition_filename``; later calls return the cached
    value. A cached value of None is treated as "not yet resolved".
    """
    if self.__dict__.get("_partition_filename") is None:
        def partition_of(key):
            # Resolve self.partition at call time, as the lookup invokes it
            return self.partition(key)

        def address_of(line):
            # Extract the node address from a serialized line
            return TabSeparatedNodeSerializer.deserialize(line).address

        self._partition_filename = PartitionUtilities.get_partition_filename(
            self.current_partition,
            self.options.partitions,
            partition_of,
            address_of)
    return self._partition_filename