def test_chunk(self):
    """Chunk a remote text file into DDFS and verify the result.

    Pushes chekhov.txt into DDFS under tag 'disco:test:chunk' in
    <=100KiB chunks, checks the blob count is in the expected range,
    compares the chunked records against a raw (reader=None) read of
    the same url, then deletes the tag.
    """
    from disco.core import classic_iterator
    url = 'http://discoproject.org/media/text/chekhov.txt'
    self.ddfs.chunk('disco:test:chunk', [url], chunk_size=100*1024)
    # Expect at least one and at most four blobs for this input.
    self.assert_(0 < len(list(self.ddfs.blobs('disco:test:chunk'))) <= 4)
    # BUG FIX: assert_(expr, msg) treats its second argument as the
    # failure *message*, so the original never compared the two lists.
    # assertEqual performs the actual equality check.
    self.assertEqual(list(classic_iterator(['tag://disco:test:chunk'])),
                     list(classic_iterator([url], reader=None)))
    self.ddfs.delete('disco:test:chunk')
def create(name, input):
    """Build a new database named *name* from the *input* urls.

    Backs up any existing database directory, runs a CreateDB job with
    a partition count sized from the total input bytes, loads the
    result, and only on success purges the job and removes the backup.
    Returns the list of keys produced by the job.
    """
    # Move the existing dir to a backup dir so a failed build does not
    # clobber the previous database.
    dbdir = dirname(name)
    bck = bckname(name)
    if os.path.exists(dbdir):
        shutil.move(dbdir, bck)
    os.makedirs(dbdir)
    input_size = sum(util.result_size(url) for url in input)
    # BUG FIX: floor division keeps 'partitions' an int under Python 3
    # ('/' would yield a float here); "close enough" sizing as before.
    partitions = 1 + (input_size // partition_size)
    with open(os.path.join(dbdir, 'partitions'), 'w') as fobj:
        fobj.write(str(partitions))
    job = CreateDB().run(input=input,
                         partitions=partitions,
                         params={'name': name, 'partitions': partitions})
    created = [key for key, value in classic_iterator(job.wait())]
    load(name)
    # Successful - purge job and delete the backup dir.
    job.purge()
    if os.path.exists(bck):
        shutil.rmtree(bck)
    return created
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    # Fall back to the default chain reader when none was requested.
    reader_name = program.options.reader
    if reader_name is None:
        reader_name = 'disco.func.chain_reader'
    reader = reify(reader_name)
    # Plain urls first, then the blobs reachable from the tags.
    inputs = chain(urls, program.blobs(*tags))
    for record in classic_iterator(inputs, input_stream=stream, reader=reader):
        fields = ('{0}'.format(field) for field in iterify(record))
        print('\t'.join(fields).rstrip())
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # Resolve each replica url; to_master=False presumably routes blob
    # fetches through the proxy rather than the master -- confirm in
    # disco.util.proxy_url.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls, input_stream=stream, reader=reader):
        # FIX: single-argument print(...) prints identically under the
        # Python 2 print statement and is also valid Python 3 (matching
        # the other xcat variant in this file).
        print('\t'.join('%s' % (e,) for e in iterify(record)).rstrip())
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # Resolve each replica url; to_master=False presumably routes blob
    # fetches through the proxy rather than the master -- confirm in
    # disco.util.proxy_url.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls, input_stream=stream, reader=reader):
        # FIX: single-argument print(...) prints identically under the
        # Python 2 print statement and is also valid Python 3.
        print('\t'.join('%s' % (e,) for e in iterify(record)).rstrip())
def run(self, map, reduce, **jobargs):
    """Run a map-reduce job with either ``input_uri`` or ``output_uri``
    as a "mongodb://..." URI.

    .. todo: parameter docs
             consider "input" and "output" (sans _uri)
    """
    if not any(uri in jobargs for uri in ('input_uri', 'output_uri')):
        logging.info('You did not specify "input_uri" or "output_uri" '
                     'with MongoJob. This may be in error.')
    # Install the mongodb streams only for mongodb:// uris.
    if 'mongodb://' in jobargs.get('input_uri', ''):
        jobargs['map_input_stream'] = mongodb_input_stream
    if 'mongodb://' in jobargs.get('output_uri', ''):
        jobargs['reduce_output_stream'] = mongodb_output_stream
    jobargs['map'] = map
    jobargs['reduce'] = reduce
    # PERF FIX: dict.setdefault evaluates its default eagerly, so the
    # original computed the splits even when 'input' was already given;
    # guard the call instead.
    if 'input' not in jobargs:
        jobargs['input'] = calculate_splits(jobargs)
    jobargs.setdefault('required_modules', []).extend([
        'mongodisco.mongodb_io',
        'mongodisco.mongodb_input',
        'mongodisco.mongodb_output',
        'mongodisco.mongo_util',
    ])
    super(MongoJob, self).run(**jobargs)
    if jobargs.get('print_to_stdout'):
        for key, value in classic_iterator(self.wait(show=True)):
            # FIX: '%s %s' reproduces Python 2's `print key, value`
            # output while remaining valid Python 3.
            print('%s %s' % (key, value))
    elif jobargs.get('job_wait', False):
        self.wait(show=True)
    return self
def chunk_iter(replicas):
    # Stream the records of one replica set through a fresh chunker.
    records = classic_iterator([replicas], **kwargs)
    return Chunker(chunk_size=chunk_size).chunks(records)
def run(self, map=None, reduce=None, **jobargs):
    """Run a map-reduce job with either ``input_uri`` or ``output_uri``
    as a "mongodb://..." URI.

    .. todo: parameter docs
             consider "input" and "output" (sans _uri)
    """
    if not any(uri in jobargs for uri in
               ('input_uri', 'output_uri', 'bson_input', 'bson_output')):
        logging.info('You did not specify "input_uri" or "output_uri" '
                     'with MongoJob. This may be in error.')

    # Input side: a mongodb:// uri selects the mongodb input stream and
    # computed splits; otherwise bson_input selects the bsonfile stream.
    if 'mongodb://' in jobargs.get('input_uri', ''):
        jobargs['map_input_stream'] = mongodb_input_stream
        # PERF FIX: setdefault evaluates its default eagerly, so the
        # original computed splits even when 'input' was supplied.
        if 'input' not in jobargs:
            jobargs['input'] = calculate_splits(jobargs)
    elif jobargs.get('bson_input', False):
        jobargs['map_input_stream'] = bsonfile_input_stream

    # Output side: a mongodb:// uri also threads the output settings
    # through job params under the 'mongodb' key.
    if 'mongodb://' in jobargs.get('output_uri', ''):
        jobargs['reduce_output_stream'] = mongodb_output_stream
        output_params = {
            'output_uri': jobargs['output_uri'],
            'job_output_key': jobargs.get('job_output_key', '_id'),
            'job_output_value': jobargs.get('job_output_value', 'value'),
            'add_action': jobargs.get('add_action', 'insert'),
            'add_upsert': jobargs.get('add_upsert', False),
            'base_doc': jobargs.get('base_doc', {}),
        }
        params = jobargs.get('params', {})
        if not isinstance(params, dict):
            # FIX: ValueError is more precise than bare Exception and,
            # as an Exception subclass, stays catchable by old handlers.
            raise ValueError('params option must be a dict')
        params['mongodb'] = output_params
        jobargs['params'] = params
    elif jobargs.get('bson_output', False):
        jobargs['reduce_output_stream'] = bsonfile_output_stream

    if map:
        jobargs['map'] = map
    if reduce:
        jobargs['reduce'] = reduce

    jobargs.setdefault('required_modules', []).extend([
        'mongodisco.mongodb_io',
        'mongodisco.mongodb_input',
        'mongodisco.mongodb_output',
        'mongodisco.mongo_util',
        'mongodisco.bsonfile_io',
        'mongodisco.bsonfile_input',
        'mongodisco.bsonfile_output',
    ])

    super(MongoJob, self).run(**jobargs)

    if jobargs.get('print_to_stdout'):
        for key, value in classic_iterator(self.wait(show=True)):
            # FIX: '%s %s' reproduces Python 2's `print key, value`
            # output while remaining valid Python 3.
            print('%s %s' % (key, value))
    elif jobargs.get('job_wait', False):
        self.wait(show=True)
    return self