def __init__(self, *args, **kwargs):
    # load the defaults
    super(Settings, self).update(defaults)
    # override with the settings file
    path = kwargs.get('settings_file') or self['settings_file']
    if path and os.path.exists(path):
        try:
            import yaml
            with open(path) as settings_file:
                self.update(yaml.safe_load(settings_file))
        except Exception:
            pass  # the settings file is optional; ignore it if it can't be read
    # final overrides
    super(Settings, self).update(overrides)
    super(Settings, self).__init__(*args, **kwargs)
    # set up ddfs and disco
    if not self['server'].startswith('disco://'):
        self['server'] = 'disco://' + self['server']
    if 'ddfs' not in self:
        self['ddfs'] = DDFS(self['server'])
    self['server'] = Disco(self['server'])
    # set up the worker: import the module named by 'worker_class'
    # and instantiate the class it points to
    if 'worker' not in self:
        worker_mod, _, worker_class = self['worker_class'].rpartition('.')
        mod = __import__(worker_mod, {}, {}, worker_mod)
        self['worker'] = getattr(mod, worker_class)()
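The worker setup above imports a class from a dotted-path string. A standalone sketch of that pattern, using a stdlib class ('collections.OrderedDict') purely as a stand-in for a real worker class:

worker_mod, _, worker_class = 'collections.OrderedDict'.rpartition('.')
mod = __import__(worker_mod, {}, {}, worker_mod)  # imports the 'collections' module
worker = getattr(mod, worker_class)()             # instantiates OrderedDict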
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS
    if server and not server.startswith('disco://'):
        server = 'disco://' + server
    return Disco(server), DDFS(server)
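A minimal usage sketch for the helper above; the master address 'localhost:8989' is an assumption, not something the snippet specifies:

disco, ddfs = get_disco_handle('localhost:8989')  # hypothetical master address
print(disco.master)  # the helper prepends the scheme: disco://localhost:8989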
def submit(master, jobpack):
    import json
    from disco.settings import DiscoSettings
    from disco.core import Disco
    settings = DiscoSettings()
    dmaster = Disco(master)
    print "Submitting job to", master
    status, response = json.loads(dmaster.request('/disco/job/new', jobpack))
    if status != 'ok':
        # errmsg is assumed to be defined elsewhere in the enclosing module
        errmsg('Failed to start job. Server replied: %s' % response)
    print response
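A hedged usage sketch for submit(): the jobpack filename is hypothetical, and the jobpack bytes are assumed to have been built elsewhere:

with open('example.jobpack', 'rb') as f:  # hypothetical prebuilt jobpack
    submit('disco://localhost:8989', f.read())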
""" Predict the closest clusters for the datapoints in input. """ job = master.new_job(name='kcluster_predict', input=input, map_reader=map_reader, map=predict_map, params=Params(centers=centers, **center), nr_reduces=0) return job.wait() if __name__ == '__main__': parser = OptionParser(usage='%prog [options] inputs') parser.add_option('--disco-master', default=getenv('DISCO_MASTER'), help='Disco master') parser.add_option('--iterations', default=10, help='Numbers of iteration') parser.add_option('--clusters', default=10, help='Numbers of clusters') (options, input) = parser.parse_args() master = Disco(options.disco_master) centers = estimate(master, input, mean_point_center, int(options.clusters), int(options.iterations)) res = predict(master, input, mean_point_center, centers) print '\n'.join(res)
def disco(self):
    return Disco(self.disco_master_url)
def disco(self):
    from disco.core import Disco
    return Disco(self.settings['DISCO_MASTER'])
input = inputs or [maybe_list(line.split())
                   for line in fileinput.input(inputs)]
job = reify(jobclass)(program.disco, name)
try:
    params = job.params
except AttributeError:
    params = Params()
params.__dict__.update(**dict(program.options.params))
job.run(input=input, **program.option_parser.jobdict)
print job.name

@Disco.command
def wait(program, jobname):
    """Usage: jobname

    Wait for the named job to complete.
    """
    program.disco.wait(jobname)

if __name__ == '__main__':
    Disco(option_parser=DiscoOptionParser()).main()

    # Workaround for "disco test" in Python 2.5, which doesn't shut down
    # the test_server thread properly.
    sys.exit(0)  # XXX still needed?
def disco(self):
    from disco.core import Disco
    return Disco(settings=self.settings)
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

def map(line, params):
    for word in line.split():
        yield word, 1

def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(name="wordcount",
                        input=["http://discoproject.org/media/text/chekhov.txt"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()
print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
def __init__(self, name=None, master=None, worker=None, settings=None):
    from disco.core import Disco
    self.name = name or type(self).__name__
    # accept either an existing Disco handle or a master URL string
    self.disco = master if isinstance(master, Disco) else Disco(master)
    self.worker = worker or self.Worker()
    self.settings = settings or DiscoSettings()
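A sketch of how this constructor can be called; the subclass name and master URL are assumptions, and the parent class is taken to be the Job-like class defining the __init__ above:

class WordCountJob(Job):  # hypothetical subclass
    pass

job_a = WordCountJob(master='disco://localhost')         # master given as a URL string
job_b = WordCountJob(master=Disco('disco://localhost'))  # or as an existing Disco handle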
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0,
                save_results=True, show=False):
    """
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off
        with the distance of its x(i) from the query point x.
    samples_per_job - number of samples processed in a single mapreduce
        job. If 0, the algorithm calculates the number of samples per job.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be greater than 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau**2)  # calculate tau once
    counter = 0
    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate the number of samples per job
            if len(x) <= 100:  # if there are at most 100 attributes
                samples_per_job = 100  # at most 100 samples per job
            else:  # there are more than 100 attributes
                samples_per_job = len(x) * -25 / 900. + 53  # linear function
        samples[test_id] = x
        if counter == samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # if there are samples left in the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge the results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
def disco(self):
    return Disco(settings=self.settings)
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'

class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
parser.add_option('--load-method', default=1,
                  help='Load method (default=1): 1. ODOT; '
                       '2. Online ODAT; 3. Offline dim')
parser.add_option('--post-fix', default=1,
                  help='Perform post-fixing for ODAT? (default=1): 1. Yes; 2. No')
parser.add_option('--go-live', default=1,
                  help='Load offline dim data to the DW DBMS? (default=1): 1. Yes; 2. No')
parser.add_option('--profile', default=False,
                  help='Profile (default=False)')
parser.add_option('--config', default='conf/config.py',
                  help='The path to config.py (default=conf/config.py)')

(options, input_paths) = parser.parse_args()
master = Disco("disco://" + options.disco_master)

load_method = odotetlmr
seq_process = None
post_fixing = -1
load_step = int(options.load_step)

if options.load_method == '2':
    load_method = odatetlmr
    if load_step == 1:
        post_fixing = int(options.post_fix)
        seq_process = multiprocessing.Process(target=seq_server)
        seq_process.start()
elif options.load_method == '3':
    load_method = offdimetlmr

input_file_urls = []
def data(self):
    return Disco(self.master).jobpack(self.jobname)